In [10]:
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [11]:
# My own simple feature extraction function
def hashfeatures1(name,debug=False):
    """
    Convert a name into a small hand-crafted binary feature vector.

    Input:
        name  : a string representing the person's name to be hashed
        debug : when True, print the resulting feature vector

    Output:
        v: a feature vector (numpy array of length 4) representing the input string
        v[0] = begins with vowel (a, e, i, o, u or y)
        v[1] = ends with vowel (a, e, i, o, u or y)
        v[2] = ends in 'a'
        v[3] = ends in 'y'

    Letter case is ignored, and an empty name yields the all-zero vector
    instead of raising IndexError.
    """
    d = 4 # d: the number of dimensions to be in the feature vector
    v = np.zeros(d)

    # A set (rather than a string) keeps the membership test to whole
    # characters even if lower() expands one character into several.
    vowels = frozenset("aeiouy")

    if name:  # an empty string has no first/last character to inspect
        first = name[0].lower()
        last = name[-1].lower()
        if first in vowels:
            v[0] = 1
        if last in vowels:
            v[1] = 1
        if last == 'a':
            v[2] = 1
        if last == 'y':
            v[3] = 1

    if debug:
        print(f"Feature vector for {name}:\n{v.astype(int)}\n")
    return v

In [12]:
# This function reads names from a file (or from a newline-separated string) and calls hashfeatures1 on each one to convert it into a feature vector
def name2features(filename, d=128, FIX=3, LoadFile=True):
    """
    Read a list of names and convert each one into a feature vector.

    Input:
        filename : path of a text file with one name per line, or (when
                   LoadFile is False) a string of newline-separated names
        d        : number of columns of X; the vector returned by
                   hashfeatures1 fills the leading columns and any
                   remaining columns stay zero
        FIX      : unused; kept so existing callers keep working
        LoadFile : True to read the names from the file `filename`, False to
                   treat `filename` itself as the text to split

    Output:
        X : n feature vectors of dimension d, (nxd)
        n : the number of names that were converted

    Raises:
        ValueError if d is smaller than the vector returned by hashfeatures1
    """

    # read in baby names
    if LoadFile:
        with open(filename, 'r') as f:
            raw_lines = f.readlines()
    else:
        raw_lines = filename.split('\n')

    # Lines from readlines() still carry their newline, so emptiness has to be
    # tested AFTER stripping; blank lines would otherwise reach hashfeatures1
    # as '' and become bogus training rows (or crash it).
    names = [stripped for stripped in (line.strip() for line in raw_lines) if stripped]

    n = len(names)
    X = np.zeros((n, d))
    for i, name in enumerate(names):
        v = hashfeatures1(name)
        if v.size > d:
            raise ValueError(
                f"d={d} is too small to hold the {v.size}-dimensional feature vector"
            )
        # Write into the leading columns so that d larger than the feature
        # vector (e.g. the default) zero-pads instead of failing to broadcast.
        X[i, :v.size] = v
    return (X, n)

In [13]:
girlsTrain = "GirlsTrain.txt"
boysTrain = "BoysTrain.txt"

# Featurize both training files with the 4-dimensional hand-crafted features.
girlfeatures, numgirls = name2features(girlsTrain, d=4)
boyfeatures, numboys = name2features(boysTrain, d=4)

# Label convention: girls are the negative class (-1), boys the positive class (+1).
girllabels = -np.ones(numgirls)
boylabels = np.ones(numboys)

# Girls first, then boys, in both arrays so rows and labels stay aligned.
allLabels = np.concatenate((girllabels, boylabels))
allfeatures = np.concatenate((girlfeatures, boyfeatures))

In [14]:
def naivebayesPY(X, Y):
    """
    naivebayesPY(X, Y) returns [pos,neg]

    Computation of the class priors P(Y), with plus-one smoothing.

    Input:
        X : n input vectors of d dimensions (nxd); not used by this function
        Y : n labels (-1 or +1), given either flat (n) or as a column (nx1)

    Output:
        pos: probability p(y=1)
        neg: probability p(y=-1)
    """

    # Flatten first so a column vector (nx1) can be concatenated with the
    # 1-D smoothing labels; then add one positive and one negative example
    # to avoid division by zero ("plus-one smoothing").
    Y = np.concatenate([np.ravel(Y), [-1, 1]])
    n = len(Y)

    # Count the +1 labels directly; everything else is treated as negative.
    posCount = np.sum(Y == 1)

    pos = posCount / n
    neg = (n - posCount) / n
    return [pos,neg]

In [18]:
# Smoothed class priors [P(y=+1), P(y=-1)] estimated from the training labels.
naivebayesPY(X=allfeatures, Y=allLabels)

[0.4932084309133489, 0.506791569086651]

In [19]:
def naivebayesPXY(X,Y):
    """
    naivebayesPXY(X, Y) returns [posprob,negprob]

    Estimates, separately for each class, the fraction of examples in which
    each feature is switched on.

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
            probability that feature alpha is 1 given that the name is a boy (+1)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
            probability that feature alpha is 1 given that the name is a girl (-1)
    """

    _, num_features = X.shape

    # "Plus-one smoothing": give every class one all-ones and one all-zeros
    # example so that no class is empty and no estimate is exactly 0 or 1.
    smoothed_X = np.concatenate(
        [X, np.ones((2, num_features)), np.zeros((2, num_features))]
    )
    smoothed_Y = np.concatenate([Y, [-1, 1, -1, 1]])

    def _fraction_hot(label):
        # Rows of the requested class; sum down the columns and divide by the
        # number of rows to get the per-feature fraction that is 1.
        members = smoothed_X[smoothed_Y == label]
        member_count, _ = members.shape
        return np.sum(members, axis=0) / member_count

    posprob = _fraction_hot(1)    # boys
    negprob = _fraction_hot(-1)   # girls
    return [posprob,negprob]

In [20]:
# Smoothed per-feature probabilities [P(x_alpha=1 | y=+1), P(x_alpha=1 | y=-1)].
naivebayesPXY(X=allfeatures, Y=allLabels)

[array([0.19639469, 0.29601518, 0.01802657, 0.07305503]),
 array([0.26592798, 0.72299169, 0.37026777, 0.10710988])]

In [None]:
def loglikelihood(posprob, negprob, X_test, Y_test):
    """
    loglikelihood(posprob, negprob, X_test, Y_test) returns loglikelihood of each point in X_test

    Each point is scored under the class given by its own label, treating the
    features as independent Bernoulli variables:
        log p(x|y) = sum_alpha [ x_alpha * log(p_alpha) + (1 - x_alpha) * log(1 - p_alpha) ]

    Input:
        posprob: conditional probabilities p(x_alpha = 1|y=1) for the positive class (d)
        negprob: conditional probabilities p(x_alpha = 1|y=-1) for the negative class (d)
        X_test : binary features (nxd)
        Y_test : labels (-1 or +1) (n), flat or as a column

    Output:
        loglikelihood of each point in X_test (n); entries whose label is
        neither -1 nor +1 are NaN

    Note: the probabilities must lie strictly between 0 and 1 (smoothed
    estimates do), otherwise log(0) makes the result non-finite.
    """
    X_test = np.asarray(X_test, dtype=float)
    Y_test = np.ravel(Y_test)
    posprob = np.asarray(posprob, dtype=float)
    negprob = np.asarray(negprob, dtype=float)

    # Score every point under both classes (one matrix-vector product each),
    # then keep the score that matches the point's own label.
    ll_boys = X_test @ np.log(posprob) + (1.0 - X_test) @ np.log(1.0 - posprob)
    ll_girls = X_test @ np.log(negprob) + (1.0 - X_test) @ np.log(1.0 - negprob)

    ll = np.full(X_test.shape[0], np.nan)

    # BOYS
    # Y = +1
    boys = Y_test == 1
    ll[boys] = ll_boys[boys]

    # GIRLS
    # Y = -1
    girls = Y_test == -1
    ll[girls] = ll_girls[girls]

    return ll
    

In [None]:
def _class_loglik(prob, X):
    """Log p(x|y) of each row of X under independent Bernoulli features with p(x_alpha = 1|y) = prob."""
    prob = np.asarray(prob, dtype=float)
    return X @ np.log(prob) + (1.0 - X) @ np.log(1.0 - prob)


def naivebayes_pred(pos, neg, posprob, negprob, X_test):
    """
    naivebayes_pred(pos, neg, posprob, negprob, X_test) returns the prediction of each point in X_test

    Input:
        pos: class probability for the positive class, p(y=1)
        neg: class probability for the negative class, p(y=-1)
        posprob: conditional probabilities p(x_alpha = 1|y=1) for the positive class (d)
        negprob: conditional probabilities p(x_alpha = 1|y=-1) for the negative class (d)
        X_test : binary features (nxd)

    Output:
        prediction of each point in X_test (n): +1.0 where the positive class
        has the strictly larger posterior, -1.0 otherwise (ties go to -1)
    """
    X_test = np.asarray(X_test, dtype=float)

    # Unnormalized log posteriors: log p(x|y) + log p(y). The prior has to be
    # added in log space; adding the raw probability to a log-likelihood
    # would mix two different scales.
    llp = _class_loglik(posprob, X_test) + np.log(pos)
    lln = _class_loglik(negprob, X_test) + np.log(neg)

    # compare llp and lln element-wise, returning the class of the larger one
    return np.where(llp > lln, 1.0, -1.0)