In [1]:
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# My own simple feature extraction function
def hashfeatures1(name, debug=False):
    """
    Turn a single name into a small hand-crafted binary feature vector.

    Input:
        name  : a string representing the person's name to be hashed
        debug : when True, print the resulting vector

    Output:
        v: a feature vector (length 4, float) representing the input string
        v[0] = begins with vowel
        v[1] = ends with vowel
        v[2] = ends in 'a'
        v[3] = ends in 'y'

    Notes:
        * 'y' is counted as a vowel for both the first and the last letter.
        * Letter comparisons are case-insensitive, so "anna", "Anna" and
          "ANNA" all produce the same vector.
        * An empty name yields the all-zero vector instead of raising
          IndexError.
    """
    d = 4  # d: the number of dimensions to be in the feature vector
    v = np.zeros(d)
    vowels = frozenset("aeiouy")

    # Only index into the string when there is at least one character.
    if name:
        first = name[0].lower()
        last = name[-1].lower()

        if first in vowels:
            v[0] = 1
        if last in vowels:
            v[1] = 1
        if last == 'a':
            v[2] = 1
        if last == 'y':
            v[3] = 1

    if debug:
        print(f"Feature vector for {name}:\n{v.astype(int)}\n")
    return v

In [3]:
# Reads names from a file (or, with LoadFile=False, from a newline-separated string) and calls hashfeatures1 on each name to convert it into a feature vector
def name2features(filename, d=128, FIX=3, LoadFile=True):
    """
    Convert a list of names into a feature matrix using hashfeatures1.

    Input:
        filename : path of a text file with one name per line when LoadFile
                   is True; otherwise the names themselves as a single
                   newline-separated string
        d        : number of columns of the returned matrix
        FIX      : unused; kept so existing callers keep working
        LoadFile : selects how `filename` is interpreted (see above)

    Output:
        X : n feature vectors of dimension d, (nxd)
        n : number of names that were converted

    Notes:
        * Surrounding whitespace is stripped from every name and blank
          entries are skipped, so empty lines in the file (or a trailing
          newline in the string) do not reach hashfeatures1.
        * If hashfeatures1 returns fewer than d values, the remaining columns
          stay zero; if it returns more, the extra values are dropped. When
          d equals the feature length the row is copied as-is.
    """
    # read in baby names
    if LoadFile:
        with open(filename, 'r') as f:
            raw_names = f.readlines()
    else:
        raw_names = filename.split('\n')

    # Strip first, then filter: a "blank" line read from a file still
    # contains its newline, so it must be stripped before testing for empty.
    names = [stripped for stripped in (raw.strip() for raw in raw_names) if stripped]

    n = len(names)
    X = np.zeros((n, d))
    for i, name in enumerate(names):
        features = hashfeatures1(name)
        width = min(d, len(features))  # avoid a broadcast error when d != len(features)
        X[i, :width] = features[:width]
    return (X, n)

In [20]:
def genTrainFeatures(dimension=4):
    """
    Build the training matrix and label vector from the two name files.

    Input:
        dimension: desired dimension of the features.
                   NOTE: this value is not forwarded; both files are always
                   featurised with a width of 4.
    Output:
        features: n feature vectors (nx4), girls' rows first, then boys'
        labels:   n labels (-1 = girl, +1 = boy) (n), in the same row order
    """
    # Featurise each file; girls are loaded first so their rows come first.
    girl_rows, girl_count = name2features("GirlsTrain.txt", 4)
    boy_rows, boy_count = name2features("BoysTrain.txt", 4)

    # Stack the two feature matrices and build the matching label vector.
    features = np.concatenate([girl_rows, boy_rows])
    labels = np.concatenate([np.full(girl_count, -1.0), np.ones(boy_count)])

    return (features, labels)

In [21]:
def naivebayesPY(X, Y):
    """
    naivebayesPY(X, Y) returns [pos,neg]

    Estimate the class priors P(Y) with plus-one smoothing.

    Input:
        X : n input vectors of d dimensions (nxd); not used by the estimate
        Y : n labels (-1 or +1) (n)

    Output:
        pos: probability p(y=1)
        neg: probability p(y=-1)
    """
    # Plus-one smoothing: pretend we saw one extra example of each class so
    # that neither prior can be exactly zero.
    labels = np.concatenate([Y, [-1, 1]])
    total = len(labels)

    # With labels in {-1, +1}:
    #   sum(labels) = n_pos - n_neg   and   total = n_pos + n_neg
    # so n_pos = (total + sum(labels)) / 2.
    n_pos = (total + np.sum(labels)) / 2

    return [n_pos / total, (total - n_pos) / total]

In [14]:
# Build the training set here so this cell also works on a fresh kernel:
# these names are otherwise only local variables inside genTrainFeatures.
allfeatures, allLabels = genTrainFeatures()
naivebayesPY(allfeatures, allLabels)

[0.4932084309133489, 0.506791569086651]

In [22]:
def naivebayesPXY(X, Y):
    """
    naivebayesPXY(X, Y) returns [posprob,negprob]

    Estimate, per class, the probability that each binary feature is on.

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
            probability that feature alpha is 1 given the label is +1 (boy)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
            probability that feature alpha is 1 given the label is -1 (girl)
    """
    # Plus-one smoothing: each class gets one extra all-ones row and one
    # extra all-zeros row, so no estimate can be exactly 0 or 1.
    _, d = X.shape
    X_aug = np.concatenate([X, np.ones((2, d)), np.zeros((2, d))])
    Y_aug = np.concatenate([Y, [-1, 1, -1, 1]])

    def _hot_fraction(label):
        # Fraction of the rows carrying `label` in which each feature is on.
        rows = X_aug[Y_aug == label]
        count, _ = rows.shape
        return np.sum(rows, axis=0) / count

    posprob = _hot_fraction(1)    # boys,  Y = +1
    negprob = _hot_fraction(-1)   # girls, Y = -1

    return [posprob, negprob]

In [16]:
# Build the training set here so this cell also works on a fresh kernel:
# these names are otherwise only local variables inside genTrainFeatures.
allfeatures, allLabels = genTrainFeatures()
naivebayesPXY(allfeatures, allLabels)

[array([0.19639469, 0.29601518, 0.01802657, 0.07305503]),
 array([0.26592798, 0.72299169, 0.37026777, 0.10710988])]

In [23]:
def loglikelihood(posprob, negprob, X_test, Y_test):
    """
    loglikelihood(posprob, negprob, X_test, Y_test) returns loglikelihood of each point in X_test

    Input:
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)
        Y_test : labels (-1 or +1) (n)

    Output:
        loglikelihood of each point in X_test (n): for every row, the sum
        over features of log p(x_alpha | y), where y is that row's label
    """
    is_pos = Y_test == 1
    is_neg = Y_test == -1

    # Per-row probability that each feature is ON, chosen by the row's label.
    # Rows whose label is neither +1 nor -1 keep a probability of zero.
    p_on = np.zeros(X_test.shape)
    p_on[is_pos] = posprob
    p_on[is_neg] = negprob

    # Per-row probability that each feature is OFF (the complement).
    p_off = np.zeros(X_test.shape)
    p_off[is_pos] = 1.0 - posprob
    p_off[is_neg] = 1.0 - negprob

    # A feature equal to 0 contributes its "off" probability; any other
    # value contributes value * "on" probability (i.e. p_on for a 1).
    per_feature = np.where(X_test == 0, p_off, X_test * p_on)

    # Work in log space and add up the features of each row.
    return np.sum(np.log(per_feature), axis=1)
    

In [37]:
def naivebayes_pred(pos, neg, posprob, negprob, X_test):
    """
    naivebayes_pred(pos, neg, posprob, negprob, X_test) returns the prediction of each point in X_test

    Input:
        pos: class probability for the positive class
        neg: class probability for the negative class
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)

    Output:
        prediction of each point in X_test (n): +1 where the positive class
        scores at least as high as the negative class, otherwise -1
    """
    n, _ = X_test.shape

    # Score every row under each hypothesis: log-likelihood of the features
    # given the class, plus the log of that class's prior.
    score_if_pos = loglikelihood(posprob, negprob, X_test, np.ones(n)) + np.log(pos)
    score_if_neg = loglikelihood(posprob, negprob, X_test, -1 * np.ones(n)) + np.log(neg)

    # Row 0 holds the positive-class scores, row 1 the negative-class scores;
    # argmax down the columns picks the winning row index for each sample
    # (ties resolve to index 0, i.e. the positive class).
    winner = np.argmax(np.vstack((score_if_pos, score_if_neg)), axis=0)

    # Map winning index to label: 0 -> +1, 1 -> -1.
    return 1 - 2 * winner

In [38]:
# Tiny hand-checkable example: two girls (-1) followed by two boys (+1).
testX = np.array([[1,1,1],[0,1,1],[1,1,0],[0,0,0]])
testY = np.array([-1,-1,1,1])
testpos, testneg = naivebayesPY(testX,testY)
test_posprob, test_negprob = naivebayesPXY(testX,testY)

# Expected estimates for this example:
#   testpos, testneg = 0.5, 0.5
#   test_posprob     = [0.5,  0.5,  0.25]
#   test_negprob     = [0.5,  0.75, 0.75]
# Use the priors that were just estimated rather than hard-coded constants.
test_pred = naivebayes_pred(testpos, testneg, test_posprob, test_negprob, testX)

# The classifier should reproduce the training labels on this example.
assert np.array_equal(test_pred, testY), "sanity check failed: predictions differ from testY"
test_pred

array([-1, -1,  1,  1])

In [39]:
# Width of the feature vectors; hashfeatures1 produces 4 features.
DIMS = 4

print('Loading data ...')
X,Y = genTrainFeatures(DIMS)

print('Training classifier ...')
pos, neg = naivebayesPY(X, Y)
posprob, negprob = naivebayesPXY(X, Y)

# Fraction of training names whose predicted label differs from the true one.
error = np.mean(naivebayes_pred(pos, neg, posprob, negprob, X) != Y)
print('Training error: %.2f%%' % (100 * error))

# Interactive loop: classify names until the user submits an empty line.
while True:
    print('Please enter a baby name (press enter with empty box to stop prompt)>')
    # Strip surrounding whitespace so "Emily " is judged on its real last
    # letter and a whitespace-only entry ends the prompt.
    yourname = input().strip()
    if len(yourname) < 1:
        break
    xtest,numnames = name2features(yourname,d=DIMS,LoadFile=False)
    pred = naivebayes_pred(pos, neg, posprob, negprob, xtest)
    # pred is an array with one entry per name; exactly one name was entered.
    if pred[0] > 0:
        print("%s, I am sure you are a baby boy.\n" % yourname)
    else:
        print("%s, I am sure you are a baby girl.\n" % yourname)

Loading data ...
Training classifier ...
Training error: 28.60%
Please enter a baby name (press enter with empty box to stop prompt)>
Emily
Emily, I am sure you are a baby girl.

Please enter a baby name (press enter with empty box to stop prompt)>
Rachel
Rachel, I am sure you are a baby boy.

Please enter a baby name (press enter with empty box to stop prompt)>

