In [17]:
import hashlib
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [9]:
# From Cornell course CIS532 project content:
# Purpose of function is to convert a name into a feature vector
def hashfeatures(baby, d, FIX, debug=False):
    """
    Hash the leading and trailing character chunks of a name into a 0/1 vector.

    For every m in 1..FIX, the first m characters (tagged with a trailing ">")
    and the last m characters (tagged with a leading "<") are each mapped to an
    index in [0, d), and that entry of the vector is set to 1.

    The index is derived from a SHA-256 digest of the UTF-8 encoded chunk
    instead of the built-in hash(): hash() of a str is salted per interpreter
    process, so it would yield different feature vectors after every kernel
    restart and make results impossible to reproduce.

    Input:
        baby : a string representing the baby's name to be hashed

        d: the number of dimensions to be in the feature vector

        FIX: the number of chunks to extract and hash from each string

        debug: a bool for printing debug values (default False)

    Output:
        v: a feature vector representing the input string
           (numpy array of length d whose entries are 0.0 or 1.0)
    """

    def bucket(chunk):
        # The first 8 digest bytes are ample entropy for an index modulo d.
        digest = hashlib.sha256(chunk.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % d

    v = np.zeros(d)
    for m in range(1, FIX + 1):
        # ">" / "<" markers keep a prefix and a suffix with the same letters
        # from hashing to the same chunk.
        prefix = baby[:m] + ">"
        P = bucket(prefix)
        v[P] = 1

        suffix = "<" + baby[-m:]
        S = bucket(suffix)
        v[S] = 1

        if debug:
            print(f"Split {m}/{FIX}:\t({prefix}, {suffix}),\t1s at indices [{P}, {S}]")
    if debug:
        print(f"Feature vector for {baby}:\n{v.astype(int)}\n")
    return v

In [10]:
# From Cornell course CIS532 project content:
def name2features(filename, d=128, FIX=3, LoadFile=True, debug=False):
    """
    Turn a collection of baby names into a matrix of hashed feature vectors.

    Input:
        filename: path of a text file with one name per line when LoadFile is
            True; otherwise the names themselves as a newline-separated string
        d: dimension of each feature vector (default 128)
        FIX: number of prefix/suffix chunks hashed per name (default 3)
        LoadFile: whether `filename` is a path to be read (default True)
        debug: if True, also return the list of names (default False)

    Output:
        X : n feature vectors of dimension d, (nxd)
        (X, babynames) instead of X when debug is True
    """

    # read in baby names
    if LoadFile:
        with open(filename, 'r') as f:
            # Strip first, then filter: a raw line still carries its newline,
            # so checking the unstripped line's length never drops blank lines
            # and they would end up as rows for the empty name "".
            stripped = (line.rstrip() for line in f)
            babynames = [name for name in stripped if name]
    else:
        babynames = filename.split('\n')

    n = len(babynames)
    X = np.zeros((n, d))
    for i, name in enumerate(babynames):
        X[i, :] = hashfeatures(name, d, FIX)
    return (X, babynames) if debug else X

In [12]:
# From Cornell course CIS532 project content:
# TODO: recreate/generate the boys.train and girls.train files before running this code.
# The demo below is kept inside a string literal so that it does not execute.

"""
Xboys, namesBoys = name2features("boys.train", d=128, FIX=3, debug=True)
Xgirls, namesGirls = name2features("girls.train", d=128, FIX=3, debug=True)
X = np.concatenate([Xboys[:20], Xgirls[:20]], axis=0)

plt.figure(figsize=(20, 8))
ax = sns.heatmap(X.astype(int), cbar=False)
ax.set_xlabel('feature indices')
ax.set_ylabel('baby names')
ticks = ax.set_yticks(np.arange(40, dtype=int))
ticklabels = ax.set_yticklabels(namesBoys[:20] + namesGirls[:20])
plt.show()
"""

# Certain feature indices (columns of the heatmap) might be white for many names because (1) many names have
# that feature in common or (2) hash collisions are occurring due to an insufficiently large d value.

def genTrainFeatures(dimension=128):
    """
    Build a shuffled, labelled training set from the two name files.

    Reads "girls.train" and "boys.train" through name2features, stacks the
    resulting feature rows, labels them, and returns both in one common random
    order drawn from numpy's global random state.

    Input:
        dimension: desired dimension of the features
    Output:
        X: n feature vectors of dimensionality d (nxd)
        Y: n labels (-1 = girl, +1 = boy) (n)
    """

    # Featurize each file; girls are stacked first, boys second
    girl_feats = name2features("girls.train", d=dimension)
    boy_feats = name2features("boys.train", d=dimension)
    features = np.concatenate([girl_feats, boy_feats])

    # Labels follow the same girls-then-boys order as the rows
    girl_labels = -np.ones(len(girl_feats))
    boy_labels = np.ones(len(boy_feats))
    labels = np.concatenate([girl_labels, boy_labels])

    # One random ordering, applied to rows and labels alike
    order = np.random.permutation(list(range(len(labels))))

    return features[order, :], labels[order]

In [13]:
# From Cornell course CIS532 project content:
"""
X, Y = genTrainFeatures(128)
print(f'Shape of training data: {X.shape}')
print(f'X:\n{X.astype(int)}')
print(f'Y:\n{Y.astype(int)}')
"""

"\nX, Y = genTrainFeatures(128)\nprint(f'Shape of training data: {X.shape}')\nprint(f'X:\n{X.astype(int)}')\nprint(f'Y:\n{Y.astype(int)}')\n"

In [18]:
# MY CODE
def naivebayesPY(X, Y):
    """
    naivebayesPY(X, Y) returns [pos,neg]

    Computation of the class priors P(Y), estimated as label frequencies after
    appending one extra -1 label and one extra +1 label ("plus-one smoothing").

    Input:
        X : n input vectors of d dimensions (nxd); not used by this function
        Y : n labels (-1 or +1), either a flat (n) vector or an (nx1) column

    Output:
        pos: probability p(y=1)
        neg: probability p(y=-1)
    """

    # Flatten so that an (nx1) column vector can be concatenated with the
    # 1-D smoothing labels below; a 2-D Y would otherwise raise in concatenate.
    labels = np.asarray(Y).ravel()

    # add one positive and negative example to avoid division by zero ("plus-one smoothing")
    labels = np.concatenate([labels, [-1, 1]])
    n = len(labels)

    # Count the +1 labels directly; everything else is treated as the -1 class
    pos_count = np.count_nonzero(labels == 1)

    pos = pos_count / n
    neg = (n - pos_count) / n
    return [pos, neg]

In [19]:
# Tests for naivebayesPY
# (naivebayesPY does not use its X argument, so the label vector is passed for both)
testY1 = np.array([-1,1,-1,1,-1,1,1]) # one more +1 than -1, so sum will be +1; after smoothing: 5/9 and 4/9
naivebayesPY(testY1, testY1)

[0.5555555555555556, 0.4444444444444444]

In [20]:
testY2 = np.array([-1,-1,1]) # one more -1 than +1, so sum will be -1; after smoothing: 2/5 and 3/5
naivebayesPY(testY2, testY2)

[0.4, 0.6]

In [21]:
testY3 = np.array([1,1,1]) # all +1, so sum will be +3; smoothing adds one -1 and one +1, giving 4/5 and 1/5
naivebayesPY(testY3, testY3)

[0.8, 0.2]

In [22]:
def naivebayesPXY(X,Y):
    """
    naivebayesPXY(X, Y) returns [posprob,negprob]

    Estimates, per feature, how often that feature is "hot" within each class.
    Before counting, every class receives two extra rows -- one all-ones and
    one all-zeros -- so no class is empty ("plus-one smoothing").

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
            probability that feature alpha is 1 (observed), given that the label is +1 (boy)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
            probability that feature alpha is 1 (observed), given that the label is -1 (girl)
    """

    _, n_features = X.shape

    # Smoothing rows: ones, ones, zeros, zeros paired with labels -1, +1, -1, +1,
    # i.e. each class gets exactly one all-ones and one all-zeros observation.
    smoothing_rows = np.concatenate([np.ones((2, n_features)), np.zeros((2, n_features))])
    X_aug = np.concatenate([X, smoothing_rows])
    Y_aug = np.concatenate([Y, [-1, 1, -1, 1]])

    def hot_fraction(label):
        # Select the rows of one class, sum down each column, divide by class size
        rows = X_aug[Y_aug == label]
        return rows.sum(axis=0) / rows.shape[0]

    posprob = hot_fraction(1)    # boys, Y = +1
    negprob = hot_fraction(-1)   # girls, Y = -1

    return [posprob, negprob]

In [23]:
# Test for naivebayesPXY
# NOTE: testX is not binary (values 0..4), so the returned per-class column
# averages can exceed 1; this only exercises the class-wise averaging.
testX = np.array([[0,0,0],[1,1,1],[2,2,2],[3,3,3],[4,4,4]])
testY = np.array([-1,1,-1,1,-1])
naivebayesPXY(testX,testY)

[array([1.25, 1.25, 1.25]), array([1.4, 1.4, 1.4])]

In [24]:
"""
X, Y = genTrainFeatures(128)
posprob, negprob = naivebayesPXY(X, Y)
probs = pd.DataFrame({'feature': np.arange(128, dtype=int), 'boys': posprob, 'girls': negprob})

plt.figure(figsize=(20, 4))
ax = sns.lineplot(x='feature', y='value', hue='variable', data=pd.melt(probs, ['feature']))
ax.set_xlabel('feature indices')
ax.set_ylabel('probability')
plt.show()
"""

"\nX, Y = genTrainFeatures(128)\nposprob, negprob = naivebayesPXY(X, Y)\nprobs = pd.DataFrame({'feature': np.arange(128, dtype=int), 'boys': posprob, 'girls': negprob})\n\nplt.figure(figsize=(20, 4))\nax = sns.lineplot(x='feature', y='value', hue='variable', data=pd.melt(probs, ['feature']))\nax.set_xlabel('feature indices')\nax.set_ylabel('probability')\nplt.show()\n"

In [25]:
def loglikelihood(posprob, negprob, X_test, Y_test):
    """
    loglikelihood(posprob, negprob, X_test, Y_test) returns loglikelihood of each point in X_test

    For a point x with label y, the value is log p(x | y) under the naive Bayes
    (independent Bernoulli features) model:

        sum_alpha  x_alpha * log(theta_alpha) + (1 - x_alpha) * log(1 - theta_alpha)

    where theta is posprob when y = +1 and negprob when y = -1.

    The labels only select which class-conditional distribution is evaluated;
    they need not be ground truth. To score the same points under both classes
    (e.g. for prediction), call this once with all +1 labels and once with
    all -1 labels.

    Input:
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)
        Y_test : labels (-1 or +1) (n)

    Output:
        loglikelihood of each point in X_test (n)
        (entries whose label is neither +1 nor -1 are left at 0)
    """

    X_test = np.asarray(X_test, dtype=float)
    Y_test = np.asarray(Y_test).ravel()
    n = X_test.shape[0]
    result = np.zeros(n)

    # Probabilities are expected to lie strictly between 0 and 1 (as produced
    # by a smoothed estimate); exact 0 or 1 would give log(0) = -inf.
    for label, probs in ((1, posprob), (-1, negprob)):
        probs = np.asarray(probs, dtype=float)
        rows = Y_test == label
        hot = X_test[rows]
        # Features that are on contribute log(theta); features that are off
        # contribute log(1 - theta).
        result[rows] = hot @ np.log(probs) + (1 - hot) @ np.log(1 - probs)

    return result
    

In [26]:
def naivebayes_pred(pos, neg, posprob, negprob, X_test):
    """
    naivebayes_pred(pos, neg, posprob, negprob, X_test) returns the prediction of each point in X_test

    Each point is scored under both classes with the naive Bayes (independent
    Bernoulli features) model, log p(y) + log p(x | y), and assigned the class
    with the larger score. Ties go to the negative class.

    Input:
        pos: class probability for the positive class, p(y=+1)
        neg: class probability for the negative class, p(y=-1)
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd); a single (d) vector is treated as one row

    Output:
        prediction of each point in X_test (n), as +1.0 or -1.0
    """
    X = np.atleast_2d(np.asarray(X_test, dtype=float))

    def joint_log_prob(prior, probs):
        # log p(y) + sum over features of the Bernoulli log-probability.
        # probs is expected to lie strictly inside (0, 1), as a smoothed
        # estimate does; exact 0 or 1 would give log(0) = -inf.
        probs = np.asarray(probs, dtype=float)
        return np.log(prior) + X @ np.log(probs) + (1 - X) @ np.log(1 - probs)

    # Working with the difference of log scores avoids underflow from
    # multiplying many small probabilities.
    log_ratio = joint_log_prob(pos, posprob) - joint_log_prob(neg, negprob)
    return np.where(log_ratio > 0, 1.0, -1.0)

In [27]:
"""
DIMS = 128
print('Loading data ...')
X,Y = genTrainFeatures(DIMS)
print('Training classifier ...')
pos, neg = naivebayesPY(X, Y)
posprob, negprob = naivebayesPXY(X, Y)
error = np.mean(naivebayes_pred(pos, neg, posprob, negprob, X) != Y)
print('Training error: %.2f%%' % (100 * error))

while True:
    print('Please enter a baby name (press enter with empty box to stop prompt)>')
    yourname = input()
    if len(yourname) < 1:
        break
    xtest = name2features(yourname,d=DIMS,LoadFile=False)
    pred = naivebayes_pred(pos, neg, posprob, negprob, xtest)
    if pred > 0:
        print("%s, I am sure you are a baby boy.\n" % yourname)
    else:
        print("%s, I am sure you are a baby girl.\n" % yourname)
"""

'\nDIMS = 128\nprint(\'Loading data ...\')\nX,Y = genTrainFeatures(DIMS)\nprint(\'Training classifier ...\')\npos, neg = naivebayesPY(X, Y)\nposprob, negprob = naivebayesPXY(X, Y)\nerror = np.mean(naivebayes_pred(pos, neg, posprob, negprob, X) != Y)\nprint(\'Training error: %.2f%%\' % (100 * error))\n\nwhile True:\n    print(\'Please enter a baby name (press enter with empty box to stop prompt)>\')\n    yourname = input()\n    if len(yourname) < 1:\n        break\n    xtest = name2features(yourname,d=DIMS,LoadFile=False)\n    pred = naivebayes_pred(pos, neg, posprob, negprob, xtest)\n    if pred > 0:\n        print("%s, I am sure you are a baby boy.\n" % yourname)\n    else:\n        print("%s, I am sure you are a baby girl.\n" % yourname)\n'

In [28]:
def name2features2(filename, d=128, FIX=3, LoadFile=True):
    """
    Placeholder for a custom feature extractor.

    Loads the names and fills X with the same hashed prefix/suffix features
    that hashfeatures produces, then raises NotImplementedError where the
    custom feature code is meant to go.

    Input:
        filename: path of a text file with one name per line when LoadFile is
            True; otherwise the names themselves as a newline-separated string
        d: dimension of each feature vector (default 128)
        FIX: number of prefix/suffix chunks hashed per name (default 3)
        LoadFile: whether `filename` is a path to be read (default True)

    Output:
        X : n feature vectors of dimension d, (nxd)

    Raises:
        NotImplementedError: always, until the placeholder below is replaced
    """
    # read in baby names
    if LoadFile:
        with open(filename, 'r') as f:
            # Strip first, then filter: a raw line still carries its newline,
            # so checking the unstripped line's length never drops blank lines.
            stripped = (line.rstrip() for line in f)
            babynames = [name for name in stripped if name]
    else:
        babynames = filename.split('\n')

    X = np.zeros((len(babynames), d))
    for i, name in enumerate(babynames):
        X[i, :] = hashfeatures(name, d, FIX)

    # YOUR CODE HERE
    raise NotImplementedError()
    # Unreachable until the raise above is replaced with real feature code.
    return X