In [1]:
import sys
import zlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [9]:
# From Cornell course CIS532 project content:
# Purpose of function is to convert a name into a feature vector
# This function extracts features in the form of prefixes and suffixes, and creates a hashed 1-hot encoding

def hashfeatures(baby, d, FIX, debug=False):
    """
    Convert a name into a hashed one-hot feature vector built from its prefixes and suffixes.

    For every m in 1..FIX, the first m characters (tagged with a trailing ">") and the
    last m characters (tagged with a leading "<") are each hashed into one of d buckets,
    and that bucket is set to 1. Colliding chunks simply leave the bucket at 1. When m
    exceeds the length of the name, the slice is the whole name.

    The bucket index is derived from zlib.crc32 over the UTF-8 bytes of the chunk instead
    of the built-in hash(): str hashes are salted per interpreter process, so hash() would
    place the same name in different buckets after every kernel restart, making features
    (and anything trained on them) irreproducible.

    Input:
        baby : a string representing the baby's name to be hashed

        d: the number of dimensions to be in the feature vector

        FIX: the number of chunks to extract and hash from each end of the string

        debug: a bool for printing debug values (default False)

    Output:
        v: a feature vector of length d representing the input string (entries are 0 or 1)
    """

    v = np.zeros(d)
    for m in range(1, FIX + 1):
        # The ">" / "<" markers keep a prefix and a suffix with the same letters distinct.
        prefix = baby[:m] + ">"
        P = zlib.crc32(prefix.encode("utf-8")) % d
        v[P] = 1

        suffix = "<" + baby[-m:]
        S = zlib.crc32(suffix.encode("utf-8")) % d
        v[S] = 1

        if debug:
            print(f"Split {m}/{FIX}:\t({prefix}, {suffix}),\t1s at indices [{P}, {S}]")
    if debug:
        print(f"Feature vector for {baby}:\n{v.astype(int)}\n")
    return v

In [2]:
# From Cornell course CIS532 project content:
# This function opens a file and calls the hashfeatures function on it to convert its contents into feature vectors
def name2features(filename, d=128, FIX=3, LoadFile=True, debug=False):
    """
    Load a list of names and convert each one into a hashed feature vector.

    Input:
        filename : path of a text file with one name per line (LoadFile=True), or the
                   names themselves as a newline-separated string (LoadFile=False)
        d        : number of dimensions of each feature vector
        FIX      : number of prefix/suffix chunks hashed per name (passed to hashfeatures)
        LoadFile : whether `filename` is a path to read (True) or the raw names (False)
        debug    : if True, also return the list of names; it is not forwarded to
                   hashfeatures, so no per-name debug output is printed

    Output:
        X : n feature vectors of dimension d, (nxd)
        (X, babynames) when debug is True
    """

    # read in baby names
    if LoadFile:
        with open(filename, 'r') as f:
            # Strip before filtering: each raw line still carries its newline, so testing
            # the raw length would let blank lines through as empty names.
            stripped = (line.rstrip() for line in f)
            babynames = [name for name in stripped if name]
    else:
        babynames = filename.split('\n')

    n = len(babynames)
    X = np.zeros((n, d))
    for i, name in enumerate(babynames):
        X[i, :] = hashfeatures(name, d, FIX)
    return (X, babynames) if debug else X

In [3]:
def naivebayesPY(X, Y):
    """
    naivebayesPY(X, Y) returns [pos,neg]

    Computation of the class priors P(Y) with plus-one smoothing.

    Input:
        X : n input vectors of d dimensions (nxd); not used by this function, kept so the
            signature parallels naivebayesPXY
        Y : n labels (-1 or +1), given either as a flat vector (n) or a column (nx1)

    Output:
        pos: probability p(y=1)
        neg: probability p(y=-1)
    """

    # Flatten first so a column vector (nx1) can be concatenated with the 1-D
    # smoothing labels below instead of raising a dimension-mismatch error.
    Y = np.asarray(Y).ravel()

    # add one positive and negative example to avoid division by zero ("plus-one smoothing")
    Y = np.concatenate([Y, [-1, 1]])
    n = len(Y)
    arrsum = np.sum(Y)

    # With labels restricted to -1/+1:
    #   arrsum = posCount - negCount
    #   n      = posCount + negCount
    # Adding the two equations and solving gives posCount = (n + arrsum) / 2.
    posCount = (n + arrsum) / 2

    pos = posCount / n
    neg = (n - posCount) / n
    return [pos, neg]

In [4]:
# Tests for naivebayesPY
# (naivebayesPY never reads X, so the label vector is simply passed for both arguments)
testY1 = np.array([-1,1,-1,1,-1,1,1]) # one more +1 than -1, so sum will be +1
# after the two smoothing labels are appended, 5 of the 9 labels are +1
naivebayesPY(testY1, testY1)

[0.5555555555555556, 0.4444444444444444]

In [5]:
testY2 = np.array([-1,-1,1]) # one more -1 than +1, so sum will be -1
# after the two smoothing labels are appended, 2 of the 5 labels are +1 and 3 are -1
naivebayesPY(testY2, testY2)

[0.4, 0.6]

In [6]:
testY3 = np.array([1,1,1]) # every label is +1, so sum will be +3
# smoothing still appends one -1 label, so p(y=-1) comes out as 1/5 rather than 0
naivebayesPY(testY3, testY3)

[0.8, 0.2]

In [7]:
def naivebayesPXY(X, Y):
    """
    naivebayesPXY(X, Y) returns [posprob,negprob]

    Estimates, per class, the fraction of examples in which each feature is hot,
    after plus-one smoothing.

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1), given either as a flat vector (n) or a column (nx1)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
            probability that feature alpha is 1 (observed), given that the label is +1 (boy)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
            probability that feature alpha is 1 (observed), given that the label is -1 (girl)
    """

    X = np.asarray(X)
    # Flatten so a column vector (nx1) works with the 1-D smoothing labels and the
    # boolean row masks below.
    Y = np.asarray(Y).ravel()
    d = X.shape[1]

    # add one positive and negative example to avoid division by zero ("plus-one smoothing"):
    # each class receives one all-ones row and one all-zeros row, so every estimate
    # for binary features ends up strictly between 0 and 1.
    X = np.concatenate([X, np.ones((2, d)), np.zeros((2, d))])
    Y = np.concatenate([Y, [-1, 1, -1, 1]])

    # Boys, Y=1: sum down the columns to count how often each feature is hot,
    # then divide by the number of boys.
    boys = X[Y == 1]
    posprob = np.sum(boys, axis=0) / boys.shape[0]

    # Girls, Y=-1: same computation on the other class.
    girls = X[Y == -1]
    negprob = np.sum(girls, axis=0) / girls.shape[0]

    return [posprob, negprob]

In [8]:
# Test for naivebayesPXY
# NOTE: the rows of testX are not binary (values 0..4), so the values returned below exceed 1
# and are not valid probabilities; this only checks the per-class column averaging,
# including the all-ones and all-zeros smoothing rows added to each class.
testX = np.array([[0,0,0],[1,1,1],[2,2,2],[3,3,3],[4,4,4]])
testY = np.array([-1,1,-1,1,-1])
naivebayesPXY(testX,testY)

[array([1.25, 1.25, 1.25]), array([1.4, 1.4, 1.4])]

In [9]:
def loglikelihood(posprob, negprob, X_test):
    """
    loglikelihood(posprob, negprob, X_test) returns the log-likelihood of each point in
    X_test under each of the two classes.

    Each feature is treated as an independent Bernoulli variable, matching the
    p(x_alpha = 1 | y) estimates that naivebayesPXY returns:

        log p(x | y) = sum_alpha [ x_alpha * log(p_alpha) + (1 - x_alpha) * log(1 - p_alpha) ]

    Input:
        posprob: conditional probabilities for the positive class (d), probability of feature given class +1
        negprob: conditional probabilities for the negative class (d), probability of feature given class -1
        X_test : features (nxd); a single point of shape (d) is also accepted

    Output:
        (llp, lln): a 2-tuple of vectors of length n
            llp: log p(x | y=+1) for each point (BOYS)
            lln: log p(x | y=-1) for each point (GIRLS)
    """

    # Notes:
    # Original function definition from class project had input Y_test, which are presumably labels for the test
    # points. Labels are not needed to score a point under both classes, so that input was removed.

    X_test = np.atleast_2d(np.asarray(X_test, dtype=float))
    absent = 1.0 - X_test  # 1 where a feature is NOT hot

    # Guard against log(0): probabilities of exactly 0 or 1 are nudged inside (0, 1).
    # Estimates from naivebayesPXY on binary features are already strictly inside.
    eps = np.finfo(float).eps
    p_pos = np.clip(np.asarray(posprob, dtype=float), eps, 1.0 - eps)
    p_neg = np.clip(np.asarray(negprob, dtype=float), eps, 1.0 - eps)

    # Probability of each point belonging in BOYS (+1)
    llp = X_test @ np.log(p_pos) + absent @ np.log(1.0 - p_pos)

    # Probability of each point belonging in GIRLS (-1)
    lln = X_test @ np.log(p_neg) + absent @ np.log(1.0 - p_neg)

    return (llp, lln)
    

In [11]:
def naivebayes_pred(pos, neg, posprob, negprob, X_test):
    """
    naivebayes_pred(pos, neg, posprob, negprob, X_test) returns the prediction of each point in X_test

    Each point is assigned the class with the larger log-posterior score
    log p(x | y) + log p(y); ties go to the positive class.

    Input:
        pos: class probability for the positive class, p(y=+1)
        neg: class probability for the negative class, p(y=-1)
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)

    Output:
        prediction of each point in X_test (n), as an array of +1 / -1 labels
    """
    # call loglikelihood to find two points of comparison.
    llp, lln = loglikelihood(posprob, negprob, X_test)

    # The likelihoods are in log space, so the class priors must be added as logs;
    # adding the raw probabilities would mix scales and skew the comparison.
    score_pos = llp + np.log(pos)
    score_neg = lln + np.log(neg)

    # compare element-wise, returning the class with the larger score
    return np.where(score_pos >= score_neg, 1, -1)

    # Notes:
    # Instead of loglikelihood returning a 2-tuple of nx1 vectors, it could return an nx2 vector, which would make it
    # easier to make the final prediction (use np.argmax). It would mean adding the class prior probability would
    # be slightly more complicated.