# Naive Bayes on uncanned data from US Census Bureau 
Table taken from page 7 of report (https://www.census.gov/content/dam/Census/library/publications/2015/acs/acs-30.pdf)

# Set up and initializations

Run this at the start.

In [7]:
import pandas as pd
from IPython.display import display

# Dictionary that maps from abbreviation to feature in table (i.e. r0 = 'White alone')
from dictionary import dictionary as dictt 

# Initializing tables
# Each CSV under data/ is loaded into its own pandas DataFrame. jointP reads
# these as table[column][numMarriages], so each column is presumably indexed
# by marriage outcome (0 = never ... 3 = three or more) -- TODO confirm
# against the CSV files.
maritalStatus = pd.read_csv('data/maritalStatus.csv') 
race = pd.read_csv('data/race.csv') 
nativity = pd.read_csv('data/nativity.csv')   
education = pd.read_csv('data/education.csv') 
employment = pd.read_csv('data/employment.csv') 
income = pd.read_csv('data/income.csv') 
tenure = pd.read_csv('data/tenure.csv')  

# Setting values for percentages of each marriage status
# mStats[k] is used as the starting value for marriage outcome k in predict
# and as the divisor in conditionalP.
mStats = maritalStatus['Percentage']

# Helper Methods

```jointP(event, numMarriages)``` returns the joint probability of an event and a marriage outcome. It expects the following as inputs:
1. `event` is a string abbreviation of a subfeature (e.g. `r0` --> 'White alone')
2. `numMarriages` is an int giving the number of marriages (0 for never, 1 for once, 2 for twice, 3 for three times or more)

```conditionalP(event, numMarriages)``` returns the conditional probability of an event given a marriage outcome, computed as the joint probability divided by the probability of that marriage outcome. It expects the same inputs as ```jointP```.

In [8]:
# Returns the joint probability of an event and a marriage outcome
def jointP(event, numMarriages):
    """Look up the table value for a feature abbreviation and a marriage count.

    event must be a key of dictt; the value it maps to names the column to
    read. A two-character abbreviation selects its table by the first
    character (r, n, i, t); any other length selects by the first two
    characters (ed, em). numMarriages indexes into the chosen column.

    Returns the looked-up value, or the string
    "Unacceptable input. Feature not found." when the prefix matches no table.

    NOTE(review): the fallback is a string, not a number, so callers that do
    arithmetic on the result (conditionalP divides it) will not get a
    probability in that case.
    """
    # Resolve the abbreviation first so an unknown event fails on the lookup
    column = dictt[event]
    not_found = "Unacceptable input. Feature not found."

    if len(event) == 2:
        # Single-letter prefix followed by one index character
        key = event[0]
        tables = {'r': race, 'n': nativity, 'i': income, 't': tenure}
    else:
        # Two-letter prefix
        key = event[:2]
        tables = {'ed': education, 'em': employment}

    if key not in tables:
        return not_found
    return tables[key][column][numMarriages]
    
# Returns the conditional probability of an event given a marriage outcome
def conditionalP(event, numMarriages):
    """Divide the joint value for (event, numMarriages) by mStats[numMarriages].

    Takes the same arguments as jointP and returns the quotient.
    """
    joint = jointP(event, numMarriages)
    outcome_share = mStats[numMarriages]
    return joint / outcome_share


# Prediction

Implements a Naive Bayes algorithm to predict the probabilities of a person being married once, twice, three or more times, or never based on his/her background in race, nativity, education, employment, income, and/or tenure.

In [27]:
from Queue import PriorityQueue

# Prints out probabilities of all the marriage outcomes in ASCENDING order given a person's background
# features is an array of string abbreviations of a person's background
def predict(features):
    """Print the Naive Bayes probability of each marriage outcome.

    features is an iterable of feature abbreviations accepted by
    conditionalP (for example ['r0', 'n0', 'ed3']). It may be empty, in
    which case only the mStats values contribute.

    For every outcome k the score starts at mStats[k] and is multiplied by
    conditionalP(f, k) for each feature f. Scores are then normalised so the
    four outcomes sum to 100 and rounded to two decimals.

    Prints one (percentage, label) tuple per line, lowest percentage first,
    followed by a line containing a newline. Returns None.
    """
    # Labels are positioned so that the index equals the numMarriages value
    outcomes = ('never', 'once', 'twice', 'three')

    # Initializing relative probabilities for the 4 marriage outcomes
    scores = [mStats[k] for k, _ in enumerate(outcomes)]

    # Fold each feature's conditional probability into every outcome's score
    for f in features:
        scores = [s * conditionalP(f, k) for k, s in enumerate(scores)]

    # Normalise to percentages. float() keeps the printed tuples as plain
    # numbers even when the table lookups yield NumPy scalar types.
    total = sum(scores)
    ranked = sorted(
        (round(float(s / total * 100), 2), label)
        for s, label in zip(scores, outcomes)
    )

    # Tuples sort by percentage first, then label, so this is ascending order.
    # print(...) with a single argument behaves the same in Python 2 and 3.
    for entry in ranked:
        print(entry)
    print("\n")

# Test

Below are feature vectors used to test the model. Person 2 is based on a real person; the others are made up. Person 4 supplies only some of the features.

In [37]:
# Person 1 -- two or more races, native, less than high school, employed, income over $100k, rents home
print("Predicting person who's 2 or more race, native, less than high school, employed, income over $100k, and rents home:")
predict(['r7', 'n0', 'ed0', 'em0', 'i4', 't1'])

# Person 2 -- Hugh Hefner
print("Predicting person who's white alone, native, bachelor's degree, employed, income over $100k, and owns home:")
print("In reality, this person has been married 3 times.")
predict(['r0', 'n0', 'ed3', 'em0', 'i4', 't0'])

# Person 3 -- Hispanic (of any race), foreign born, less than high school, unemployed, income under $25k, rents home
print("Predicting person who's hispanic (of any race), foreign born, less than high school, unemployed, income less than $25k, and rents home:")
predict(['r8', 'n1', 'ed0', 'em1', 'i0', 't1'])

# Person 4 -- partial vector: American Indian and Alaska Native alone, native, high school graduate, income $75,000 to $99,999
print("Prediction person who's American Indian and Alaska Native alone, native, high school graduate, and income ranges from $75,000 to $99,999")
predict(['r3', 'n0', 'ed1', 'i3'])

Predicting person who's 2 or more race, native, less than high school, employed, income over $100k, and rents home:
(2.33, 'three')
(9.08, 'twice')
(41.6, 'once')
(46.98, 'never')


Predicting person who's white alone, native, bachelor's degree, employed, income over $100k, and owns home:
In reality, this person has been married 3 times.
(2.22, 'three')
(3.96, 'never')
(14.99, 'twice')
(78.84, 'once')


Predicting person who's hispanic (of any race), foreign born, less than high school, unemployed, income less than $25k, and rents home:
(0.12, 'three')
(1.03, 'twice')
(11.69, 'once')
(87.16, 'never')


Prediction person who's American Indian and Alaska Native alone, native, high school graduate, and income ranges from $75,000 to $99,999
(6.07, 'three')
(17.41, 'never')
(20.36, 'twice')
(56.16, 'once')


