# Logistic Regression and Naive Bayes on canned data from UCI
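Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data (the code below reads a local copy named `adult.data.csv` whose header row names the columns, e.g. `workclass`, `occupation`, `native_country`, `marital_status`).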

# Set up

Run this cell first. It imports the required modules, then loads and preprocesses the data.

In [1]:
import pandas as pd
from IPython.display import display

# Load the UCI Adult data set from its local CSV copy.
adult = pd.read_csv('adult.data.csv')

# Keep only the rows in which none of workclass, occupation and
# native_country holds the '?' placeholder. Cells are stripped before the
# comparison so that whitespace padding around the marker does not hide it.
adult = adult[
    (adult['workclass'].str.strip() != '?')
    & (adult['occupation'].str.strip() != '?')
    & (adult['native_country'].str.strip() != '?')
]

# Marital status is the prediction target; every other column is an input.
labels = adult['marital_status']
features = adult.drop('marital_status', axis=1)

# Expand the categorical feature columns into one-hot indicator columns.
features_processed = pd.get_dummies(features)

# Expose plain arrays for scikit-learn: x holds the features, y the labels.
x, y = features_processed.values, labels.values


# Logistic Regression

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Instantiate the model with scikit-learn's default settings
logreg = LogisticRegression()

# Hold out 40% of the rows as a test set. The fixed random_state makes the
# shuffle pick the same split on every run, so results can be compared
# between runs and between the models trained on this split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0)

# Fit the model on the training portion only
logreg.fit(x_train, y_train)

# Mean accuracy on the training set, then on the held-out test set
print(logreg.score(x_train, y_train))
print(logreg.score(x_test, y_test))

0.693043045809
0.687360132615


# Naive Bayes

In [3]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the training split created
# in the logistic regression cell (fit() returns the estimator itself).
clf = MultinomialNB().fit(x_train, y_train)

# Report mean accuracy on the training set, then on the test set.
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))

0.645189810466
0.646166597596
