In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC 
from sklearn.metrics import classification_report


In [32]:
# Read the census CSV and keep only the rows recorded for the 1900 census,
# then show the per-column non-null counts of that subset.
censuses_data = (
    pd.read_csv("./CSV/censuses.csv")
    .loc[lambda frame: frame["census_year"] == 1900]
)
censuses_data.count()

line_number               808
house_number              808
street_name               808
last_name                 808
given_name                808
relation                  808
color_race                808
sex                       808
age_at_last_birthday      808
occupation                808
own_or_rent               808
owned_free_or_mortgage    808
house_or_farm             808
my_notes                  808
file_name                 808
census_year               808
dtype: int64

In [33]:
# Flag rows whose occupation text contains 'None' or 'Null' and set them aside
# (later we can test our model against these unlabelled cases).
occupation_missing_mask = (
    censuses_data["occupation"].str.contains('None')
    | censuses_data["occupation"].str.contains('Null')
)
df_missing_occupation = censuses_data.loc[occupation_missing_mask]
df_missing_occupation.count()

line_number               321
house_number              321
street_name               321
last_name                 321
given_name                321
relation                  321
color_race                321
sex                       321
age_at_last_birthday      321
occupation                321
own_or_rent               321
owned_free_or_mortgage    321
house_or_farm             321
my_notes                  321
file_name                 321
census_year               321
dtype: int64

In [34]:
# Build the feature matrix X and the target y from the rows that DO have a
# labelled 'occupation' value.

# Complement of the missing-occupation mask -> every remaining row is labelled.
df_all_occupation = censuses_data.loc[~occupation_missing_mask]

# Candidate features that MAY influence the occupation field:
# relation, color_race, sex, age_at_last_birthday
X = df_all_occupation.loc[
    :, ["relation", "color_race", "sex", "age_at_last_birthday"]
]
y = df_all_occupation.loc[:, "occupation"]
print(X.shape, y.shape)

(487, 4) (487,)


In [35]:
# One-hot encode the categorical features (relation, color_race, sex);
# age_at_last_birthday is passed through unchanged.
#
# The encoded frame is rebuilt from df_all_occupation instead of from X itself:
# feeding X back into get_dummies makes the cell fail with a KeyError when it is
# run a second time, because the raw categorical columns no longer exist in X.
X = pd.get_dummies(
    df_all_occupation[["relation", "color_race", "sex", "age_at_last_birthday"]],
    columns=["relation", "color_race", "sex"],
)
X.head()

Unnamed: 0,age_at_last_birthday,relation_Adopted Daughter,relation_Boarder,relation_Brother,relation_Brother-in-law,relation_Cousin,relation_Daughter,relation_Daughter-in-law,relation_Father,relation_Granddaughter,...,relation_Son,relation_Son-in-law,relation_Step-Daughter,relation_Stepdaughter,relation_Stepson,relation_Wife,color_race_B,color_race_W,sex_F,sex_M
0,26,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,27,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,25,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,8,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [36]:
# Hold out one third of the labelled rows for testing; the fixed random_state
# keeps the partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=1,
)

In [37]:
# Linear-kernel support vector classifier trained on the training split.
# The fit call stays the last expression so the fitted estimator is displayed.
model = SVC(kernel='linear')
model.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [38]:
# Mean accuracy of the fitted model on the held-out test split.
print('Test Acc: %.3f' % (model.score(X_test, y_test),))

Test Acc: 0.528


In [None]:
# Per-class precision / recall / F1 on the held-out test split.
# The original call passed target_names=target_names, but no such variable is
# defined in the cells above, so the cell raised NameError. The targets are
# already the occupation strings, so the report can label its rows from
# y_test / predictions directly and no target_names argument is needed.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))