In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
censuses_data = pd.read_csv("./CSV/censuses.csv")

In [3]:
censuses_data.describe()
censuses_data = censuses_data[censuses_data["census_year"] == 1900]
censuses_data.count()

line_number               808
house_number              808
street_name               808
last_name                 808
given_name                808
relation                  808
color_race                808
sex                       808
age_at_last_birthday      808
occupation                808
own_or_rent               808
owned_free_or_mortgage    808
house_or_farm             808
my_notes                  808
file_name                 808
census_year               808
dtype: int64

In [4]:
# Separate out the occupation which are null/ none (LATER we can test our model again these cases)
occupation_missing_mask = (censuses_data["occupation"].str.contains('None')) | (censuses_data["occupation"].str.contains('Null'))
df_missing_occupation = censuses_data[occupation_missing_mask]
df_missing_occupation.count()

line_number               321
house_number              321
street_name               321
last_name                 321
given_name                321
relation                  321
color_race                321
sex                       321
age_at_last_birthday      321
occupation                321
own_or_rent               321
owned_free_or_mortgage    321
house_or_farm             321
my_notes                  321
file_name                 321
census_year               321
dtype: int64

In [5]:
# Prepare our X & y to create model with labelled 'occupation' column

# prepare new df where occupation column is All labelled
df_all_occupation = censuses_data[occupation_missing_mask == False]
# df_all_occupation.count()

# include following features that MAY influence occupation field -
# relation, color_race, sex, age_at_last_birthday
X = df_all_occupation[["relation","color_race","sex","age_at_last_birthday"]]
y = df_all_occupation["occupation"]
print(X.shape, y.shape)

(487, 4) (487,)


In [6]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)

In [6]:
# using get_dummies get binary coded for categorical data
X = pd.get_dummies(X, columns=["relation","color_race", "sex"])
X.head()

Unnamed: 0,age_at_last_birthday,relation_Adopted Daughter,relation_Boarder,relation_Brother,relation_Brother-in-law,relation_Cousin,relation_Daughter,relation_Daughter-in-law,relation_Father,relation_Granddaughter,...,relation_Son,relation_Son-in-law,relation_Step-Daughter,relation_Stepdaughter,relation_Stepson,relation_Wife,color_race_B,color_race_W,sex_F,sex_M
0,26,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,27,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,25,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,8,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [7]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

for label, original_class in zip(encoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: Servant
Encoded Label: 48
------------
Original Class: Drayman
Encoded Label: 19
------------
Original Class: Laundress
Encoded Label: 36
------------
Original Class: Servant
Encoded Label: 48
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: Day Laborer
Encoded Label: 17
------------
Original Class: Drayman
Encoded Label: 19
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: Janitor
Encoded Label: 33
------------
Original Class: Laundress
Encoded Label: 36
------------
Original Class: Laundress
Encoded Label: 36
------------
Original Class: Laundress
Encoded Label: 36
------------
Original Class: Servant
Encoded Label: 48
------------
Original Class: Servant
Encoded Label: 48
------------
Original Class: Day Laborer
Encoded Label: 17
------------
Original Class: Day Laborer
Encoded Label: 17
------------
Original Class: Laundress
Encoded Label: 36
------------
Original Class: Day Laborer
Encoded Label

Original Class: At School
Encoded Label: 1
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: Cook
Encoded Label: 15
------------
Original Class: Day Laborer
Encoded Label: 17
------------
Original Class: Day Laborer
Encoded Label: 17
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: Expressman
Encoded Label: 22
------------
Original Class: Janitor
Encoded Label: 33
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: At School
Encoded Label: 1
------------
Original Class: Cook
Encoded Label: 15
------------
Original Class: Drayman
Encoded Label: 19
------------
Original Class: Servant
Encoded Label: 48
------------
Original Class: Errand Boy
Encoded Label: 21
------

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=1, test_size=1/3)

In [9]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_train_categorical

Using TensorFlow backend.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
X_train.count()

age_at_last_birthday         324
relation_Adopted Daughter    324
relation_Boarder             324
relation_Brother             324
relation_Brother-in-law      324
relation_Cousin              324
relation_Daughter            324
relation_Daughter-in-law     324
relation_Father              324
relation_Granddaughter       324
relation_Grandson            324
relation_Head                324
relation_Lodger              324
relation_Nephew              324
relation_Niece               324
relation_Partner             324
relation_Roomer              324
relation_Servant             324
relation_Sister              324
relation_Sister-in-law       324
relation_Son                 324
relation_Son-in-law          324
relation_Step-Daughter       324
relation_Stepdaughter        324
relation_Stepson             324
relation_Wife                324
color_race_B                 324
color_race_W                 324
sex_F                        324
sex_M                        324
dtype: int

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [12]:
classifier.fit(X_train,y_train_categorical)



ValueError: bad input shape (324, 59)

In [169]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.5308641975308642
Testing Data Score: 0.5460122699386503
