In [1]:
# Dependencies
import numpy as np
import pandas as pd

In [13]:
censuses_data = pd.read_csv("./CSV/censuses.csv")
# censuses_data = censuses_data[censuses_data["census_year"] == 1900]
censuses_data.count()

line_number               4090
house_number              4090
street_name               4090
last_name                 4090
given_name                4090
relation                  4090
color_race                4090
sex                       4090
age_at_last_birthday      4090
occupation                4090
own_or_rent               4090
owned_free_or_mortgage    4090
house_or_farm             4090
my_notes                  4090
file_name                 4090
census_year               4090
dtype: int64

In [15]:
# Separate out the occupation which are null/ none (LATER we can test our model again these cases)
occupation_missing_mask = (censuses_data["occupation"].str.contains('None')) | (censuses_data["occupation"].str.contains('Null'))
df_missing_occupation = censuses_data[occupation_missing_mask]

df_missing_occupation.count()

line_number               1389
house_number              1389
street_name               1389
last_name                 1389
given_name                1389
relation                  1389
color_race                1389
sex                       1389
age_at_last_birthday      1389
occupation                1389
own_or_rent               1389
owned_free_or_mortgage    1389
house_or_farm             1389
my_notes                  1389
file_name                 1389
census_year               1389
dtype: int64

In [89]:
# prepare new df where occupation column is All labelled
df_all_occupation = censuses_data[occupation_missing_mask == False]

# include following features that MAY influence occupation field -
# color_race, sex, age_at_last_birthday, occupation
newdata = df_all_occupation.loc[:,["color_race","sex","age_at_last_birthday", "occupation"]]
newdata.count()

color_race              2701
sex                     2701
age_at_last_birthday    2701
occupation              2701
dtype: int64

In [90]:
newdata.groupby(['occupation']).count().sort_values(by=['sex'])

Unnamed: 0_level_0,color_race,sex,age_at_last_birthday
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Laundress At Home,1,1,1
Nurse For Private Family,1,1,1
Office Clerk At Packing Company,1,1,1
Office Girl,1,1,1
Office Girl At Joshyneian,1,1,1
Office Girl At Lawyer,1,1,1
Office Girl At Physician,1,1,1
Office Girl At Wood Business,1,1,1
Oil Scout At Oil Company,1,1,1
Oiler At Garage,1,1,1


In [91]:
from fuzzywuzzy import fuzz, process

In [97]:
correct_occupations=["Agent","Cook","Barber","Baker","Beauty","Washwoman","Worker", "Wringer","Yard" ]
# replace Yard by Yardman

In [98]:
def fuzzy_match_occ(occupation):
#     print('occupation',occupation)
    if occupation in correct_occupations:  # might want to make this a dict for O(1) lookups
#         print('in correct occupation')
        return occupation, 100

    new_name, score = process.extractOne(occupation, correct_occupations)
#     print('new_name', new_name)
#     print('score',score)
    if score < 90:
#         print('less than 90 % match')
        return occupation, score
    else:
#         print('Good Match!!!!!!')
        return new_name, score   

In [99]:
newdata['corrected'], newdata['score'] = zip(*newdata['occupation'].apply(fuzzy_match_occ))
newdata.groupby(['corrected']).count()
# newdata.groupby(['occupation']).count().sort_values(by=['sex'])

Unnamed: 0_level_0,color_race,sex,age_at_last_birthday,occupation,score
corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Laundress At Home,1,1,1,1,1
Waitress At Restaurant,1,1,1,1,1
Abstracter At Loan And Investments,1,1,1,1,1
Accountant At Light And Power Company,1,1,1,1,1
Actor At Traveling Show,2,2,2,2,2
Actress At Theatrical Company,1,1,1,1,1
Agent,5,5,5,5,5
Assistant Warehouse At Refining Company,1,1,1,1,1
Assorter At Paper Factory,1,1,1,1,1
Asst. Manager At Theatre,1,1,1,1,1


In [None]:
def fuzzy_match_sort (match_name,choices,match_score):
    possibilities = process.extract(
        match_name,
        choices,
        limit=100,
        scorer = fuzz.token_sort_ratio)
    return [possible for possible in possibilities if possible[1] > match_score]

In [None]:
fuzzy_match_sort('Workwoman Home',newdata.occupation.values.tolist(),25)

In [6]:
# # Create dictionary to convert occupation into unique numbers
# occupation = {}
# counter = 1
# for occ in newdata["occupation"].unique():
#     occupation[occ] = counter
#     counter += 1
# print(occupation)

# newdata["occupation"].replace(occupation, inplace=True) # replace occupation text by number
# newdata.head()

In [7]:
def age_clean(data,under_1_value = 1, pandas = 'yes'):
    clean_age = []
    if pandas == 'yes':
        data_lst = data
    else:
        data_lst = data
    for datum in data_lst:
        if 'month' in str(datum):
            if 'and' in str(datum):
                clean_age.append(str(datum).split(' ')[0])
            else:
                clean_age.append(str(under_1_value))
        else:
            clean_age.append(datum)
    return clean_age

In [8]:
age = newdata["age_at_last_birthday"]
clean_age = age_clean(newdata['age_at_last_birthday'])
print(type(clean_age))
newdata["age_at_last_birthday"] = clean_age
newdata.count()

<class 'list'>


color_race              808
sex                     808
age_at_last_birthday    808
class_occ               808
dtype: int64

In [9]:
X = newdata.drop("class_occ", axis=1)
y = newdata["class_occ"]

print(X.shape, y.shape)

(808, 3) (808,)


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=1/2)
print(X_train.shape, y_train.shape)

(404, 3) (404,)


In [11]:
X_test.count()

color_race              404
sex                     404
age_at_last_birthday    404
dtype: int64

In [12]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train_ohe = ohe.fit_transform(X_train)
X_test_ohe = ohe.transform(X_test)

print(X_train_ohe.shape, X_test_ohe.shape)

(404, 75) (404, 75)


In [13]:
from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression()

# classifier.fit(X_train_ohe,y_train)

clf = LogisticRegression(random_state=0, solver='lbfgs',
                          multi_class='multinomial').fit(X_train_ohe,y_train)

print(f"Training Data Score: {clf.score(X_train_ohe, y_train)}")
print(f"Testing Data Score: {clf.score(X_test_ohe, y_test)}")

Training Data Score: 0.8465346534653465
Testing Data Score: 0.7623762376237624


In [20]:
for label, original_class in zip(X_train_ohe, X_train):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: color_race
Encoded Label: [1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
------------
Original Class: sex
Encoded Label: [1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
------------
Original Class: age_at_last_birthday
Encoded Label: [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
------------
