About the dataset:

https://archive.ics.uci.edu/ml/datasets/adult

In [1]:
#importing all libraries
import pandas as pd
import numpy as np

#metrics
#from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import scikitplot as skplt

#models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

In [2]:
# Show the current working directory (bare `pwd` relies on IPython automagic).
pwd

'/Users/Zbyszek/Python/Practical ML/$50k earnings'

In [None]:
# Load the training frame from an HDF5 file; the path is relative to the working directory.
train = pd.read_hdf('../input/train.adult.h5')

In [None]:
# Display 5 randomly drawn rows (no random_state is passed, so the rows differ between runs).
train.sample(5)

In [None]:
# Print the frame summary produced by DataFrame.info().
train.info()

In [None]:
# Cardinality report: one "name: distinct-count" line per column.
# Handy pandas helpers for this kind of inspection: unique / nunique / value_counts.
for col_name in train:
    distinct = train[col_name].nunique()
    print("{}: {}".format(col_name, distinct))

In [None]:
# Every column except 'Education-Num' is inspected below.
inspected = train.columns.drop('Education-Num')

# Pass 1: the distinct values found in each inspected column.
for col_name in inspected:
    print("{}: {}\n".format(col_name, train[col_name].unique()))

# Pass 2: frequency tables, restricted to columns with fewer than 20 distinct values.
low_cardinality = [col_name for col_name in inspected if train[col_name].nunique() < 20]
for col_name in low_cardinality:
    print("{}\n".format(train[col_name].value_counts()))

In [None]:
# Print a (column, null-count) tuple for every column that has at least one null.
for col_name in train:
    n_missing = train[col_name].isnull().sum()
    if n_missing != 0:
        print((col_name, n_missing))

In [None]:
# Replace every missing value, in every column, with the sentinel -1.
train = train.fillna(-1)

In [None]:
###### Feature exploration

# 'Target_cat' is added to `train` only by the label-encoding cell further down, so
# plotting it here would fail on a fresh kernel. Derive it on a separate frame (same
# pd.factorize encoding of 'Target') and leave `train` itself untouched.
explore = train.assign(Target_cat=pd.factorize(train['Target'])[0])


def plot_target_rate(data, x, hue=None, figsize=(15, 5), rotation=90):
    """Bar plot of 'Target_cat' aggregated per category of `x`.

    The aggregation is whatever seaborn's barplot applies by default.

    Parameters
    ----------
    data : DataFrame containing `x`, 'Target_cat' and, when given, `hue`.
    x : name of the column placed on the x axis.
    hue : optional name of a column used to split each group of bars.
    figsize : size of the new matplotlib figure.
    rotation : rotation of the x tick labels, in degrees.

    Returns
    -------
    The Axes returned by seaborn.
    """
    plt.figure(figsize=figsize)
    ax = sns.barplot(x=x, y="Target_cat", hue=hue, data=data)
    plt.xticks(rotation=rotation)
    return ax


def rotate_facet_labels(grid, rotation=90):
    """Rotate the x tick labels of every facet of a seaborn grid."""
    for ax in grid.axes.flatten():
        plt.sca(ax)
        plt.xticks(rotation=rotation)


train['Education-Num'].hist(figsize=(8,6))

train['Education'].value_counts().plot(kind='bar', figsize=(8,6))

# Education
plt.figure(figsize=(15, 5))
sns.countplot(x='Education', data=train);
plt.xticks(rotation=60)

# Education and sex
plot_target_rate(explore, "Education", hue='Sex')

# Race and sex
plot_target_rate(explore, "Race", hue='Sex')

# Country of origin and sex
plot_target_rate(explore, "Country", hue='Sex')

# Marital status and sex
plot_target_rate(explore, 'Martial Status', hue='Sex')

# Occupation and sex
plot_target_rate(explore, 'Occupation', hue='Sex')

# Other combinations, e.g. race instead of sex
plot_target_rate(explore, 'Occupation', hue='Race')

# One more visualisation tip: the chart can be split into separate panels.
# catplot is figure-level and builds its own figure, so no plt.figure() call
# precedes it (that call would only leave an empty figure behind).
g = sns.catplot(x="Occupation", y="Target_cat", col="Sex", data=explore, kind="bar")
rotate_facet_labels(g)

g = sns.catplot(x="Occupation", y="Target_cat", hue="Sex", col='Race', col_wrap=3, data=explore, kind="bar")
rotate_facet_labels(g)

# Age distribution per target class, restricted to rows where Sex == 0.
plt.figure(figsize=(15, 5))
sns.countplot(x='Age', hue='Target', data=train[train['Sex']==0])
plt.xticks(rotation=90);

In [None]:
# Creating new features (every derived column is added to `train`)

# Capital balance and simple boolean flags.
train['Net_Capital'] = train['Capital Gain'] - train['Capital Loss']
train['White_Man'] = (train['Race']=='White') & (train['Sex']==False)
train['High_Earner'] = train['Occupation'].isin(['Exec-managerial', 'Prof-specialty'])
train['United States'] = train['Country']== 'United-States'
train['Husb_Wife'] = train['Relationship'].isin(['Husband', 'Wife'])
# String concatenation of the two categorical columns.
train['rel_race'] = train['Relationship'] + train['Race']

# Weekly hours split into three mutually exclusive flags around the 40-hour mark.
train['Hard_worker'] = train['Hours per week']>40
train['Normal_worker'] = train['Hours per week']==40
train['Part_time_worker'] = train['Hours per week']<40

# Education groups. 'Prof-school' belongs to both High_educ and Associate, as before.
train['High_educ'] = train['Education'].isin(
    ['Bachelors', 'Prof-school', 'Masters', 'Doctorate'])

train['Associate'] = train['Education'].isin(
    ['Assoc-voc', 'Assoc-acdm', 'Prof-school'])

train['High_school'] = train['Education'].isin(['HS-grad', 'Some-college'])

# '10th' was previously written with a trailing space ('10th '), unlike every other
# literal in this cell, so those rows could not match.
train['Low_educ'] = train['Education'].isin(
    ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'])

# Coarser marital-status groups; any value not listed here (e.g. 'Widowed') is kept as is.
marriage_groups = {
    "Never-married": "Never-Married",
    "Married-AF-spouse": "Married",
    "Married-civ-spouse": "Married",
    "Married-spouse-absent": "Married",
    "Separated": "Not-Married",
    "Divorced": "Not-Married",
}
train['Marriage'] = train['Martial Status'].map(
    lambda status: marriage_groups.get(status, status))

In [None]:
# Changing labels: integer-encode every object-dtype column into a new '<name>_cat' column.
# The 'object' string alias replaces `np.object`, which was removed in NumPy 1.24.
cat_feats = train.select_dtypes(include=['object']).columns

for cat_feat in cat_feats:
    # pd.factorize returns (codes, uniques); only the integer codes are stored.
    train['{0}_cat'.format(cat_feat)] = pd.factorize(train[cat_feat])[0]

In [None]:
# Candidate features: every non-object column except the encoded target.
# Fixes the unbalanced parenthesis and the column name ('Target_cat', not 'Target_Cat'),
# and uses the 'object' alias because `np.object` was removed in NumPy 1.24.
feats_all = train.select_dtypes(exclude=['object']).drop(columns=['Target_cat']).columns

In [None]:
# Benchmark model: a depth-10 decision tree scored with 5-fold cross-validation.
np.random.seed(2019)

numeric_cols = ['Age', 'fnlwgt', 'Education-Num', 'Sex',
                'Capital Gain', 'Capital Loss', 'Hours per week']
encoded_cols = ['Workclass_cat', 'Martial Status_cat', 'Occupation_cat',
                'Relationship_cat', 'Race_cat', 'Country_cat']
feats = numeric_cols + encoded_cols

X, y = train[feats].values, train['Target_cat'].values

model = DecisionTreeClassifier(random_state=2019, max_depth=10)
scores = cross_val_score(model, X, y, cv=5)

# Report the mean and the standard deviation of the fold scores, then the learning curve.
print("Mean Accuracy: ", scores.mean(), "Mean Std: ", scores.std())
skplt.estimators.plot_learning_curve(model, X, y, cv=5)

In [None]:
# Benchmark model: a 100-tree random forest (max depth 10) scored with 5-fold cross-validation.
np.random.seed(2019)

feats = [
    'Age', 'fnlwgt', 'Education-Num', 'Sex',
    'Capital Gain', 'Capital Loss', 'Hours per week',
    'Workclass_cat', 'Martial Status_cat',
    'Occupation_cat', 'Relationship_cat', 'Race_cat',
    'Country_cat',
]

X = train[feats].values
y = train['Target_cat'].values

forest_params = dict(max_depth=10, n_estimators=100, random_state=2019)
model = RandomForestClassifier(**forest_params)
scores = cross_val_score(model, X, y, cv=5)

# Mean and standard deviation of the fold scores, followed by the learning curve.
print("Mean Accuracy: ", scores.mean(), "Mean Std: ", scores.std())
skplt.estimators.plot_learning_curve(model, X, y, cv=5)

In [None]:
#Checking Decision TreeClassifier

In [None]:
# Column listing (a bare expression in the middle of a cell, so nothing is displayed).
train.columns

# 'Target_cat' per occupation, all rows pooled.
occupation_fig = plt.figure(figsize=(15, 5))
sns.barplot(data=train, x="Occupation", y="Target_cat")
plt.xticks(rotation=90);

###### Validating the model

X = train[feats_all].values
y = train['Target_cat'].values

# Sweep max_depth over 1..20 for a 100-tree forest and record the mean 5-fold score.
np.random.seed(2019)
depths = range(1, 21)
agg_scores=[]
for i in depths:
    # An explicit random_state (as in the other sweeps of this notebook) keeps the forest
    # reproducible without depending on the global NumPy RNG state.
    model = RandomForestClassifier(max_depth=i, n_estimators=100, random_state=2019)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
    print("Depth",i , ": ", np.mean(scores), np.std(scores))
    agg_scores.append(np.mean(scores))

# Plot against the actual depth values rather than the list index (which started at 0).
plt.plot(depths, agg_scores)
plt.xlabel("max_depth")
plt.ylabel("mean CV accuracy")


# Grid over forest size (outer loop) and tree depth (inner loop); the mean 5-fold score
# of each combination is appended to agg_scores in visiting order.
agg_scores = []
check_depth = range(3, 14)
check_estimators = [150, 200, 250, 300]
for n_trees in check_estimators:
    print("N: ", n_trees, "\n")
    for depth in check_depth:
        np.random.seed(2019)
        model = RandomForestClassifier(n_estimators=n_trees, max_depth=depth, random_state=2019)
        scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
        mean_score, score_std = scores.mean(), scores.std()
        print("Depth", depth, ": ", mean_score, score_std)
        agg_scores.append(mean_score)

# Finer grid: depths 11..14 for forests of 20, 30, ..., 120 trees.
agg_scores=[]
check_depth = range(11, 15)
check_estimators = range(20, 130, 10)
for j in check_estimators:
    print("N: ", j)
    for i in check_depth:
        np.random.seed(2019)
        model = RandomForestClassifier(max_depth=i, n_estimators=j, random_state=2019)
        scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
        print("Depth",i , ": ", np.mean(scores), np.std(scores))
        agg_scores.append(np.mean(scores))

# agg_scores holds len(check_estimators) * len(check_depth) = 44 values, so plotting it
# against range(10, 25) (15 values) raised a shape-mismatch error. Reshape to
# (forest size, depth) and draw one line per depth against the number of trees.
score_grid = np.array(agg_scores).reshape(len(check_estimators), len(check_depth))
for col, depth in enumerate(check_depth):
    plt.plot(list(check_estimators), score_grid[:, col], label="max_depth={}".format(depth))
plt.xlabel("n_estimators")
plt.ylabel("mean CV accuracy")
plt.legend();

def draw_feature_importances(model, features):
    """Plot a fitted model's feature importances as a bar chart, largest first.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`.
    features : sequence of feature names, indexable by position and in the same
        order as the columns the model was fitted on.

    The chart is sized from the model's own importances, so the function no longer
    depends on a global `X` that happens to match the model.
    """
    importances = np.asarray(model.feature_importances_)
    order = np.argsort(importances)[::-1]  # indices sorted by descending importance
    positions = range(len(importances))

    plt.figure(figsize=(10, 5))
    plt.title("Feature importances")
    plt.bar(positions, importances[order],
           color="b", align="center")
    plt.xticks(positions, [features[idx] for idx in order])
    plt.xticks(rotation=90)
    plt.xlim([-1, len(importances)])
    plt.show()

# Fit a 60-tree, depth-13 forest on X / y and chart its feature importances.
model = RandomForestClassifier(n_estimators=60, max_depth=13, random_state=2019).fit(X, y)
draw_feature_importances(model, feats_all)

# The original dataset columns (numeric ones plus label-encoded categoricals).
ori_feats = ['Age', 'fnlwgt', 'Education-Num', 'Sex', 
         'Capital Gain', 'Capital Loss', 'Hours per week', 
         'Workclass_cat', 'Martial Status_cat', 
         'Occupation_cat', 'Relationship_cat', 'Race_cat', 
         'Country_cat']

# Original columns plus engineered ones. 'Lazy_worker' is never created in this
# notebook; the hours-below-40 flag is stored as 'Part_time_worker', so that name
# is used here to avoid a KeyError.
# Left out on purpose: 'High_school', 'United States', 'Normal_worker'.
feats_all = ['Age', 'fnlwgt', 'Education-Num', 'Sex',
             'Capital Gain', 'Capital Loss', 'Hours per week',
             'Net_Capital', 'White_Man', 'High_Earner',
             'Hard_worker', 'Part_time_worker', 'High_educ',
             'Associate', 'Low_educ', 'Workclass_cat', 
             'Martial Status_cat', 'Occupation_cat', 'Relationship_cat', 
             'Race_cat', 'Country_cat','rel_race_cat', 'Marriage_cat']

# A reduced selection. The engineered columns are named with underscores
# ('Net_Capital', 'White_Man'); the space-separated spellings do not exist in `train`.
feats_sel = ['Age', 'fnlwgt', 'Education-Num', 'Sex',
             'Hours per week', 'Net_Capital', 'White_Man',
             'United States', 'High_educ', 'Associate', 'High_school', 
             'Hard_worker', 'Normal_worker', 'Workclass_cat',
             'Occupation_cat', 'Country_cat', 'Marriage_cat', 'rel_race_cat']

# Evaluate one forest configuration on the `feats_all` feature list.
np.random.seed(2019)

# Feature matrix and encoded target as NumPy arrays.
X = train[feats_all].values
y = train['Target_cat'].values

# Depth range for the sweep below; the sweep itself is disabled (wrapped in a string
# literal), so check_depth is not used by any live code in this span.
check_depth = range(10, 20)
'''
for i in check_depth:
    model = RandomForestClassifier(max_depth=i, n_estimators=100, random_state=2019)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
    print("Depth",i , ": ", np.mean(scores), np.std(scores))
    agg_scores.append(np.mean(scores))
'''
# 50 trees, max depth 14, scored with 5-fold cross-validation on all cores.
model = RandomForestClassifier(max_depth=14, n_estimators=50, random_state=2019)
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)  
# Mean and standard deviation of the fold scores.
print("Mean Accuracy: ", np.mean(scores), "Mean Std: ", np.std(scores))
# Learning curve evaluated at a single training-set size of 26048 samples.
skplt.estimators.plot_learning_curve(model, X, y, cv=5, train_sizes=(26048,))

In [None]:
def report_forest_cv(feats, max_depth, n_estimators, seed=2019):
    """Cross-validate a random forest on `train[feats]` and plot its learning curve.

    Parameters
    ----------
    feats : list of column names of `train` used as features.
    max_depth : maximum tree depth passed to RandomForestClassifier.
    n_estimators : number of trees passed to RandomForestClassifier.
    seed : value used for both np.random.seed and the forest's random_state.

    Prints the mean and standard deviation of the 5 fold scores and returns the
    array of fold scores.
    """
    np.random.seed(seed)
    X = train[feats].values
    y = train['Target_cat'].values

    model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=seed)
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)

    print("Mean Accuracy:", np.mean(scores), "Mean Std:", np.std(scores))
    skplt.estimators.plot_learning_curve(model, X, y, cv=5)
    return scores


# NOTE: the lists below use the column names that actually exist in `train`:
# 'Part_time_worker' (the hours-below-40 flag; 'Lazy_worker' is never created) and
# 'Net_Capital' (not 'Net Capital'). The old spellings raised KeyError.

#Best Score so far
#Mean Accuracy: 0.8629036342359697 Mean Std: 0.002837446246797028
report_forest_cv(
    ['Age', 'fnlwgt', 'Education-Num', 'Sex',
     'Capital Gain', 'Capital Loss', 'Hours per week',
     'Net_Capital', 'White_Man', 'High_Earner',
     'Hard_worker', 'Normal_worker', 'Part_time_worker', 'High_educ',
     'Associate', 'High_school', 'Low_educ', 'Workclass_cat',
     'Martial Status_cat', 'Occupation_cat',
     'Relationship_cat', 'Race_cat', 'Country_cat','rel_race_cat', 'Marriage_cat'],
    max_depth=13, n_estimators=60)

#Previous Best Score
#Mean Accuracy: 0.8624737059617299 Mean Std: 0.003252705277062589
report_forest_cv(
    ['Age', 'fnlwgt', 'Education-Num', 'Sex',
     'Capital Gain', 'Capital Loss', 'Hours per week',
     'Net_Capital', 'White_Man', 'High_Earner', 'United States',
     'Hard_worker', 'Normal_worker', 'Part_time_worker', 'High_educ',
     'Associate', 'High_school', 'Low_educ', 'Workclass_cat',
     'Martial Status_cat', 'Occupation_cat',
     'Relationship_cat', 'Race_cat', 'Country_cat','rel_race_cat', 'Marriage_cat'],
    max_depth=13, n_estimators=100)

#Previous best score
#Mean Accuracy: 0.8618287616790612 Mean Std: 0.003625452310510787
report_forest_cv(
    ['Age', 'fnlwgt', 'Education-Num', 'Sex',
     'Net_Capital', 'Hours per week',
     'Workclass_cat', 'Martial Status_cat',
     'Occupation_cat', 'Relationship_cat', 'Race_cat',
     'United States'],
    max_depth=13, n_estimators=100)

#Previous best score
#Mean Accuracy: 0.8610916656575339 Mean Std: 0.0031413965841823946
report_forest_cv(
    ['Age', 'fnlwgt', 'Education-Num', 'Sex',
     'Capital Gain', 'Capital Loss', 'Hours per week',
     'Workclass_cat', 'Martial Status_cat',
     'Occupation_cat', 'Relationship_cat', 'Race_cat',
     'Country_cat'],
    max_depth=13, n_estimators=100)

# Earlier recorded result for this configuration:
# Mean Accuracy:  0.858112611480875 Mean Std:  0.0029497832108238203
np.random.seed(2019)

numeric_cols = ['Age', 'fnlwgt', 'Education-Num', 'Sex',
                'Capital Gain', 'Capital Loss', 'Hours per week']
encoded_cols = ['Workclass_cat', 'Martial Status_cat', 'Occupation_cat',
                'Relationship_cat', 'Race_cat', 'Country_cat']
feats = numeric_cols + encoded_cols

X, y = train[feats].values, train['Target_cat'].values

# 100 trees, max depth 10, 5-fold cross-validation.
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=2019)
scores = cross_val_score(model, X, y, cv=5)

print("Mean Accuracy: ", scores.mean(), "Mean Std: ", scores.std())
skplt.estimators.plot_learning_curve(model, X, y, cv=5)

In [None]:
# Depth-10, 100-tree forest on the numeric and label-encoded columns listed below.
feats = [
    'Age', 'fnlwgt', 'Education-Num', 'Sex',
    'Capital Gain', 'Capital Loss', 'Hours per week',
    'Workclass_cat', 'Martial Status_cat',
    'Occupation_cat', 'Relationship_cat', 'Race_cat',
    'Country_cat',
]

X = train[feats].values
y = train['Target_cat'].values

forest_params = dict(max_depth=10, n_estimators=100, random_state=2019)
model = RandomForestClassifier(**forest_params)
scores = cross_val_score(model, X, y, cv=5)

# Mean and standard deviation of the 5 fold scores, then the learning curve.
print("Mean Accuracy: ", scores.mean(), "Mean Std: ", scores.std())
skplt.estimators.plot_learning_curve(model, X, y, cv=5)