In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

In [2]:
name_cols = []
with open('adult.names') as file:
    for f in file.readlines():
        if not str(f).startswith('|') and ':' in str(f):
            name_cols.append(str(f).split(':')[0])
name_cols.append('Salary')

In [3]:
df = pd.read_csv('adult.data', names=name_cols)

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [5]:
object_cols = []
for i, enum in enumerate(df.dtypes):
    if enum=='object':
        object_cols.append(i)

for i in range(len(object_cols)):
    object_cols[i] = df.dtypes.index[object_cols[i]]
    
def correct_names(name):
    if name.startswith(" ") or name.endswith(" "):
        return name.strip(" ")
    else:
        return name
    
for i in object_cols:
    df[i] = df[i].apply(correct_names)

In [7]:
object_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'Salary']

In [6]:
native_country_new = df[df['Salary']==">50K"]['native-country'].value_counts().index[:10]
def get_country(name):
    if name not in native_country_new or name=='?':
        return "Others"
    else:
        return name

df['native-country'] = df['native-country'].apply(get_country)

In [7]:
def change_edu_level(name):
    if name=="HS-grad":
        return "High School"
    elif name in ["Bachelors","Some-college"]:
        return "Bachelors"
    elif name in ["11th", "9th", "7th-8th", "5th-6th", "10th", "1st-4th", "12th", "Preschool", "compulsory"]:
        return "Compulsory"
    elif name in ["Assoc-acdm", "Assoc-voc"]:
        return "Associate"
    else:
        return name

df['education'] = df['education'].apply(change_edu_level)

In [10]:
df['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [11]:
df['marital-status'].value_counts()

Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: marital-status, dtype: int64

In [12]:
df['relationship'].value_counts()

Husband           13193
Not-in-family      8305
Own-child          5068
Unmarried          3446
Wife               1568
Other-relative      981
Name: relationship, dtype: int64

In [13]:
df[df['marital-status']=='Never-married']['relationship'].value_counts()

Not-in-family     4706
Own-child         4485
Unmarried          881
Other-relative     611
Name: relationship, dtype: int64

In [8]:
def get_mar(name):
    if name in ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']:
        return "Married"
    elif name in ['Divorced', 'Separated']:
        return "Divorced"
    else:
        return name

df['marital-status'] = df['marital-status'].apply(get_mar)

In [15]:
for i in object_cols:
    if '?' in df[i].value_counts():
        print(i)

workclass
occupation


In [9]:
def remove_qm(name):
    if name == '?':
        return 'Other-service'
    else:
        return name
    
df['workclass'] = df['workclass'].apply(remove_qm)
df['occupation'] = df['occupation'].apply(remove_qm)

In [17]:
df['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
Other-service        1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [10]:
def get_work(name):
    if name in ['Local-gov', 'State-gov', 'Federal-gov']:
        return 'Government'
    elif name in ['Without-pay', 'Never-worked']:
        return 'Other-service'
    else:
        return name

df['workclass'] = df['workclass'].apply(get_work)

In [20]:
df['occupation'].value_counts()

Other-service        5138
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [11]:
workclass_ = pd.get_dummies(df['workclass'])
marital_ = pd.get_dummies(df['marital-status'])
occupation_ = pd.get_dummies(df['occupation'])
relationship_ = pd.get_dummies(df['relationship'])
race_ = pd.get_dummies(df['race'])
sex_ = pd.get_dummies(df['sex'])
country_ = pd.get_dummies(df['native-country'])
edn_ = pd.get_dummies(df['education'])

In [12]:
X = pd.concat([df[['age','capital-gain', 'hours-per-week']], workclass_ , marital_, occupation_, relationship_, race_, sex_, edn_, country_ ], axis=1)
y = df['Salary']

In [24]:
X.shape

(32561, 56)

In [13]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_temp = X.copy()
X_temp['age'] = std.fit_transform(X_temp[['age']])

In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=15)
X_temp = pca.fit_transform(X_temp)

In [27]:
X_temp.shape

(32561, 15)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y, test_size=0.05, random_state=42)

In [29]:
from sklearn.ensemble import AdaBoostClassifier

In [20]:
ada = AdaBoostClassifier(n_estimators=200)

In [21]:
ada.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=200)

In [22]:
pred1_ = ada.predict(X_val)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [24]:
print(confusion_matrix(y_val, pred1_), "\n")
print(classification_report(y_val, pred1_), "\n")
print(accuracy_score(y_val, pred1_))

[[1105  125]
 [ 171  228]] 

              precision    recall  f1-score   support

       <=50K       0.87      0.90      0.88      1230
        >50K       0.65      0.57      0.61       399

    accuracy                           0.82      1629
   macro avg       0.76      0.73      0.74      1629
weighted avg       0.81      0.82      0.81      1629
 

0.8182934315531001


In [42]:
ada1 = AdaBoostClassifier(n_estimators=1000, learning_rate=0.1)
ada1.fit(X_train, y_train)
pred2 = ada1.predict(X_val)
print(confusion_matrix(y_val, pred2), "\n")
print(classification_report(y_val, pred2), "\n")
print(accuracy_score(y_val, pred2))

[[1111  119]
 [ 172  227]] 

              precision    recall  f1-score   support

       <=50K       0.87      0.90      0.88      1230
        >50K       0.66      0.57      0.61       399

    accuracy                           0.82      1629
   macro avg       0.76      0.74      0.75      1629
weighted avg       0.81      0.82      0.82      1629
 

0.8213627992633518


In [43]:
pred = ada1.predict(X_test)

In [44]:
print(confusion_matrix(y_test, pred), "\n")
print(classification_report(y_test, pred), "\n")
print(accuracy_score(y_test, pred))

[[7494  702]
 [1107 1443]] 

              precision    recall  f1-score   support

       <=50K       0.87      0.91      0.89      8196
        >50K       0.67      0.57      0.61      2550

    accuracy                           0.83     10746
   macro avg       0.77      0.74      0.75     10746
weighted avg       0.82      0.83      0.83     10746
 

0.8316582914572864


In [45]:
ada2 = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
ada2.fit(X_train, y_train)
pred3 = ada2.predict(X_val)
print(confusion_matrix(y_val, pred3), "\n")
print(classification_report(y_val, pred3), "\n")
print(accuracy_score(y_val, pred3))

[[1115  115]
 [ 181  218]] 

              precision    recall  f1-score   support

       <=50K       0.86      0.91      0.88      1230
        >50K       0.65      0.55      0.60       399

    accuracy                           0.82      1629
   macro avg       0.76      0.73      0.74      1629
weighted avg       0.81      0.82      0.81      1629
 

0.8182934315531001


In [34]:
ada1 = AdaBoostClassifier(n_estimators=1000)
ada1.fit(X_train, y_train)
pred2 = ada1.predict(X_val)

In [32]:
print(confusion_matrix(y_val, pred2), "\n")
print(classification_report(y_val, pred2), "\n")
print(accuracy_score(y_val, pred2))

[[1137   93]
 [ 161  238]] 

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      1230
        >50K       0.72      0.60      0.65       399

    accuracy                           0.84      1629
   macro avg       0.80      0.76      0.78      1629
weighted avg       0.84      0.84      0.84      1629
 

0.8440761203192142


In [35]:
print(confusion_matrix(y_val, pred2), "\n")
print(classification_report(y_val, pred2), "\n")
print(accuracy_score(y_val, pred2))

[[1124  106]
 [ 146  253]] 

              precision    recall  f1-score   support

       <=50K       0.89      0.91      0.90      1230
        >50K       0.70      0.63      0.67       399

    accuracy                           0.85      1629
   macro avg       0.79      0.77      0.78      1629
weighted avg       0.84      0.85      0.84      1629
 

0.8453038674033149


In [33]:
pred = ada1.predict(X_test)
print(confusion_matrix(y_test, pred), "\n")
print(classification_report(y_test, pred), "\n")
print(accuracy_score(y_test, pred))

[[7625  571]
 [1031 1519]] 

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      8196
        >50K       0.73      0.60      0.65      2550

    accuracy                           0.85     10746
   macro avg       0.80      0.76      0.78     10746
weighted avg       0.84      0.85      0.85     10746
 

0.8509212730318257


In [36]:
pred = ada1.predict(X_test)
print(confusion_matrix(y_test, pred), "\n")
print(classification_report(y_test, pred), "\n")
print(accuracy_score(y_test, pred))

[[7618  578]
 [ 915 1635]] 

              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      8196
        >50K       0.74      0.64      0.69      2550

    accuracy                           0.86     10746
   macro avg       0.82      0.79      0.80     10746
weighted avg       0.86      0.86      0.86     10746
 

0.8610645821701098


In [17]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=70)

In [34]:
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, n_estimators=200)

In [20]:
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc), "\n")
print(classification_report(y_test, pred_rfc), "\n")
print(accuracy_score(y_test, pred_rfc))

# 1000 = n_estim, max_depth= 15

[[7917  279]
 [ 560 1990]] 

              precision    recall  f1-score   support

       <=50K       0.93      0.97      0.95      8196
        >50K       0.88      0.78      0.83      2550

    accuracy                           0.92     10746
   macro avg       0.91      0.87      0.89     10746
weighted avg       0.92      0.92      0.92     10746
 

0.9219244369998139


In [23]:
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc), "\n")
print(classification_report(y_test, pred_rfc), "\n")
print(accuracy_score(y_test, pred_rfc))

# 5000 = n_estim, max_depth = 15

[[7915  281]
 [ 542 2008]] 

              precision    recall  f1-score   support

       <=50K       0.94      0.97      0.95      8196
        >50K       0.88      0.79      0.83      2550

    accuracy                           0.92     10746
   macro avg       0.91      0.88      0.89     10746
weighted avg       0.92      0.92      0.92     10746
 

0.9234133631118556


In [26]:
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc), "\n")
print(classification_report(y_test, pred_rfc), "\n")
print(accuracy_score(y_test, pred_rfc))

# 200 = n_estim, max_depth = 15

[[7907  289]
 [ 565 1985]] 

              precision    recall  f1-score   support

       <=50K       0.93      0.96      0.95      8196
        >50K       0.87      0.78      0.82      2550

    accuracy                           0.92     10746
   macro avg       0.90      0.87      0.89     10746
weighted avg       0.92      0.92      0.92     10746
 

0.9205285687697748


In [29]:
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc), "\n")
print(classification_report(y_test, pred_rfc), "\n")
print(accuracy_score(y_test, pred_rfc))

# 200 = n_estim, max_depth = 20

[[7948  248]
 [ 340 2210]] 

              precision    recall  f1-score   support

       <=50K       0.96      0.97      0.96      8196
        >50K       0.90      0.87      0.88      2550

    accuracy                           0.95     10746
   macro avg       0.93      0.92      0.92     10746
weighted avg       0.94      0.95      0.94     10746
 

0.9452819653824679


In [32]:
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc), "\n")
print(classification_report(y_test, pred_rfc), "\n")
print(accuracy_score(y_test, pred_rfc))

# 200 = n_estim, max_depth = 35

[[7954  242]
 [ 314 2236]] 

              precision    recall  f1-score   support

       <=50K       0.96      0.97      0.97      8196
        >50K       0.90      0.88      0.89      2550

    accuracy                           0.95     10746
   macro avg       0.93      0.92      0.93     10746
weighted avg       0.95      0.95      0.95     10746
 

0.9482598176065513


In [35]:
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc), "\n")
print(classification_report(y_test, pred_rfc), "\n")
print(accuracy_score(y_test, pred_rfc))

# 200 = n_estim, max_depth = 70

[[7966  230]
 [ 329 2221]] 

              precision    recall  f1-score   support

       <=50K       0.96      0.97      0.97      8196
        >50K       0.91      0.87      0.89      2550

    accuracy                           0.95     10746
   macro avg       0.93      0.92      0.93     10746
weighted avg       0.95      0.95      0.95     10746
 

0.9479806439605435
