In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, RandomForestRegressor
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


#loading the data from https://archive.ics.uci.edu/dataset/20/census+income
col_names = ['age', 'workclass', 'fnlwgt','education', 'education-num', 
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain','capital-loss', 'hours-per-week','native-country', 'income']
df = pd.read_csv('Predicting_income_data.csv', header=None, names = col_names)
for c in df.select_dtypes(include=['object']).columns:
    df[c] = df[c].str.strip()


In [4]:
# how is income distributed? 
income_values = df.income.value_counts(normalize = True)
income_values

income
<=50K    0.75919
>50K     0.24081
Name: proportion, dtype: float64

In [7]:
# defining x, y 
feature_cols = ['age',
       'capital-gain', 'capital-loss', 'hours-per-week', 'sex','race']

X = pd.get_dummies(df[feature_cols], drop_first = True)
X.head
y = np.where(df.income=='<=50K', 0, 1)
X.head

<bound method NDFrame.head of        age  capital-gain  capital-loss  hours-per-week  sex_Male  \
0       39          2174             0              40      True   
1       50             0             0              13      True   
2       38             0             0              40      True   
3       53             0             0              40      True   
4       28             0             0              40     False   
...    ...           ...           ...             ...       ...   
32556   27             0             0              38     False   
32557   40             0             0              40      True   
32558   58             0             0              40     False   
32559   22             0             0              20      True   
32560   52         15024             0              40     False   

       race_Asian-Pac-Islander  race_Black  race_Other  race_White  
0                        False       False       False        True  
1              

In [9]:
# splitting the data and applying the model 
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state = 1)
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc.predict(x_test)
print(f'Accuracy score for default random forest: {round(rfc.score(x_test, y_test)*100,3)}%')

Accuracy score for default random forest: 81.729%


In [11]:
accuracy_train=[]
accuracy_test = []


In [None]:
# what is the best max_depth for the model?
np.random.seed(0)

depths = range(1,26)
for i in depths:
    rfc = RandomForestClassifier(max_depth=i)
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    accuracy_test.append(accuracy_score(y_test, rfc.predict(x_test)))
    accuracy_train.append(accuracy_score(y_train, rfc.predict(x_train)))
    

In [14]:
best_acc= np.max(accuracy_test)
best_depth = depths[np.argmax(accuracy_test)]
print(f'The highest accuracy on the test is achieved when depth: {best_depth}')
print(f'The highest accuracy on the test set is: {round(best_acc*100,3)}%')

The highest accuracy on the test is achieved when depth: 12
The highest accuracy on the test set is: 83.464%


In [23]:
# fitting the best depth to the model 
best_rfc = RandomForestClassifier(max_depth=best_depth)
best_rfc.fit(x_train, y_train)


# what are the most relevant features?
feature_imp_df = pd.DataFrame(zip(x_train.columns, best_rfc.feature_importances_),  columns=['feature', 'importance'])
print('Top 5 random forest features:')
print(feature_imp_df.sort_values('importance', ascending=False).iloc[0:5])


Top 5 random forest features:
          feature  importance
1    capital-gain    0.370119
0             age    0.248969
3  hours-per-week    0.140480
2    capital-loss    0.140439
4        sex_Male    0.078258
