In [1]:
path = "https://raw.githubusercontent.com/shobhit-nigam/knowledgeclan/main/datasets/income.csv"

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
dfa = pd.read_csv(path)

In [4]:
str_cols = ['workclass' , 'education',
  'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'high_income']

In [5]:
for name in str_cols:
    col = pd.Categorical(dfa[name])
    dfa[name] = col.codes

In [6]:
list_cols = list(dfa.columns)
print(list_cols)

['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'high_income']


In [7]:
list_cols = ['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'native_country']


In [8]:
dfa.sample(frac=1, random_state =1)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
9646,62,6,26911,5,4,6,8,1,4,0,0,0,66,39,0
709,18,4,208103,1,7,4,8,2,4,1,0,0,25,39,0
7385,25,4,102476,9,13,4,5,3,4,1,27828,0,50,39,1
16671,33,4,511517,11,9,2,10,0,4,1,0,0,40,39,0
21932,36,4,292570,1,7,4,7,4,4,0,0,0,40,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32511,25,2,514716,9,13,4,1,3,2,0,0,0,40,39,0
5192,32,4,207668,9,13,2,4,0,4,1,15024,0,45,39,1
12172,27,4,104457,9,13,4,7,1,1,1,0,0,40,0,0
235,59,7,268700,11,9,2,8,0,4,1,0,0,40,39,0


In [9]:
train_max_row = math.floor(dfa.shape[0] *0.8)
train = dfa.iloc[:train_max_row]
test = dfa.iloc[train_max_row:]

In [11]:
model_a = DecisionTreeClassifier(random_state = 1, min_samples_leaf = 2)
model_a.fit(train[list_cols], train['high_income'])

test_predictions_a = model_a.predict(test[list_cols])
test_auc_a = roc_auc_score(test["high_income"], test_predictions_a)

print(test_auc_a)

0.6948258446977408


In [12]:
model_b = DecisionTreeClassifier(random_state = 1, max_depth = 5)
model_b.fit(train[list_cols], train['high_income'])

test_predictions_b = model_b.predict(test[list_cols])
test_auc_b = roc_auc_score(test["high_income"], test_predictions_b)

print(test_auc_b)


0.6837000050885407


In [13]:
# Ensembling

In [14]:
model_a = DecisionTreeClassifier(random_state = 1, min_samples_leaf = 2)
model_a.fit(train[list_cols], train['high_income'])

test_predictions_a = model_a.predict_proba(test[list_cols])


In [15]:
model_b = DecisionTreeClassifier(random_state = 1, max_depth = 5)
model_b.fit(train[list_cols], train['high_income'])

test_predictions_b = model_b.predict_proba(test[list_cols])



In [16]:
test_predictions_a

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667]])

In [17]:
test_predictions_b

array([[0.98861566, 0.01138434],
       [0.98861566, 0.01138434],
       [0.71210762, 0.28789238],
       ...,
       [0.94363891, 0.05636109],
       [0.98861566, 0.01138434],
       [0.58131488, 0.41868512]])

In [18]:
combination = np.round((test_predictions_a[:, 1] + test_predictions_b[:, 1])/2)

In [19]:
combination

array([0., 0., 0., ..., 0., 0., 1.])

In [20]:
combination_auc = roc_auc_score(test["high_income"], combination)

In [21]:
combination_auc

0.7232501144921636

In [22]:
# randomness in random forest:
# Bagging
# Random feature subset. 

In [23]:
# Bagging
# put random samlpe of data in a bag

In [24]:
tree_count = 10
bag_ratio = 6/10   #(60 % of the original rows)
predicitons = []

for i in range(tree_count):
    bag = train.sample(frac=bag_ratio, replace = True, random_state = i)
    
    model = DecisionTreeClassifier(min_samples_leaf = 2, random_state = 1)
    model.fit(bag[list_cols], bag["high_income"])
    
    predicitons.append(model.predict_proba(test[list_cols])[:, 1])
    combination = np.round((np.sum(predicitons, axis = 0))/(len(predicitons)))

print(roc_auc_score(test["high_income"], combination))

0.7391948020557705


In [25]:
predicitons[0]

array([0., 0., 0., ..., 0., 0., 0.])