Decision Tree
====

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import time
import memory_profiler

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix


# adding header 
headerList = ['age', 'workclass', 'fnlwgt', 'education', 'education-num' , 'marital-status','occupation', 'relationship' , 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','label' ] 

data=pd.read_csv('adult.data',names=headerList,sep=',',skipinitialspace=True)
print(data)

       age         workclass  fnlwgt   education  education-num  \
0       39         State-gov   77516   Bachelors             13   
1       50  Self-emp-not-inc   83311   Bachelors             13   
2       38           Private  215646     HS-grad              9   
3       53           Private  234721        11th              7   
4       28           Private  338409   Bachelors             13   
...    ...               ...     ...         ...            ...   
32556   27           Private  257302  Assoc-acdm             12   
32557   40           Private  154374     HS-grad              9   
32558   58           Private  151910     HS-grad              9   
32559   22           Private  201490     HS-grad              9   
32560   52      Self-emp-inc  287927     HS-grad              9   

           marital-status         occupation   relationship   race     sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-manag

#DATA preprocessing
==================================================================================================================================

In [2]:
#Remove duplicate data 
data = data.drop_duplicates()

In [3]:
#change value '?' to mode value of that column
data[data == '?'] = np.nan
colWithMissingVal = ['workclass', 'occupation', 'native-country']

for column in colWithMissingVal:
    data[column].fillna(data[column].mode()[0], inplace=True)

In [4]:
#Cap outliers : cap age to 99 percentile
p_age = np.percentile(data['age'], 99)
print(p_age)

data['age'] = np.where(data['age'] > np.percentile(data['age'], 99), np.percentile(data['age'], 99), data['age'])

74.0


In [5]:
#Trim outliers: for 'education-num' and 'hours-per-week' column: only keep rows in dataframe with all z-scores less than absolute value of 3 

data = data[(np.abs(stats.zscore(data['education-num'])) < 3)]
data = data[(np.abs(stats.zscore(data['hours-per-week'])) < 3)]

In [6]:
#Convert label(income) column :'<=50K': 0, '>50K': 1}
data['label'] = data['label'].map({'<=50K': 0, '>50K': 1})

# Convert age and education.num to integer data type
data['age'] = data['age'].astype(int)
data['education-num'] = data['education-num'].astype(int)
data['hours-per-week'] = data['hours-per-week'].astype(int)

In [7]:
#for categorical Data, use LabelEncoder to transform these non-numerical labels.
categoricalData = ['workclass','marital-status', 'occupation', 'relationship', 'race', 'sex']
label_encoder = LabelEncoder()

for i in categoricalData:
    data[i] = label_encoder.fit_transform(data[i])

In [8]:
# Separate features (X) and target variable (y)
X = data.drop(['label',  'education','native-country'], axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Building Decision Tree
==

In [9]:
# Create a Decision Tree Classifier with hyperparameter tuning

#load memory_profiler to calculate memory usage
%load_ext memory_profiler  
#%reload_ext memory_profiler

#state parameters grid: input values for each hyperparameter to be test
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

#=======================================================================================

#record start time
start_time = time.time()

%memit clf = DecisionTreeClassifier(random_state=42)
%memit grid_search = GridSearchCV(clf, param_grid, cv=5)
%memit grid_search.fit(X_train, y_train)

#time taken performing tuning
tuning_time = time.time()

%memit best_clf = grid_search.best_estimator_
%memit best_clf.fit(X_train, y_train)

#record end time
end_time = time.time()

#=======================================================================================

total_tuning_time = tuning_time - start_time #Calculate time for hyperparameter tuning
model_training_time = end_time - tuning_time #Calculate time for train the model 
total_time = end_time - start_time  # Calculate total time

print('=======================================================================================')
print("Total time taken to tune the hyperparameter: {:0.3f} seconds".format(total_tuning_time))
print("Total time taken to train the model: {:0.3f} seconds".format(model_training_time))
print("Total time taken to build the Decision Tree model: {:0.3f} seconds".format(total_time))

print("Hyperparameter used: {}".format(grid_search.best_params_))


peak memory: 211.00 MiB, increment: 0.07 MiB
peak memory: 211.00 MiB, increment: 0.00 MiB
peak memory: 216.58 MiB, increment: 5.58 MiB
peak memory: 214.80 MiB, increment: 0.00 MiB
peak memory: 214.80 MiB, increment: 0.00 MiB
Total time taken to tune the hyperparameter: 37.678 seconds
Total time taken to train the model: 2.114 seconds
Total time taken to build the Decision Tree model: 39.791 seconds
Hyperparameter used: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [10]:
y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

(0.8474204171240395,
 0.7065693430656934,
 0.6289798570500325,
 0.6655207975249225)

In [11]:
# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      4838
           1       0.71      0.63      0.67      1539

    accuracy                           0.85      6377
   macro avg       0.80      0.77      0.78      6377
weighted avg       0.84      0.85      0.84      6377



#Test using adult.test dataset
=

In [12]:
testData=pd.read_csv('adult.test',names=headerList,sep=',',skipinitialspace=True)
#testData.info()
print(testData)

testData = testData.iloc[1: , :]
print(testData)

                        age     workclass    fnlwgt     education  \
0      |1x3 Cross validator           NaN       NaN           NaN   
1                        25       Private  226802.0          11th   
2                        38       Private   89814.0       HS-grad   
3                        28     Local-gov  336951.0    Assoc-acdm   
4                        44       Private  160323.0  Some-college   
...                     ...           ...       ...           ...   
16277                    39       Private  215419.0     Bachelors   
16278                    64             ?  321403.0       HS-grad   
16279                    38       Private  374983.0     Bachelors   
16280                    44       Private   83891.0     Bachelors   
16281                    35  Self-emp-inc  182148.0     Bachelors   

       education-num      marital-status         occupation    relationship  \
0                NaN                 NaN                NaN             NaN   
1            

In [13]:
testData['label'] = testData['label'].map({'<=50K.': 0, '>50K.': 1})

# Convert age and education.num to integer data type
testData['age'] = testData['age'].astype(int)
testData['education-num'] = testData['education-num'].astype(int)
testData['hours-per-week'] = testData['hours-per-week'].astype(int)

In [14]:
categoricalData = ['workclass','marital-status', 'occupation', 'relationship', 'race', 'sex']
label_encoder = LabelEncoder()

for i in categoricalData:
    testData[i] = label_encoder.fit_transform(testData[i])

print(testData)

       age  workclass    fnlwgt     education  education-num  marital-status  \
1       25          4  226802.0          11th              7               4   
2       38          4   89814.0       HS-grad              9               2   
3       28          2  336951.0    Assoc-acdm             12               2   
4       44          4  160323.0  Some-college             10               2   
5       18          0  103497.0  Some-college             10               4   
...    ...        ...       ...           ...            ...             ...   
16277   39          4  215419.0     Bachelors             13               0   
16278   64          0  321403.0       HS-grad              9               6   
16279   38          4  374983.0     Bachelors             13               2   
16280   44          4   83891.0     Bachelors             13               0   
16281   35          5  182148.0     Bachelors             13               2   

       occupation  relationship  race  

In [15]:
# Select features of test set
test_dataset_log = testData.drop(['label', 'education', 'native-country'], axis=1)
test_dataset_result = testData['label']

# Using trained SMV model to predict outcome
y_pred_test = best_clf.predict(test_dataset_log)

In [16]:
# Evaluate the model
accuracy = accuracy_score(test_dataset_result, y_pred_test)
precision = precision_score(test_dataset_result, y_pred_test)
recall = recall_score(test_dataset_result, y_pred_test)
f1 = f1_score(test_dataset_result, y_pred_test)

accuracy, precision, recall, f1

(0.8514219028315214, 0.7102858826996759, 0.6266250650026001, 0.665837822903716)