In [9]:
# Setup the codespace
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from termcolor import colored

print(colored('\nAll libraries imported succesfully.', 'green'))

[32m
All libraries imported succesfully.[0m


In [10]:
# import datasets
test_data30 = pd.read_excel('test_data.xlsx')
train_data_control = pd.read_excel('train_data_control.xlsx')
kNN_imputed_10 = pd.read_csv('kNN_imputed_10.csv')
kNN_imputed_40 = pd.read_csv('kNN_imputed_40.csv')
kNN_imputed_70 = pd.read_csv('kNN_imputed_70.csv')

datasets = {
'kNN_imputed_10' : kNN_imputed_10,
'kNN_imputed_40' : kNN_imputed_40,
'kNN_imputed_70' : kNN_imputed_70,
}

print(colored('\nAll datasets imported succesfully.', 'green'))


[32m
All datasets imported succesfully.[0m


In [11]:
#reverse the one hot encoding of the target variable
for dataset_name, dataset in datasets.items():
    # print(dataset_name)
    # display(dataset)
    # map value depending on conditional statement
    dataset['target'] = dataset['target_ <=50K'].map(lambda x: '<=50K' if x == 1.0 else '>50K')
    # Drop the dummy columns
    dataset.drop(columns=['target_ <=50K', 'target_ >50K'], inplace=True)
    # print(dataset_name)
    # display(dataset)

datasets['train_data_control'] = train_data_control
# print(datasets)

print(colored('\nEncoding succesfully reversed.', 'green'))

[32m
Encoding succesfully reversed.[0m


In [19]:
# Functions
def kNN_class(train_data, test_data):
    evaluation = {}
    #train data
    features = [col for col in train_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_train = train_data[features].values.tolist()
    y_train = train_data['target'].values.tolist()
    y_train = [value.strip() for value in y_train]

    #test data
    features = [col for col in test_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_test = test_data[features].values.tolist()
    y_test = test_data['target'].values.tolist()
    y_test = [value.strip() for value in y_test]

    # Determine optimal k
    k_values = [i for i in range (2,10)]
    max_score = 0
    opt_k = 2

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, x_train, y_train, cv=opt_k)
        mean_score = np.mean(score)

        if mean_score > max_score:
            max_score = mean_score
            opt_k = k

    #Set and fit model
    knn = KNeighborsClassifier(n_neighbors=opt_k)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    f1 = f1_score(y_test, y_pred,average='weighted')

    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

def DecisionTree_class(train_data, test_data):
    evaluation = {}
    #train data
    features = [col for col in train_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_train = train_data[features].values.tolist()
    y_train = train_data['target'].values.tolist()
    y_train = [value.strip() for value in y_train]
    
    #test data
    features = [col for col in test_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_test = test_data[features].values.tolist()
    y_test = test_data['target'].values.tolist()
    y_test = [value.strip() for value in y_test]

    # Determine optimal tree depth
    depth_values = [i for i in range(2, 20)]
    max_score = 0
    optimal_depth = 2

    for depth in depth_values:
        # print(f'depth: {depth}')
        clf = DecisionTreeClassifier(max_depth=depth)
        score = cross_val_score(clf, x_train, y_train, cv=10)
        # print(f'score: {score}')
        mean_score = np.mean(score)
        # print(f'mean score: {mean_score}')

        if mean_score > max_score:
            max_score = mean_score
            # print(f'max score: {max_score}')
            optimal_depth = depth
            # print(f'opt depth: {optimal_depth}')
    
    # print(f'final depth: {optimal_depth}')

    # final modelling
    dt_clf = DecisionTreeClassifier()
    dt_clf = clf.fit(x_train,y_train)
    y_pred = dt_clf.predict(x_test)

    # performance evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    f1 = f1_score(y_test, y_pred,average='weighted')
 
    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
        # 'rocauc': rocauc
    }
    return evaluation

print(colored('\nFunctions sucsesfully created.', 'green'))

[32m
Functions sucsesfully created.[0m


In [17]:
# Test DT depth searcher
DecisionTree_class(train_data_control,test_data30)


depth: 2
score: [0.80394737 0.79912281 0.79552435 0.79991224 0.79815709 0.80035103
 0.8095656  0.79991224 0.79640193 0.8016674 ]
mean score: 0.8004562057843161
max score: 0.8004562057843161
opt depth: 2
depth: 3
score: [0.80789474 0.80921053 0.79727951 0.80824923 0.79815709 0.81790259
 0.80210619 0.80824923 0.79201404 0.80122861]
mean score: 0.8042291748458466
max score: 0.8042291748458466
opt depth: 3
depth: 4
score: [0.82631579 0.81359649 0.81000439 0.81570864 0.81526986 0.8174638
 0.82272927 0.81790259 0.80912681 0.81439228]
mean score: 0.8162509911241465
max score: 0.8162509911241465
opt depth: 4
depth: 5
score: [0.83859649 0.83114035 0.82843352 0.82667837 0.82755595 0.84159719
 0.83194384 0.83282141 0.81790259 0.825362  ]
mean score: 0.8302031708274635
max score: 0.8302031708274635
opt depth: 5
depth: 6
score: [0.83815789 0.83201754 0.8293111  0.82580079 0.82974989 0.8411584
 0.83238262 0.83501536 0.8214129  0.82360685]
mean score: 0.8308613349961125
max score: 0.8308613349961125


{'accuracy': 0.8074521445388474,
 'precision': 0.7971478220235069,
 'recall': 0.8074521445388474,
 'f1': 0.8004348549605133}

In [20]:
# # Model and Evaluation
results_classification = []

# Define a list of classification functions along with their names
classification_model = [
    ('k-NearestNeighbour_classifier', kNN_class),
    ('DecisionTree_classifier', DecisionTree_class)
]

for dataset_name, dataset in datasets.items():
    for model_name, model in classification_model:
        try:
            results = model(dataset, test_data30)
            result_dict = {'dataset_name': dataset_name, 'model': model_name, **results}
            results_classification.append(result_dict)
        except Exception as e:
            error_message = str(e).split('\n')[0]  
            print(f"Error occurred in {model_name} Classification for dataset_name {dataset_name}: {error_message}")

results_classification_df = pd.DataFrame(results_classification)
display(results_classification_df)
results_classification_df.to_excel('classification_model_results2.xlsx', index=False)

Unnamed: 0,dataset_name,model,accuracy,precision,recall,f1
0,kNN_imputed_10,k-NearestNeighbour_classifier,0.806633,0.791143,0.806633,0.792501
1,kNN_imputed_10,DecisionTree_classifier,0.784318,0.783021,0.784318,0.783654
2,kNN_imputed_40,k-NearestNeighbour_classifier,0.566588,0.754949,0.566588,0.594846
3,kNN_imputed_40,DecisionTree_classifier,0.612038,0.755536,0.612038,0.640473
4,kNN_imputed_70,k-NearestNeighbour_classifier,0.261644,0.721854,0.261644,0.145689
5,kNN_imputed_70,DecisionTree_classifier,0.319582,0.696035,0.319582,0.26821
6,train_data_control,k-NearestNeighbour_classifier,0.811137,0.796149,0.811137,0.788795
7,train_data_control,DecisionTree_classifier,0.807759,0.797579,0.807759,0.800848
