In [1]:
# Setup the codespace
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from termcolor import colored

print(colored('\nAll libraries imported succesfully.', 'green'))

[32m
All libraries imported succesfully.[0m


In [2]:
# import datasets
test_data30 = pd.read_excel('test_data.xlsx')
train_data_control = pd.read_excel('train_data_control.xlsx')
kNN_imputed_10 = pd.read_csv('kNN_imputed_10.csv')
kNN_imputed_40 = pd.read_csv('kNN_imputed_40.csv')
kNN_imputed_70 = pd.read_csv('kNN_imputed_70.csv')

datasets = {
'kNN_imputed_10' : kNN_imputed_10,
'kNN_imputed_40' : kNN_imputed_40,
'kNN_imputed_70' : kNN_imputed_70,
}


    


In [3]:
#reverse the one hot encoding of the target variable
for dataset_name, dataset in datasets.items():
    # print(dataset_name)
    # display(dataset)
    # map value depending on conditional statement
    dataset['target'] = dataset['target_ <=50K'].map(lambda x: '<=50K' if x == 1.0 else '>50K')
    # Drop the dummy columns
    dataset.drop(columns=['target_ <=50K', 'target_ >50K'], inplace=True)
    # print(dataset_name)
    # display(dataset)

datasets['train_data_control'] = train_data_control
# print(datasets)


In [7]:
# Functions
def kNN_class(train_data, test_data):
    evaluation = {}
    #train data
    features = [col for col in train_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_train = train_data[features].values.tolist()
    y_train = train_data['target'].values.tolist()
    y_train = [value.strip() for value in y_train]

    #test data
    features = [col for col in test_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_test = test_data[features].values.tolist()
    y_test = test_data['target'].values.tolist()
    y_test = [value.strip() for value in y_test]

    # Determine optimal k
    k_values = [i for i in range (2,10)]
    max_score = 0
    opt_k = 2

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, x_train, y_train, cv=opt_k)
        mean_score = np.mean(score)

        if mean_score > max_score:
            max_score = mean_score
            opt_k = k

    #Set and fit model
    knn = KNeighborsClassifier(n_neighbors=opt_k)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    f1 = f1_score(y_test, y_pred,average='weighted')

    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

def DecisionTree_class(train_data, test_data):
    evaluation = {}
    #train data
    features = [col for col in train_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_train = train_data[features].values.tolist()
    y_train = train_data['target'].values.tolist()
    y_train = [value.strip() for value in y_train]
    
    #test data
    features = [col for col in test_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_test = test_data[features].values.tolist()
    y_test = test_data['target'].values.tolist()
    y_test = [value.strip() for value in y_test]

    # Determine optimal tree depth
    depth_values = [i for i in range(2, 10)]
    max_score = 0
    optimal_depth = 2

    for depth in depth_values:
        clf = DecisionTreeClassifier(max_depth=depth)
        score = cross_val_score(clf, x_train, y_train, cv=5)  # You can adjust the number of folds in CV
        mean_score = np.mean(score)

        if mean_score > max_score:
            max_score = mean_score
            optimal_depth = depth

    dt_clf = DecisionTreeClassifier()
    dt_clf = clf.fit(x_train,y_train)
    y_pred = dt_clf.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    f1 = f1_score(y_test, y_pred,average='weighted')
 
    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
        # 'rocauc': rocauc
    }
    return evaluation



In [11]:
# # Model and Evaluation
results_classification = []

# Define a list of classification functions along with their names
classification_model = [
    ('k-NearestNeighbour_classifier', kNN_class),
    ('DecisionTree_classifier', DecisionTree_class)
]

for dataset_name, dataset in datasets.items():
    for model_name, model in classification_model:
        try:
            results = model(dataset, test_data30)
            result_dict = {'dataset_name': dataset_name, 'model': model_name, **results}
            results_classification.append(result_dict)
        except Exception as e:
            error_message = str(e).split('\n')[0]  
            print(f"Error occurred in {model_name} Classification for dataset_name {dataset_name}: {error_message}")

results_classification_df = pd.DataFrame(results_classification)
display(results_classification_df)
results_classification_df.to_excel('classification_model_results.xlsx', index=False)

Unnamed: 0,dataset_name,model,accuracy,precision,recall,f1
0,kNN_imputed_10,k-NearestNeighbour_classifier,0.806633,0.791143,0.806633,0.792501
1,kNN_imputed_10,DecisionTree_classifier,0.70908,0.74608,0.70908,0.722647
2,kNN_imputed_40,k-NearestNeighbour_classifier,0.566588,0.754949,0.566588,0.594846
3,kNN_imputed_40,DecisionTree_classifier,0.546115,0.722845,0.546115,0.576737
4,kNN_imputed_70,k-NearestNeighbour_classifier,0.261644,0.721854,0.261644,0.145689
5,kNN_imputed_70,DecisionTree_classifier,0.390828,0.684323,0.390828,0.391609
6,train_data_control,k-NearestNeighbour_classifier,0.811137,0.796149,0.811137,0.788795
7,train_data_control,DecisionTree_classifier,0.769987,0.773763,0.769987,0.771774
