In [1]:
# Setup the codespace
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from termcolor import colored

print(colored('\nAll libraries imported succesfully.', 'green'))

[32m
All libraries imported succesfully.[0m


In [2]:
# import datasets
test_data30 = pd.read_excel('test_data.xlsx')
train_data_control = pd.read_excel('train_data_control.xlsx')
mean_imputed_data_10 = pd.read_csv('mean_imputed_data_10.csv')
mean_imputed_data_40 = pd.read_csv('mean_imputed_data_40.csv')
mean_imputed_data_70 = pd.read_csv('mean_imputed_data_70.csv')

datasets = {
'mean_imputed_data_10' : mean_imputed_data_10,
'mean_imputed_data_40' : mean_imputed_data_40,
'mean_imputed_data_70' : mean_imputed_data_70,
}

print(colored('\nAll datasets imported succesfully.', 'green'))


[32m
All datasets imported succesfully.[0m


In [4]:
#this step is not needed for the mean_imputed data

#reverse the one hot encoding of the target variable
# for dataset_name, dataset in datasets.items():
#     # print(dataset_name)
#     # display(dataset)
#     # map value depending on conditional statement
#     dataset['target'] = dataset['target_ <=50K'].map(lambda x: '<=50K' if x == 1.0 else '>50K')
#     # Drop the dummy columns
#     dataset.drop(columns=['target_ <=50K', 'target_ >50K'], inplace=True)
#     # print(dataset_name)
#     # display(dataset)

# datasets['train_data_control'] = train_data_control
# # print(datasets)

# print(colored('\nEncoding succesfully reversed.', 'green'))

KeyError: 'target_<=50K'

In [5]:
# Functions
def kNN_class(train_data, test_data):
    evaluation = {}
    #train data
    features = [col for col in train_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_train = train_data[features].values.tolist()
    y_train = train_data['target'].values.tolist()
    y_train = [value.strip() for value in y_train]

    #test data
    features = [col for col in test_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_test = test_data[features].values.tolist()
    y_test = test_data['target'].values.tolist()
    y_test = [value.strip() for value in y_test]

    # Determine optimal k
    k_values = [i for i in range (2,10)]
    max_score = 0
    opt_k = 2

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, x_train, y_train, cv=opt_k)
        mean_score = np.mean(score)

        if mean_score > max_score:
            max_score = mean_score
            opt_k = k

    #Set and fit model
    knn = KNeighborsClassifier(n_neighbors=opt_k)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    f1 = f1_score(y_test, y_pred,average='weighted')

    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

def DecisionTree_class(train_data, test_data):
    evaluation = {}
    #train data
    features = [col for col in train_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_train = train_data[features].values.tolist()
    y_train = train_data['target'].values.tolist()
    y_train = [value.strip() for value in y_train]
    
    #test data
    features = [col for col in test_data.columns if col != 'target' and col != 'ID']
    # print(features)
    x_test = test_data[features].values.tolist()
    y_test = test_data['target'].values.tolist()
    y_test = [value.strip() for value in y_test]

    # Determine optimal tree depth
    depth_values = [i for i in range(2, 20)]
    max_score = 0
    optimal_depth = 2

    for depth in depth_values:
        # print(f'depth: {depth}')
        clf = DecisionTreeClassifier(max_depth=depth)
        score = cross_val_score(clf, x_train, y_train, cv=10)
        # print(f'score: {score}')
        mean_score = np.mean(score)
        # print(f'mean score: {mean_score}')

        if mean_score > max_score:
            max_score = mean_score
            # print(f'max score: {max_score}')
            optimal_depth = depth
            # print(f'opt depth: {optimal_depth}')
    
    # print(f'final depth: {optimal_depth}')

    # final modelling
    dt_clf = DecisionTreeClassifier()
    dt_clf = clf.fit(x_train,y_train)
    y_pred = dt_clf.predict(x_test)

    # performance evaluation
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,average='weighted')
    recall = recall_score(y_test, y_pred,average='weighted')
    f1 = f1_score(y_test, y_pred,average='weighted')
 
    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
        # 'rocauc': rocauc
    }
    return evaluation

print(colored('\nFunctions sucsesfully created.', 'green'))

[32m
Functions sucsesfully created.[0m


In [None]:
# Test DT depth searcher
DecisionTree_class(train_data_control,test_data30)


In [7]:
# # Model and Evaluation
datasets['train_data_control'] = train_data_control
print(datasets)
results_classification = []

# Define a list of classification functions along with their names
classification_model = [
    ('k-NearestNeighbour_classifier', kNN_class),
    ('DecisionTree_classifier', DecisionTree_class)
]

for dataset_name, dataset in datasets.items():
    for model_name, model in classification_model:
        try:
            results = model(dataset, test_data30)
            result_dict = {'dataset_name': dataset_name, 'model': model_name, **results}
            results_classification.append(result_dict)
        except Exception as e:
            error_message = str(e).split('\n')[0]  
            print(f"Error occurred in {model_name} Classification for dataset_name {dataset_name}: {error_message}")

results_classification_df = pd.DataFrame(results_classification)
display(results_classification_df)
results_classification_df.to_excel('mean_classification_model_results.xlsx', index=False)

{'mean_imputed_data_10':           ID       age    fnlwgt  education-num  capital-gain  capital-loss  \
0       9714  0.178082  0.097373       0.533333      0.000000      0.000000   
1       1826  0.068493  0.129471       0.533333      1.000000      0.000000   
2       9507  0.095890  0.068717       0.533333      0.000000      0.000000   
3      27887  0.082192  0.300917       0.533333      0.000000      0.000000   
4      13104  0.643836  0.106844       0.400000      0.000000      0.000000   
...      ...       ...       ...            ...           ...           ...   
22787  29471  0.295744  0.228504       0.933333      0.000000      0.000000   
22788  21055  0.520548  0.090279       0.606240      0.010571      0.000000   
22789  20870  0.369863  0.123332       0.666667      0.000000      0.307622   
22790  29768  0.287671  0.120619       0.606240      0.000000      0.000000   
22791  11185  0.397260  0.108866       0.800000      0.000000      0.436639   

       hours-per-week  tar

Unnamed: 0,dataset_name,model,accuracy,precision,recall,f1
0,mean_imputed_data_10,k-NearestNeighbour_classifier,0.812366,0.799997,0.812366,0.785603
1,mean_imputed_data_10,DecisionTree_classifier,0.785137,0.767573,0.785137,0.77209
2,mean_imputed_data_40,k-NearestNeighbour_classifier,0.781042,0.768357,0.781042,0.717674
3,mean_imputed_data_40,DecisionTree_classifier,0.774593,0.745485,0.774593,0.747053
4,mean_imputed_data_70,k-NearestNeighbour_classifier,0.762924,0.779608,0.762924,0.661535
5,mean_imputed_data_70,DecisionTree_classifier,0.749616,0.707729,0.749616,0.715716
6,train_data_control,k-NearestNeighbour_classifier,0.811137,0.796149,0.811137,0.788795
7,train_data_control,DecisionTree_classifier,0.807452,0.797109,0.807452,0.800397
