In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score,cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel

In [2]:
df1 = pd.read_csv('diabetes.csv')
df2 = pd.read_csv('Thyroid_Diff.csv')

Pre-Processing Dataframe 1

In [3]:
df1.info() #already preprocessed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Pre-Processing Dataframe 2

In [4]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [5]:
list_for_onehot = ['Thyroid Function','Physical Examination','Adenopathy','Pathology',
                   'Pathology','Risk','T','N','Stage','Response']

In [6]:
# binary categorical --> to --> binary numerical

df2['Gender'] = (df2['Gender']=='M').astype(int)
df2['Smoking'] = (df2['Smoking']=='Yes').astype(int)
df2['Hx Smoking'] = (df2['Hx Smoking']=='Yes').astype(int)
df2['Hx Radiothreapy'] = (df2['Hx Radiothreapy']=='Yes').astype(int)
df2['Focality'] = (df2['Focality']=='Uni-Focal').astype(int)
df2['M'] = (df2['M']=='M1').astype(int)
df2['Recurred'] = (df2['Recurred']=='Yes').astype(int)

In [7]:
df2 = pd.get_dummies(df2, columns = list_for_onehot)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 54 columns):
 #   Column                                            Non-Null Count  Dtype
---  ------                                            --------------  -----
 0   Age                                               383 non-null    int64
 1   Gender                                            383 non-null    int32
 2   Smoking                                           383 non-null    int32
 3   Hx Smoking                                        383 non-null    int32
 4   Hx Radiothreapy                                   383 non-null    int32
 5   Focality                                          383 non-null    int32
 6   M                                                 383 non-null    int32
 7   Recurred                                          383 non-null    int32
 8   Thyroid Function_Clinical Hyperthyroidism         383 non-null    bool 
 9   Thyroid Function_Clinical Hypothyroidism   

In [8]:
tf_map = {False:0, True:1}

cols_to_encode = [x for x in range(8,54)]

for col_idx in cols_to_encode:
    df2.iloc[:, col_idx] = df2.iloc[:, col_idx].map(tf_map)

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 54 columns):
 #   Column                                            Non-Null Count  Dtype
---  ------                                            --------------  -----
 0   Age                                               383 non-null    int64
 1   Gender                                            383 non-null    int32
 2   Smoking                                           383 non-null    int32
 3   Hx Smoking                                        383 non-null    int32
 4   Hx Radiothreapy                                   383 non-null    int32
 5   Focality                                          383 non-null    int32
 6   M                                                 383 non-null    int32
 7   Recurred                                          383 non-null    int32
 8   Thyroid Function_Clinical Hyperthyroidism         383 non-null    int64
 9   Thyroid Function_Clinical Hypothyroidism   

TrainTestSplit 

In [9]:
X1 = df1.loc[:,df1.columns!='Outcome']
y1 = df1.loc[:,'Outcome']

X2 = df2.loc[:,df2.columns!='Recurred']
y2 = df2.loc[:,'Recurred']

# a) 75%-25%
X1_train_A, X1_test_A, y1_train_A, y1_test_A = train_test_split(X1, y1, test_size=0.25, random_state=42)
X2_train_A, X2_test_A, y2_train_A, y2_test_A = train_test_split(X2, y2, test_size=0.25, random_state=42)

# b) 66.6%-33.3%
X1_train_B, X1_test_B, y1_train_B, y1_test_B = train_test_split(X1, y1, test_size=0.33, random_state=42)
X2_train_B, X2_test_B, y2_train_B, y2_test_B = train_test_split(X2, y2, test_size=0.33, random_state=42)


5.1 Evaluation on Train-Test Split as 75-25 and 66.6-33.3

In [10]:
classifiers = {
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier()
}


print('For DF1: 75% Train - 25% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X1_train_A, y1_train_A)

    # Evaluate the classifier
    y_pred = clf.predict(X1_test_A)
    accuracy = accuracy_score(y1_test_A, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")


print('\nFor DF1: 66.6% Train - 33.3% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X1_train_B, y1_train_B)
    
    # Evaluate the classifier
    y_pred = clf.predict(X1_test_B)
    accuracy = accuracy_score(y1_test_B, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")

print('\nFor DF2: 75% Train - 25% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X2_train_A, y2_train_A)

    # Evaluate the classifier
    y_pred = clf.predict(X2_test_A)
    accuracy = accuracy_score(y2_test_A, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")

print('\nFor DF2: 66.6% Train - 33.3% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X2_train_B, y2_train_B)

    # Evaluate the classifier
    y_pred = clf.predict(X2_test_B)
    accuracy = accuracy_score(y2_test_B, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")

For DF1: 75% Train - 25% Test
   KNN Accuracy: 0.66
   Naive Bayes Accuracy: 0.73
   Decision Tree Accuracy: 0.69

For DF1: 66.6% Train - 33.3% Test
   KNN Accuracy: 0.70
   Naive Bayes Accuracy: 0.74
   Decision Tree Accuracy: 0.71

For DF2: 75% Train - 25% Test
   KNN Accuracy: 0.89
   Naive Bayes Accuracy: 0.96
   Decision Tree Accuracy: 0.95

For DF2: 66.6% Train - 33.3% Test
   KNN Accuracy: 0.86
   Naive Bayes Accuracy: 0.94
   Decision Tree Accuracy: 0.96


5.2 Evaluation using Holdout, Random Subsampling and 5-Fold CV

In [73]:
# a) holdout (70%-30%)

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, stratify=y1, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, stratify=y2, random_state=42)

print('Holdout Method for DF1: 70% Train - 30% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X1_train, y1_train)

    # Evaluate the classifier
    y_pred = clf.predict(X1_test)
    accuracy = accuracy_score(y1_test, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")

print('\nHoldout Method for DF2: 70% Train - 30% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X2_train, y2_train)

    # Evaluate the classifier
    y_pred = clf.predict(X2_test)
    accuracy = accuracy_score(y2_test, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")


Holdout Method for DF1: 70% Train - 30% Test
   KNN Accuracy: 0.71
   Naive Bayes Accuracy: 0.74
   Decision Tree Accuracy: 0.76

Holdout Method for DF2: 70% Train - 30% Test
   KNN Accuracy: 0.94
   Naive Bayes Accuracy: 0.93
   Decision Tree Accuracy: 0.91


In [11]:
# b) random subsample 

X1_train_1, X1_test_1, y1_train_1, y1_test_1 = train_test_split(X1, y1, test_size=0.2, random_state=42) # 80-20
X1_train_2, X1_test_2, y1_train_2, y1_test_2 = train_test_split(X1, y1, test_size=0.33, random_state=42) # 66.6-33.3
X1_train_3, X1_test_3, y1_train_3, y1_test_3 = train_test_split(X1, y1, test_size=0.3, random_state=42) # 70-30

X2_train_1, X2_test_1, y2_train_1, y2_test_1 = train_test_split(X2, y2, test_size=0.2, random_state=42) # 80-20
X2_train_2, X2_test_2, y2_train_2, y2_test_2 = train_test_split(X2, y2, test_size=0.33, random_state=42) # 66.6-33.3
X2_train_3, X2_test_3, y2_train_3, y2_test_3 = train_test_split(X2, y2, test_size=0.3, random_state=42) # 70-30



print('Random Subsample for DF1:')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X1_train_1, y1_train_1)
    y_pred_1 = clf.predict(X1_test_1)
    acc_1 = accuracy_score(y1_test_1, y_pred_1)

    clf.fit(X1_train_2, y1_train_2)
    y_pred_2 = clf.predict(X1_test_2)
    acc_2 = accuracy_score(y1_test_2, y_pred_2)

    clf.fit(X1_train_3, y1_train_3)
    y_pred_3 = clf.predict(X1_test_3)
    acc_3 = accuracy_score(y1_test_3, y_pred_3)
    
    accuracy = (acc_1 + acc_2 + acc_3)/3
    
    print('  ',f"{name} Accuracy: {accuracy:.2f}")

print('\nRandom Subsample for DF2:')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X2_train_1, y2_train_1)
    y_pred_1 = clf.predict(X2_test_1)
    acc_1 = accuracy_score(y2_test_1, y_pred_1)

    clf.fit(X2_train_2, y2_train_2)
    y_pred_2 = clf.predict(X2_test_2)
    acc_2 = accuracy_score(y2_test_2, y_pred_2)

    clf.fit(X2_train_3, y2_train_3)
    y_pred_3 = clf.predict(X2_test_3)
    acc_3 = accuracy_score(y2_test_3, y_pred_3)

    accuracy = (acc_1 + acc_2 + acc_3)/3
    
    print('  ',f"{name} Accuracy: {accuracy:.2f}")


Random Subsample for DF1:
   KNN Accuracy: 0.68
   Naive Bayes Accuracy: 0.75
   Decision Tree Accuracy: 0.73

Random Subsample for DF2:
   KNN Accuracy: 0.87
   Naive Bayes Accuracy: 0.95
   Decision Tree Accuracy: 0.97


In [71]:
# c) 5-Fold Cross-Validation 

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

scoring = ['accuracy']

models = [KNeighborsClassifier(),GaussianNB(),DecisionTreeClassifier()]

print('5-Fold CV for DF1:')
for model in models:
    result = list()
    scores = cross_validate(model, X1, y1, cv=kf, scoring=scoring)
    
    for value in scores:
        v = str(value)
        mean_score = scores[v].mean()
        std_score = scores[v].std()
        if (v == "fit_time" or v =="score_time"):
          pass
        else:
          print(f"{model} --> {mean_score:.2f} ± {std_score:.2f}")

print('\n5-Fold CV for DF2:')
for model in models:
    result = list()
    scores = cross_validate(model, X2, y2, cv=kf, scoring=scoring)
    
    for value in scores:
        v = str(value)
        mean_score = scores[v].mean()
        std_score = scores[v].std()
        if (v == "fit_time" or v =="score_time"):
          pass
        else:
          print(f"{model} --> {mean_score:.2f} ± {std_score:.2f}")

5-Fold CV for DF1:
KNeighborsClassifier() --> 0.69 ± 0.02
GaussianNB() --> 0.75 ± 0.02
DecisionTreeClassifier() --> 0.72 ± 0.02

5-Fold CV for DF2:
KNeighborsClassifier() --> 0.88 ± 0.03
GaussianNB() --> 0.95 ± 0.01
DecisionTreeClassifier() --> 0.93 ± 0.03


5.3 Results After Scaling the Values

In [75]:
scaler = StandardScaler()

X1 = scaler.fit_transform(X1)
X2 = scaler.fit_transform(X2)

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, stratify=y2, random_state=42)

print('After Scaling values for DF1: 80% Train - 20% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X1_train, y1_train)

    # Evaluate the classifier
    y_pred = clf.predict(X1_test)
    accuracy = accuracy_score(y1_test, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")

print('\nAfter Scaling values for DF2: 80% Train - 20% Test')
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X2_train, y2_train)

    # Evaluate the classifier
    y_pred = clf.predict(X2_test)
    accuracy = accuracy_score(y2_test, y_pred)
    print('  ',f"{name} Accuracy: {accuracy:.2f}")


After Scaling values for DF1: 80% Train - 20% Test
   KNN Accuracy: 0.71
   Naive Bayes Accuracy: 0.71
   Decision Tree Accuracy: 0.75

After Scaling values for DF2: 80% Train - 20% Test
   KNN Accuracy: 0.95
   Naive Bayes Accuracy: 0.94
   Decision Tree Accuracy: 0.94
