# Introduction
In this section, we will process some models using processed data

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from scipy import stats
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore")
 
%matplotlib inline

In [71]:
# read dataset
data = pd.read_csv('Dataset_edited/processed_data.csv')

In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19555 entries, 0 to 19554
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        19555 non-null  float64
 1   sex                        19555 non-null  int64  
 2   on_thyroxine               19555 non-null  int64  
 3   query_on_thyroxine         19555 non-null  int64  
 4   on_antithyroid_medication  19555 non-null  int64  
 5   sick                       19555 non-null  int64  
 6   pregnant                   19555 non-null  int64  
 7   thyroid_surgery            19555 non-null  int64  
 8   query_hypothyroid          19555 non-null  int64  
 9   query_hyperthyroid         19555 non-null  int64  
 10  lithium                    19555 non-null  int64  
 11  goitre                     19555 non-null  int64  
 12  tumor                      19555 non-null  int64  
 13  TSH                        19555 non-null  flo

In [73]:
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,TSH,T3,TT4,T4U,FTI,Target
0,41.0,0,0,0,0,0,0,0,0,0,0,0,0,1.3,2.5,125.0,1.14,109.0,0
1,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0.72,1.2,61.0,0.87,70.0,0
2,80.0,0,0,0,0,0,0,0,0,0,0,0,0,2.2,0.6,80.0,0.7,115.0,0
3,66.0,0,0,0,0,0,0,0,0,0,0,0,1,0.6,2.2,123.0,0.93,132.0,0
4,68.0,1,0,0,0,0,0,0,0,0,0,0,0,2.4,1.6,83.0,0.89,93.0,0


# Correlation Matrix

### Most related to the _Target_

In the new dataset, using number to replace name:

0 means False, 1 means True

Sex: 0 means Female, 1 means Male

Target: 0 means 'negativee', 1 means 'hypothyroid', 2 means 'hyperthyroid'

In [74]:
corr_values = abs(data[data.columns].corr()['Target']).drop('Target')
high_corr_values = corr_values[corr_values>0.05]
high_corr_values

sex                   0.071489
on_thyroxine          0.069057
query_hypothyroid     0.060746
query_hyperthyroid    0.079067
TSH                   0.250161
FTI                   0.052016
Name: Target, dtype: float64

We can know that, except for five parameters [TSH, T3, TT4, T4U, FTI], the sex, on_thyroxine, query_hypothyroid and query_hyperthyroid are slightly high related to the Target.

In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19555 entries, 0 to 19554
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        19555 non-null  float64
 1   sex                        19555 non-null  int64  
 2   on_thyroxine               19555 non-null  int64  
 3   query_on_thyroxine         19555 non-null  int64  
 4   on_antithyroid_medication  19555 non-null  int64  
 5   sick                       19555 non-null  int64  
 6   pregnant                   19555 non-null  int64  
 7   thyroid_surgery            19555 non-null  int64  
 8   query_hypothyroid          19555 non-null  int64  
 9   query_hyperthyroid         19555 non-null  int64  
 10  lithium                    19555 non-null  int64  
 11  goitre                     19555 non-null  int64  
 12  tumor                      19555 non-null  int64  
 13  TSH                        19555 non-null  flo

## Divide dataset - Train and Test

In [76]:
X_columns = ['sex','on_thyroxine','query_hypothyroid','query_hyperthyroid','TSH','T3','TT4','T4U','FTI']
X = data[X_columns]
y = data['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

## Classifier

In [77]:
classifiers = {
    "XGBClassifier" : XGBClassifier(learning_rate=0.01),
    "CatBoostClassifier" : CatBoostClassifier(max_depth=4,verbose=0),
    "K Nearest Neighbors" : KNeighborsClassifier(4),
    "Decision Tree" : DecisionTreeClassifier(class_weight = 'balanced'),
    "Random Forest": RandomForestClassifier(class_weight = 'balanced',random_state = 1),
    "ExtraTrees": ExtraTreesClassifier(class_weight = 'balanced',random_state = 1),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(256,128,64,32),activation="relu",random_state=1),
    "GaussianNB": GaussianNB(),
    "Bagging Classifier": BaggingClassifier(KNeighborsClassifier(), max_samples=0.5),
    "AdaBoost Classifier": AdaBoostClassifier(n_estimators=10),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=1)
}

In [78]:
def classification(classifiers, X_train, X_test, y_train, y_test):
    # Creo un dataframe per visualizzare i risultati calcolati
    res = pd.DataFrame(columns=["Classifier", 
                                "Accuracy", 
                                "Precision", 
                                "Recall", 
                                "F1-Score"])    
                                    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        precision, recall, f1, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')
        res = res.append({"Classifier": name, "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
                          "Precision": round(precision, 4), "Recall":round(recall, 4), "F1-Score":round(f1, 4)}, ignore_index=True)
        print("Confusion matrix for: ", name)
        print(confusion_matrix(y_test, y_pred))
        
    res.set_index("Classifier", inplace=True)
    res.sort_values(by="F1-Score", ascending=False, inplace=True)   
    return res

display(classification(classifiers, X_train, X_test, y_train, y_test))

Confusion matrix for:  XGBClassifier
[[6904  133   41]
 [  18  549    0]
 [  38    0  139]]
Confusion matrix for:  CatBoostClassifier
[[6918  112   48]
 [  80  487    0]
 [  40    2  135]]
Confusion matrix for:  K Nearest Neighbors
[[6964   93   21]
 [ 298  267    2]
 [  99    2   76]]
Confusion matrix for:  Decision Tree
[[6895  119   64]
 [  81  486    0]
 [  40    0  137]]
Confusion matrix for:  Random Forest
[[6902  118   58]
 [  73  494    0]
 [  32    0  145]]
Confusion matrix for:  ExtraTrees
[[6894  115   69]
 [ 113  452    2]
 [  41    2  134]]
Confusion matrix for:  MLPClassifier
[[6830  148  100]
 [  48  516    3]
 [  32    2  143]]
Confusion matrix for:  GaussianNB
[[6572  174  332]
 [ 364  190   13]
 [  88   13   76]]
Confusion matrix for:  Bagging Classifier
[[6942   94   42]
 [ 249  315    3]
 [  80    3   94]]
Confusion matrix for:  AdaBoost Classifier
[[7043   30    5]
 [ 411  156    0]
 [ 135   18   24]]
Confusion matrix for:  Gradient Boosting Classifier
[[6912  122 

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting Classifier,0.9712,0.8568,0.9109,0.8818
XGBClassifier,0.9706,0.8564,0.9097,0.8805
Random Forest,0.9641,0.8355,0.8885,0.8604
CatBoostClassifier,0.9639,0.8437,0.8663,0.8547
Decision Tree,0.9611,0.8226,0.8684,0.8442
MLPClassifier,0.9574,0.7815,0.8943,0.8299
ExtraTrees,0.9563,0.8087,0.8427,0.8245
Bagging Classifier,0.9398,0.7985,0.6891,0.7353
K Nearest Neighbors,0.9342,0.8171,0.6281,0.6967
AdaBoost Classifier,0.9234,0.8401,0.4686,0.5327


In [79]:
display(data.shape)
data.Target.value_counts()

(19555, 19)

0    17702
1     1403
2      450
Name: Target, dtype: int64

# Imbalance Class
## SMOTE - Over-sampling

In [80]:
smote = SMOTE('not majority',random_state = 1)
X_train_sm, y_train_sm = smote.fit_sample(X_train,y_train)
X_test_sm, y_test_sm = smote.fit_sample(X_test,y_test)
print(X_train.shape)
print(X_train_sm.shape)
display(classification(classifiers,X_train_sm, X_test_sm, y_train_sm, y_test_sm))

(11733, 9)
(31872, 9)
Confusion matrix for:  XGBClassifier
[[6715  141  222]
 [  68 6988   22]
 [  72    7 6999]]
Confusion matrix for:  CatBoostClassifier
[[6829  135  114]
 [  77 6971   30]
 [  98   19 6961]]
Confusion matrix for:  K Nearest Neighbors
[[6658  229  191]
 [ 849 6194   35]
 [ 943   17 6118]]
Confusion matrix for:  Decision Tree
[[6904  105   69]
 [ 658 6411    9]
 [ 544    5 6529]]
Confusion matrix for:  Random Forest
[[6881  123   74]
 [ 279 6793    6]
 [ 301    6 6771]]
Confusion matrix for:  ExtraTrees
[[6868  111   99]
 [ 617 6430   31]
 [ 424   19 6635]]
Confusion matrix for:  MLPClassifier
[[6787  146  145]
 [ 102 6964   12]
 [ 236   24 6818]]
Confusion matrix for:  GaussianNB
[[3219 1682 2177]
 [ 566 5445 1067]
 [ 228 1794 5056]]
Confusion matrix for:  Bagging Classifier
[[6484  314  280]
 [ 420 6614   44]
 [ 475   35 6568]]
Confusion matrix for:  AdaBoost Classifier
[[6551  230  297]
 [  42 6935  101]
 [1673   19 5386]]
Confusion matrix for:  Gradient Boosting C

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CatBoostClassifier,0.9777,0.9777,0.9777,0.9777
Gradient Boosting Classifier,0.9777,0.9777,0.9777,0.9777
XGBClassifier,0.9749,0.975,0.9749,0.9749
MLPClassifier,0.9687,0.9687,0.9687,0.9687
Random Forest,0.9628,0.964,0.9628,0.9631
ExtraTrees,0.9387,0.9431,0.9387,0.9394
Decision Tree,0.9345,0.941,0.9345,0.9355
Bagging Classifier,0.9262,0.9272,0.9262,0.9265
K Nearest Neighbors,0.8934,0.9047,0.8934,0.8952
AdaBoost Classifier,0.8888,0.8964,0.8888,0.888


## Under-sampling

In [81]:
df_negative = data[data.Target==0]
df_hypothyroid = data[data.Target==1]
df_hyperthyroid = data[data.Target==2]

df_negative_undersampled = resample(df_negative,replace=False,n_samples=450,random_state=123)
df_hypothyroid_undersampled = resample(df_hypothyroid,replace=False,n_samples=450,random_state=123)

df_downsampled = pd.concat([df_negative_undersampled,df_hypothyroid_undersampled,df_hyperthyroid])
df_downsampled.Target.value_counts()

2    450
1    450
0    450
Name: Target, dtype: int64

In [82]:
X4 = df_downsampled[X_columns]
y4 = df_downsampled['Target']
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.3, random_state=42)
display(classification(classifiers,X_train4, X_test4, y_train4, y_test4))

Confusion matrix for:  XGBClassifier
[[122  10   5]
 [  0 129   0]
 [  0   0 139]]
Confusion matrix for:  CatBoostClassifier
[[127   7   3]
 [  0 129   0]
 [  0   0 139]]
Confusion matrix for:  K Nearest Neighbors
[[115  12  10]
 [ 32  96   1]
 [ 12   1 126]]
Confusion matrix for:  Decision Tree
[[125   8   4]
 [ 12 117   0]
 [  3   0 136]]
Confusion matrix for:  Random Forest
[[127   7   3]
 [  1 128   0]
 [  0   0 139]]
Confusion matrix for:  ExtraTrees
[[117  14   6]
 [  5 122   2]
 [  3   0 136]]
Confusion matrix for:  MLPClassifier
[[121  13   3]
 [  2 127   0]
 [ 30   2 107]]
Confusion matrix for:  GaussianNB
[[119  10   8]
 [ 32  79  18]
 [ 37  20  82]]
Confusion matrix for:  Bagging Classifier
[[109  20   8]
 [ 33  93   3]
 [ 10   3 126]]
Confusion matrix for:  AdaBoost Classifier
[[125   6   6]
 [ 36  87   6]
 [ 19   0 120]]
Confusion matrix for:  Gradient Boosting Classifier
[[127   7   3]
 [  1 128   0]
 [  0   0 139]]


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CatBoostClassifier,0.9753,0.9758,0.9757,0.975
Random Forest,0.9728,0.9731,0.9731,0.9725
Gradient Boosting Classifier,0.9728,0.9731,0.9731,0.9725
XGBClassifier,0.963,0.9644,0.9635,0.9624
Decision Tree,0.9333,0.9334,0.9326,0.9329
ExtraTrees,0.9259,0.9258,0.9261,0.925
MLPClassifier,0.8765,0.886,0.8792,0.8771
K Nearest Neighbors,0.8321,0.8412,0.83,0.8323
AdaBoost Classifier,0.8198,0.8463,0.8167,0.8193
Bagging Classifier,0.8099,0.8128,0.8077,0.8089


# Conclusion:

????