# Introduction
In this section, we will process some models using processed data

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from scipy import stats
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore")
 
%matplotlib inline

In [39]:
# read dataset
data = pd.read_csv('Dataset_edited/processed_data.csv')

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19555 entries, 0 to 19554
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        19555 non-null  float64
 1   sex                        19555 non-null  int64  
 2   on_thyroxine               19555 non-null  int64  
 3   query_on_thyroxine         19555 non-null  int64  
 4   on_antithyroid_medication  19555 non-null  int64  
 5   sick                       19555 non-null  int64  
 6   pregnant                   19555 non-null  int64  
 7   thyroid_surgery            19555 non-null  int64  
 8   query_hypothyroid          19555 non-null  int64  
 9   query_hyperthyroid         19555 non-null  int64  
 10  lithium                    19555 non-null  int64  
 11  goitre                     19555 non-null  int64  
 12  tumor                      19555 non-null  int64  
 13  TSH                        19555 non-null  flo

In [41]:
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,TSH,T3,TT4,T4U,FTI,Target
0,41.0,0,0,0,0,0,0,0,0,0,0,0,0,1.3,2.5,125.0,1.14,109.0,0
1,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0.72,1.2,61.0,0.87,70.0,0
2,80.0,0,0,0,0,0,0,0,0,0,0,0,0,2.2,0.6,80.0,0.7,115.0,0
3,66.0,0,0,0,0,0,0,0,0,0,0,0,1,0.6,2.2,123.0,0.93,132.0,0
4,68.0,1,0,0,0,0,0,0,0,0,0,0,0,2.4,1.6,83.0,0.89,93.0,0


# Correlation Matrix

### Most related to the _Target_

#### 缩减特征变量  Feature Scaling

In the new dataset, using number to replace name:

0 means False, 1 means True

Sex: 0 means Female, 1 means Male

Target: 0 means 'negativee', 1 means 'hypothyroid', 2 means 'hyperthyroid'

In [42]:
corr_values = abs(data[data.columns].corr()['Target']).drop('Target')
high_corr_values = corr_values[corr_values>0.04]
high_corr_values

sex                   0.071489
on_thyroxine          0.069057
query_hypothyroid     0.060746
query_hyperthyroid    0.079067
tumor                 0.043465
TSH                   0.250161
FTI                   0.052016
Name: Target, dtype: float64

We can know that, except for five parameters [TSH, T3, TT4, T4U, FTI], the sex, on_thyroxine, query_hypothyroid, query_hyperthyroid and tomor are slightly high related to the Target.

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19555 entries, 0 to 19554
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        19555 non-null  float64
 1   sex                        19555 non-null  int64  
 2   on_thyroxine               19555 non-null  int64  
 3   query_on_thyroxine         19555 non-null  int64  
 4   on_antithyroid_medication  19555 non-null  int64  
 5   sick                       19555 non-null  int64  
 6   pregnant                   19555 non-null  int64  
 7   thyroid_surgery            19555 non-null  int64  
 8   query_hypothyroid          19555 non-null  int64  
 9   query_hyperthyroid         19555 non-null  int64  
 10  lithium                    19555 non-null  int64  
 11  goitre                     19555 non-null  int64  
 12  tumor                      19555 non-null  int64  
 13  TSH                        19555 non-null  flo

## Divide dataset - Train and Test

In [44]:
X_columns = ['sex','on_thyroxine','query_hypothyroid','query_hyperthyroid','tumor','TSH','T3','TT4','T4U','FTI']
X = data[X_columns]
y = data['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

## Classifier

In [45]:
classifiers = {
    "XGB Classifier" : XGBClassifier(learning_rate=0.1),
    "CatBoost Classifier" : CatBoostClassifier(max_depth=3,verbose=0),
    "K Nearest Neighbors" : KNeighborsClassifier(3),
    "Decision Tree" : DecisionTreeClassifier(class_weight = 'balanced'),
    "Random Forest": RandomForestClassifier(class_weight = 'balanced',random_state = 1),
    "ExtraTrees": ExtraTreesClassifier(class_weight = 'balanced',random_state = 1),
    "MLP Classifier": MLPClassifier(activation="relu",random_state=1),
    "Bagging Classifier": BaggingClassifier(KNeighborsClassifier(), max_samples=0.5),
    "AdaBoost Classifier": AdaBoostClassifier(n_estimators=10),
    "GradientBoosting Classifier": GradientBoostingClassifier(random_state=1)
}

In [46]:
def classification(classifiers, X_train, X_test, y_train, y_test):
    # Creo un dataframe per visualizzare i risultati calcolati
    res = pd.DataFrame(columns=["Classifier", 
                                "Accuracy", 
                                "Precision", 
                                "Recall", 
                                "F1-Score"])    
                                    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        precision, recall, f1, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')
        res = res.append({"Classifier": name, "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
                          "Precision": round(precision, 4), "Recall":round(recall, 4), "F1-Score":round(f1, 4)}, ignore_index=True)
        print("Confusion matrix for: ", name)
        print(confusion_matrix(y_test, y_pred))
        
    res.set_index("Classifier", inplace=True)
    res.sort_values(by="F1-Score", ascending=False, inplace=True)   
    return res

display(classification(classifiers, X_train, X_test, y_train, y_test))

Confusion matrix for:  XGB Classifier
[[6916  112   50]
 [  74  493    0]
 [  30    0  147]]
Confusion matrix for:  CatBoost Classifier
[[6922  110   46]
 [  70  496    1]
 [  42    1  134]]
Confusion matrix for:  K Nearest Neighbors
[[6892  124   62]
 [ 218  348    1]
 [  77    1   99]]
Confusion matrix for:  Decision Tree
[[6899  115   64]
 [  86  481    0]
 [  36    0  141]]
Confusion matrix for:  Random Forest
[[6908  115   55]
 [  75  492    0]
 [  36    0  141]]
Confusion matrix for:  ExtraTrees
[[6897  115   66]
 [ 123  442    2]
 [  40    3  134]]
Confusion matrix for:  MLP Classifier
[[6922  138   18]
 [  87  478    2]
 [  81   10   86]]
Confusion matrix for:  Bagging Classifier
[[6940   96   42]
 [ 254  310    3]
 [  81    1   95]]
Confusion matrix for:  AdaBoost Classifier
[[7036   35    7]
 [ 390  152   25]
 [ 139    0   38]]
Confusion matrix for:  GradientBoosting Classifier
[[6904  124   50]
 [  22  545    0]
 [  38    0  139]]


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoosting Classifier,0.9701,0.8472,0.9073,0.8749
XGB Classifier,0.966,0.8488,0.8924,0.8695
CatBoost Classifier,0.9655,0.8472,0.8699,0.8582
Random Forest,0.9641,0.838,0.8801,0.8581
Decision Tree,0.9615,0.8258,0.8732,0.848
ExtraTrees,0.9554,0.8099,0.837,0.8224
MLP Classifier,0.957,0.8504,0.769,0.7954
K Nearest Neighbors,0.9383,0.7686,0.7156,0.7399
Bagging Classifier,0.939,0.7981,0.688,0.7343
AdaBoost Classifier,0.9238,0.7619,0.4923,0.5573


In [47]:
display(data.shape)
data.Target.value_counts()

(19555, 19)

0    17702
1     1403
2      450
Name: Target, dtype: int64

# Imbalance Class
## SMOTE - Over-sampling

In [56]:
smote = SMOTE('not majority',random_state = 1)
X_sm, y_sm = smote.fit_sample(X, y)
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.4, random_state=42)

print(X_train.shape)
print(X_train_sm.shape)
display(classification(classifiers,X_train_sm, X_test_sm, y_train_sm, y_test_sm))

(11733, 10)
(31863, 10)
Confusion matrix for:  XGB Classifier
[[6940  146   93]
 [  10 7078    0]
 [   8    0 6968]]
Confusion matrix for:  CatBoost Classifier
[[6900  155  124]
 [  17 7071    0]
 [   6    7 6963]]
Confusion matrix for:  K Nearest Neighbors
[[6688  287  204]
 [  54 7034    0]
 [  53    0 6923]]
Confusion matrix for:  Decision Tree
[[6978  125   76]
 [  93 6995    0]
 [  42    0 6934]]
Confusion matrix for:  Random Forest
[[6977  134   68]
 [  28 7060    0]
 [  10    0 6966]]
Confusion matrix for:  ExtraTrees
[[6947  135   97]
 [  62 7024    2]
 [  20    1 6955]]
Confusion matrix for:  MLP Classifier
[[6846  220  113]
 [  39 7045    4]
 [ 443    0 6533]]
Confusion matrix for:  Bagging Classifier
[[6487  389  303]
 [  27 7059    2]
 [  63    1 6912]]
Confusion matrix for:  AdaBoost Classifier
[[6587  175  417]
 [1835 4969  284]
 [ 108   37 6831]]
Confusion matrix for:  GradientBoosting Classifier
[[6878  158  143]
 [   9 7078    1]
 [  20    0 6956]]


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0.9887,0.9888,0.9888,0.9887
XGB Classifier,0.9879,0.988,0.9881,0.9879
CatBoost Classifier,0.9855,0.9856,0.9856,0.9854
ExtraTrees,0.9851,0.9851,0.9852,0.9851
GradientBoosting Classifier,0.9844,0.9846,0.9846,0.9844
Decision Tree,0.9842,0.9842,0.9843,0.9842
K Nearest Neighbors,0.9718,0.9721,0.9721,0.9718
Bagging Classifier,0.963,0.9639,0.9634,0.9628
MLP Classifier,0.9614,0.9621,0.9613,0.9615
AdaBoost Classifier,0.8656,0.8794,0.8659,0.8634


## Under-sampling

In [50]:
negative_df = data[data.Target==0]
hypothyroid_df = data[data.Target==1]
hyperthyroid_df = data[data.Target==2]

negative_undersampled = resample(negative_df,replace=False,n_samples=450,random_state=123)
hypothyroid_undersampled = resample(hypothyroid_df,replace=False,n_samples=450,random_state=123)

downsampled_df = pd.concat([negative_undersampled,hypothyroid_undersampled,hyperthyroid_df])
downsampled_df.Target.value_counts()

2    450
1    450
0    450
Name: Target, dtype: int64

In [51]:
X_under = downsampled_df[X_columns]
y_under = downsampled_df['Target']
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_under, y_under, test_size=0.3, random_state=42)
display(classification(classifiers,X_train_under, X_test_under, y_train_under, y_test_under))

Confusion matrix for:  XGB Classifier
[[124  10   3]
 [  0 129   0]
 [  1   0 138]]
Confusion matrix for:  CatBoost Classifier
[[126   8   3]
 [  1 128   0]
 [  1   0 138]]
Confusion matrix for:  K Nearest Neighbors
[[109  14  14]
 [ 18 110   1]
 [  7   1 131]]
Confusion matrix for:  Decision Tree
[[124   9   4]
 [ 11 118   0]
 [  3   0 136]]
Confusion matrix for:  Random Forest
[[127   7   3]
 [  1 128   0]
 [  0   0 139]]
Confusion matrix for:  ExtraTrees
[[118  14   5]
 [  6 118   5]
 [  4   0 135]]
Confusion matrix for:  MLP Classifier
[[114  11  12]
 [  6 122   1]
 [  8   2 129]]
Confusion matrix for:  Bagging Classifier
[[104  22  11]
 [ 24 100   5]
 [  7   3 129]]
Confusion matrix for:  AdaBoost Classifier
[[125   6   6]
 [ 36  87   6]
 [ 19   0 120]]
Confusion matrix for:  GradientBoosting Classifier
[[127   7   3]
 [  1 128   0]
 [  1   0 138]]


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0.9728,0.9731,0.9731,0.9725
GradientBoosting Classifier,0.9704,0.9705,0.9707,0.9701
CatBoost Classifier,0.9679,0.9681,0.9683,0.9676
XGB Classifier,0.9654,0.9663,0.966,0.965
Decision Tree,0.9333,0.933,0.9328,0.9329
ExtraTrees,0.916,0.9156,0.9158,0.9152
MLP Classifier,0.9012,0.9009,0.902,0.9009
K Nearest Neighbors,0.8642,0.8636,0.8636,0.8633
Bagging Classifier,0.8222,0.82,0.8208,0.8202
AdaBoost Classifier,0.8198,0.8463,0.8167,0.8193


## Tuning

The top four classifers are: XGBoost, CatBoost, Gradient Boosting and Random Forest.

### GridSearchCV

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
# C_vals is the alpla value of lasso and ridge regression(as 
# alpha increases the model complexity decreases,)

# remember effective alpha scores are 0<alpha<infinity 
C_vals = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]

## Choose a cross validation strategy. 
cv = StratifiedShuffleSplit(n_splits = 10, test_size = .25)

# setting param for param_grid in GridSearchCV. 
param = {'C': C_vals}

n_estimators = [140,145,150,155,160];
max_depth = range(1,10);
criterions = ['gini', 'entropy'];
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)


parameters = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'criterion': criterions
              
        }

### XGBoost Modelling

We estimate the 'max_depth', 'n_estimators', and 'learning_rate'

In [119]:
XGB_classifier = {"XGB Classifier": XGBClassifier(learning_rate=0.1,max_depth=6, n_estimators=2000)}
display(classification(XGB_classifier, X_train_under, X_test_under, y_train_under, y_test_under))


Confusion matrix for:  XGB Classifier
[[124  10   3]
 [  1 128   0]
 [  1   0 138]]


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGB Classifier,0.963,0.9635,0.9634,0.9625


In [124]:
parameters = {
              'max_depth':[3,6,9], 
              'n_estimators':[50,100,200,500], 
              'learning_rate':[0.01,0.05,0.1,0.2,0.5]
}

grid = GridSearchCV( XGBClassifier(),
                     param_grid=parameters,
                     cv=6,
                     scoring='roc_auc')

grid.fit(X_under, y_under) 

In [123]:
## Getting the best of everything. 
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

nan
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, ...)


In [88]:
xgb_grid = grid.best_estimator_
xgb_grid.score(X_under, y_under)

0.7459259259259259

In [70]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[1,3,5], 
              'n_estimators':[1,25,50,100], 
              'learning_rate':[0.01,0.05,0.1,0.2]
}


xgb = XGBClassifier()
grid_search = GridSearchCV(xgb, parameters, scoring='roc_auc', cv=5)

# train model
grid_search.fit(X_train_sm, y_train_sm)
# output the best parameters
grid_search.best_params_

{'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 1}

In [71]:
# Train the model using above parameter value
clf = XGBClassifier(max_depth=1,n_estimators=1,learning_rate=0.01)
clf.fit(X_train, y_train)

In [72]:
# 查看AUC
y_pred = clf.predict(X_test)
name = "XBGoost"
res = pd.DataFrame(columns=["Classifier", 
                                "Accuracy", 
                                "Precision", 
                                "Recall", 
                                "F1-Score"])

precision, recall, f1, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')
res = res.append({"Classifier": name, "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
                  "Precision": round(precision, 4), "Recall":round(recall, 4), "F1-Score":round(f1, 4)}, ignore_index=True)
print("Confusion matrix for: ", name)
print(confusion_matrix(y_test, y_pred))
print(res)

Confusion matrix for:  XBGoost
[[6888  190    0]
 [ 156  411    0]
 [ 143   34    0]]
  Classifier  Accuracy  Precision  Recall  F1-Score
0    XBGoost    0.9331     0.5352   0.566    0.5499


### Random Forest

In [125]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
n_estimators = [140,145,150,155,160];
max_depth = range(1,10);
criterions = ['gini', 'entropy'];
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)


parameters = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'criterion': criterions
              
        }
grid = GridSearchCV(estimator=RandomForestClassifier(max_features='auto'),
                                 param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1)
# Train
grid.fit(X_under,y_under) 
# Output the best parameters
grid.best_params_

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 140}

可以得到结果xxx

In [133]:
# Train the model using above parameter value
randomforest = RandomForestClassifier(n_estimators=140, max_depth=8,criterion='entropy',max_features='auto')

display(classification(randomforest,X_train_under,y_train_under,X_test_under,y_test_under))

AttributeError: 'RandomForestClassifier' object has no attribute 'items'

In [None]:
# 这个结果优于调参前的xxx

In [None]:
rmse=nestimators=[]
for n in [20,30,50,80,100,200,300,400,500,600,700,800]:
    regressor = RandomForestRegressor(n_estimators=n, random_state=0)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print('-------------------')
    print('n_estimators={}'.format(n))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    rmse=np.append(rmse,np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    nestimators=np.append(nestimators,n)

In [None]:
rmse

In [None]:
nestimators

In [None]:
# Creating a bar plot
sns.set_style('whitegrid')
plt.plot(nestimators,rmse,'ro',linestyle='dashed',linewidth=1,markersize=10)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features/RMSE")
plt.show()

## XGBoost

## ROC 

# Conclusion:

加入图像

优化
