In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from catboost import CatBoostClassifier
from  xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [2]:
dir_path_post = '../cleaned_processed_data'
df = pd.read_csv(os.path.join(dir_path_post,'stage02_preprocessed.csv'))
df.head(5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,Class
0,46.0,0.0,0,0,0,0,0,0,0.0,0,...,0.98,0,2.022785,1,109.0,1,0.91,1,120.0,0
1,70.0,1.0,1,0,0,0,0,0,0.0,0,...,0.16,1,1.9,1,175.0,0,0.983264,0,-17.987976,0
2,70.0,1.0,0,0,0,0,0,0,0.0,0,...,0.72,1,1.2,1,61.0,1,0.87,1,70.0,0
3,18.0,1.0,1,0,0,0,0,0,0.0,0,...,0.03,0,2.022743,1,183.0,1,1.3,1,141.0,0
4,59.0,1.0,0,0,0,0,0,0,0.0,0,...,0.72155,0,2.022729,1,72.0,1,0.92,1,78.0,0


- features which are highly correlated with Class label... 

In [3]:
def get_correlation(df):
    corr_values = abs(df.corr(method='spearman')['Class'])
    corr_values = corr_values.drop('Class')
    corr_values = corr_values[corr_values > 0.03]
    display(corr_values)
    return corr_values


In [4]:
corr_values = get_correlation(df)

age                   0.049430
sex                   0.036009
on_thyroxine          0.079947
query_hypothyroid     0.050175
query_hyperthyroid    0.032031
psych                 0.041757
TSH_measured          0.062682
TSH                   0.223612
T3_measured           0.041204
T3                    0.142235
TT4_measured          0.053423
TT4                   0.093563
T4U_measured          0.034699
FTI_measured          0.034857
FTI                   0.087105
Name: Class, dtype: float64

### Train Test Split

In [5]:
def traintest_data(df, features, label):
    X = df[features]
    Y = df[label]
    X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.30, random_state=42) 
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = traintest_data(df, corr_values.index, 'Class')

In [6]:
# Classifiers we are going to use
classifiers = {
    "XGBClassifier" : XGBClassifier(learning_rate=0.01),
    "Nearest Neighbors" : KNeighborsClassifier(4),
    "Decision Tree" : DecisionTreeClassifier(class_weight = 'balanced'),
    "Random Forest": RandomForestClassifier(class_weight = 'balanced',random_state = 1),
    "ExtraTrees": ExtraTreesClassifier(class_weight = 'balanced',random_state = 1),
    "CatBoostClassifier" : CatBoostClassifier(max_depth=4,verbose=0),
}

In [7]:
def train_getscore(classifiers,X_train, X_test, y_train, y_test ):
    score_board = pd.DataFrame(columns = ["Classifier", 
                                "Accuracy", 
                                "Precision", 
                                "Recall", 
                                "FScore"])
    for name, clf in classifiers.items():
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        
        precision_score,recall_score,f_score,support = metrics.precision_recall_fscore_support(y_test,y_pred,average='macro')
        score_board=score_board.append({ "Classifier": name,
                           "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
                           "Precision": round(precision_score, 4), 
                           "Recall":round(recall_score, 4), 
                           "FScore":round(f_score, 4)
                           }, ignore_index=True)
        print("Confusion matrix for: ", name)
        display(confusion_matrix(y_test, y_pred))
    score_board.set_index("FScore", inplace=True)
    score_board.sort_values(by="FScore", ascending=False, inplace=True)   
    return score_board
        
display(train_getscore(classifiers, X_train, X_test, y_train, y_test))        



Confusion matrix for:  XGBClassifier


array([[7254,  115,   33,    1],
       [  13,  403,    2,    0],
       [  41,    1,   98,    0],
       [   8,    0,    0,   84]], dtype=int64)

Confusion matrix for:  Nearest Neighbors


array([[7321,   66,   14,    2],
       [ 346,   68,    4,    0],
       [  86,   10,   44,    0],
       [  92,    0,    0,    0]], dtype=int64)

Confusion matrix for:  Decision Tree


array([[7253,   98,   51,    1],
       [  77,  341,    0,    0],
       [  36,    6,   98,    0],
       [   0,    0,    0,   92]], dtype=int64)

Confusion matrix for:  Random Forest


array([[7271,   92,   40,    0],
       [  66,  351,    1,    0],
       [  48,    0,   92,    0],
       [   7,    0,    0,   85]], dtype=int64)

Confusion matrix for:  ExtraTrees


array([[7236,   99,   47,   21],
       [ 164,  253,    1,    0],
       [  53,    0,   87,    0],
       [  54,    0,    0,   38]], dtype=int64)

Confusion matrix for:  CatBoostClassifier


array([[7284,   87,   32,    0],
       [  40,  377,    1,    0],
       [  51,    0,   89,    0],
       [   9,    0,    0,   83]], dtype=int64)

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.8782,XGBClassifier,0.9734,0.8733,0.8893
0.867,CatBoostClassifier,0.9727,0.8821,0.8559
0.8613,Decision Tree,0.9666,0.8495,0.8739
0.8582,Random Forest,0.9685,0.8669,0.8507
0.691,ExtraTrees,0.9455,0.7428,0.6543
0.4095,Nearest Neighbors,0.923,0.5288,0.3665


In [8]:
# it seems like data is highly imbalanced . if its true then we cannot only rely on 
display(df.shape)
df['Class'].value_counts()

(26841, 27)

0    24698
1     1401
2      450
3      292
Name: Class, dtype: int64

In [9]:
def fdiscretizer(attribute,dataframe):
    enc = LabelEncoder()
    dataframe[attribute] = pd.qcut(dataframe[attribute], 20, duplicates='drop')
    dataframe[attribute] = enc.fit_transform(dataframe[attribute])
    dataframe = dataframe.convert_dtypes(convert_integer=True)

data2 = df.copy()
fdiscretizer('age',data2)
fdiscretizer('TSH',data2)
fdiscretizer('T3',data2)
fdiscretizer('TT4',data2)
fdiscretizer('T4U',data2)
fdiscretizer('FTI',data2)

In [10]:
corr_values = get_correlation(data2)
X_train2, X_test2, y_train2, y_test2 = traintest_data(data2, corr_values.index,'Class')

display(train_getscore(classifiers,X_train2, X_test2, y_train2, y_test2))

age                   0.050317
sex                   0.036009
on_thyroxine          0.079947
query_hypothyroid     0.050175
query_hyperthyroid    0.032031
psych                 0.041757
TSH_measured          0.062682
TSH                   0.222632
T3_measured           0.041204
T3                    0.142219
TT4_measured          0.053423
TT4                   0.093105
T4U_measured          0.034699
FTI_measured          0.034857
FTI                   0.085890
Name: Class, dtype: float64



Confusion matrix for:  XGBClassifier


array([[7253,  117,   32,    1],
       [  46,  364,    8,    0],
       [  46,   23,   71,    0],
       [   8,    0,    0,   84]], dtype=int64)

Confusion matrix for:  Nearest Neighbors


array([[7305,   71,   20,    7],
       [ 275,  135,    8,    0],
       [  70,   21,   49,    0],
       [  88,    0,    2,    2]], dtype=int64)

Confusion matrix for:  Decision Tree


array([[7212,  124,   66,    1],
       [  93,  314,   11,    0],
       [  45,   17,   78,    0],
       [   0,    0,    0,   92]], dtype=int64)

Confusion matrix for:  Random Forest


array([[7264,   94,   44,    1],
       [  93,  315,   10,    0],
       [  47,    9,   84,    0],
       [  11,    0,    0,   81]], dtype=int64)

Confusion matrix for:  ExtraTrees


array([[7216,  107,   49,   31],
       [ 148,  257,   13,    0],
       [  49,   10,   81,    0],
       [  66,    0,    0,   26]], dtype=int64)

Confusion matrix for:  CatBoostClassifier


array([[7274,   88,   41,    0],
       [  61,  349,    8,    0],
       [  38,   11,   91,    0],
       [   5,    0,    0,   87]], dtype=int64)

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.8531,CatBoostClassifier,0.9687,0.8537,0.8533
0.8219,XGBClassifier,0.9651,0.8341,0.8177
0.8173,Random Forest,0.9616,0.8324,0.8038
0.8051,Decision Tree,0.9557,0.791,0.8206
0.635,ExtraTrees,0.9413,0.6686,0.6127
0.4677,Nearest Neighbors,0.9302,0.5953,0.4204


### F1 score has dropped 
###  Might be discretization is not a valid option
###  will check Standardisation
### T3 feature is now  seems to be highly correlted with Class

In [11]:
scaler = MinMaxScaler()
data3=pd.DataFrame(scaler.fit_transform(df.drop(columns =['Class'])),columns=data2.columns[:-1])
data3['Class'] = df['Class']
# data3 = ((df-df.min())/(df.max()-df.min()))*20

corr_values = get_correlation(data3)
X_train3, X_test3, y_train3, y_test3 = traintest_data(data3, corr_values.index,'Class')

display(train_getscore(classifiers,X_train3, X_test3, y_train3, y_test3))

age                   0.049419
sex                   0.036009
on_thyroxine          0.079947
query_hypothyroid     0.050175
query_hyperthyroid    0.032031
psych                 0.041757
TSH_measured          0.062682
TSH                   0.223577
T3_measured           0.041204
T3                    0.142235
TT4_measured          0.053423
TT4                   0.093558
T4U_measured          0.034699
FTI_measured          0.034857
FTI                   0.087102
Name: Class, dtype: float64



Confusion matrix for:  XGBClassifier


array([[7254,  115,   33,    1],
       [  14,  402,    2,    0],
       [  41,    1,   98,    0],
       [   8,    0,    0,   84]], dtype=int64)

Confusion matrix for:  Nearest Neighbors


array([[7335,   41,   17,   10],
       [ 337,   78,    3,    0],
       [  96,    0,   44,    0],
       [  91,    0,    0,    1]], dtype=int64)

Confusion matrix for:  Decision Tree


array([[7254,   95,   53,    1],
       [  81,  337,    0,    0],
       [  37,    6,   97,    0],
       [   0,    0,    0,   92]], dtype=int64)

Confusion matrix for:  Random Forest


array([[7269,   91,   43,    0],
       [  65,  352,    1,    0],
       [  47,    0,   93,    0],
       [   7,    0,    0,   85]], dtype=int64)

Confusion matrix for:  ExtraTrees


array([[7239,  101,   44,   19],
       [ 179,  238,    1,    0],
       [  52,    0,   88,    0],
       [  59,    0,    0,   33]], dtype=int64)

Confusion matrix for:  CatBoostClassifier


array([[7283,   88,   32,    0],
       [  37,  380,    1,    0],
       [  52,    0,   88,    0],
       [   8,    0,    0,   84]], dtype=int64)

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.8779,XGBClassifier,0.9733,0.8731,0.8887
0.868,CatBoostClassifier,0.9729,0.8815,0.8586
0.8582,Decision Tree,0.9661,0.8473,0.8697
0.8581,Random Forest,0.9685,0.8643,0.8531
0.6754,ExtraTrees,0.9435,0.74,0.6336
0.4256,Nearest Neighbors,0.9261,0.5918,0.3756


### f1 score has risen from last 
### still the data is imbalanced need to balance the data

In [12]:
smote = SMOTE('not majority',random_state = 1)
X_train_sm, y_train_sm = smote.fit_resample(X_train3,y_train3)
X_test_sm, y_test_sm = smote.fit_resample(X_test3,y_test3)
display(X_train3.shape)
display(X_train_sm.shape)
display(train_getscore(classifiers,X_train_sm, X_test_sm, y_train_sm, y_test_sm))



(18788, 15)

(69180, 15)



Confusion matrix for:  XGBClassifier


array([[6990,  146,  266,    1],
       [  90, 7273,   40,    0],
       [ 181,   76, 7146,    0],
       [   0,    0,    0, 7403]], dtype=int64)

Confusion matrix for:  Nearest Neighbors


array([[6726,  413,  150,  114],
       [1826, 5325,  197,   55],
       [1211,  154, 5984,   54],
       [1509,   63,   75, 5756]], dtype=int64)

Confusion matrix for:  Decision Tree


array([[7229,   95,   78,    1],
       [ 677, 6668,   58,    0],
       [1218,   52, 6133,    0],
       [   0,    0,    0, 7403]], dtype=int64)

Confusion matrix for:  Random Forest


array([[7240,   99,   64,    0],
       [ 295, 7079,   29,    0],
       [ 627,   57, 6719,    0],
       [  73,    0,    0, 7330]], dtype=int64)

Confusion matrix for:  ExtraTrees


array([[7187,  123,   64,   29],
       [1000, 6356,   47,    0],
       [1090,   62, 6251,    0],
       [1017,    2,    0, 6384]], dtype=int64)

Confusion matrix for:  CatBoostClassifier


array([[7178,  118,  105,    2],
       [  80, 7273,   50,    0],
       [ 249,   70, 7084,    0],
       [   0,    0,    0, 7403]], dtype=int64)

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.9772,CatBoostClassifier,0.9772,0.9773,0.9772
0.9729,XGBClassifier,0.973,0.973,0.973
0.9584,Random Forest,0.958,0.961,0.958
0.9275,Decision Tree,0.9264,0.9372,0.9264
0.889,ExtraTrees,0.884,0.9119,0.884
0.8111,Nearest Neighbors,0.8034,0.8469,0.8034


In [13]:

## Under sampling method
df_negative = data3[data3.Class==0]
df_hypothyroid = data3[data3.Class==1]
df_hyperthyroid = data3[data3.Class==2]
df_sickeuthyroid = data3[data3.Class==3]

df_negative_downsampled = resample(df_negative,replace=False,n_samples=450,random_state=1)
df_hypothyroid_downsampled = resample(df_hypothyroid,replace=False,n_samples=450,random_state=1)
df_hyperthyroid_downsampled = resample(df_hyperthyroid,replace=False,n_samples=450,random_state=1)
#df_sickeuthyroid_downsampled = resample(df_sickeuthyroid,replace=False,n_samples=450,random_state=1)



df_downsampled = pd.concat([df_negative_downsampled,df_hypothyroid_downsampled,df_hyperthyroid,df_sickeuthyroid])
df_downsampled.Class.value_counts()

0    450
1    450
2    450
3    292
Name: Class, dtype: int64

In [14]:
X_train4, X_test4, y_train4, y_test4 = traintest_data(df_downsampled,corr_values.index,'Class')
display(train_getscore(classifiers,X_train4, X_test4, y_train4, y_test4))





Confusion matrix for:  XGBClassifier


array([[123,   4,   7,   1],
       [  2, 132,   2,   0],
       [  0,   1, 140,   0],
       [  0,   0,   0,  81]], dtype=int64)

Confusion matrix for:  Nearest Neighbors


array([[ 92,  24,   8,  11],
       [ 48,  81,   5,   2],
       [ 12,  13, 116,   0],
       [  1,   4,   3,  73]], dtype=int64)

Confusion matrix for:  Decision Tree


array([[121,   4,   9,   1],
       [  4, 128,   4,   0],
       [  3,   2, 136,   0],
       [  0,   0,   0,  81]], dtype=int64)

Confusion matrix for:  Random Forest


array([[126,   4,   5,   0],
       [  3, 131,   2,   0],
       [  0,   0, 141,   0],
       [  0,   0,   0,  81]], dtype=int64)

Confusion matrix for:  ExtraTrees


array([[111,  12,   8,   4],
       [ 11, 124,   1,   0],
       [  3,   3, 135,   0],
       [  0,   0,   0,  81]], dtype=int64)

Confusion matrix for:  CatBoostClassifier


array([[124,   4,   5,   2],
       [  3, 131,   2,   0],
       [  1,   0, 140,   0],
       [  0,   0,   0,  81]], dtype=int64)

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.9743,Random Forest,0.9716,0.975,0.9741
0.9681,XGBClassifier,0.9655,0.9687,0.9687
0.9674,CatBoostClassifier,0.9655,0.9669,0.9687
0.95,Decision Tree,0.9452,0.9503,0.9505
0.9197,ExtraTrees,0.9148,0.9176,0.9229
0.7477,Nearest Neighbors,0.7343,0.7482,0.7502


In [15]:
# final for generating pickle file

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import gzip, pickle, pickletools

In [17]:
dir_path_post = '../cleaned_processed_data'
df = pd.read_csv(os.path.join(dir_path_post,'stage02_preprocessed.csv'))
df.head(5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,Class
0,46.0,0.0,0,0,0,0,0,0,0.0,0,...,0.98,0,2.022785,1,109.0,1,0.91,1,120.0,0
1,70.0,1.0,1,0,0,0,0,0,0.0,0,...,0.16,1,1.9,1,175.0,0,0.983264,0,-17.987976,0
2,70.0,1.0,0,0,0,0,0,0,0.0,0,...,0.72,1,1.2,1,61.0,1,0.87,1,70.0,0
3,18.0,1.0,1,0,0,0,0,0,0.0,0,...,0.03,0,2.022743,1,183.0,1,1.3,1,141.0,0
4,59.0,1.0,0,0,0,0,0,0,0.0,0,...,0.72155,0,2.022729,1,72.0,1,0.92,1,78.0,0


In [18]:


def get_correlation(df):
    corr_values = abs(df.corr(method='spearman')['Class'])
    corr_values = corr_values.drop('Class')
    corr_values = corr_values[corr_values > 0.03]
    display(corr_values)
    return corr_values

def traintest_data(df, features, label):
    X = df[features]
    Y = df[label]
    X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.30, random_state=42) 
    return X_train, X_test, y_train, y_test


def scaled_df(df):
    scaler = MinMaxScaler()
    
    data=pd.DataFrame(scaler.fit_transform(df.drop(columns =['Class'])),columns=df.columns[:-1])
    pickle.dump(scaler, open('../model_scaler_picklefile/thyroid_scaler.pickle','wb'))
    data['Class'] = df['Class']
    return data

def smote_resampled(X_train, X_test, y_train, y_test):
    smote = SMOTE('not majority',random_state = 1)
    X_train_sm, y_train_sm = smote.fit_resample(X_train,y_train)
    X_test_sm, y_test_sm = smote.fit_resample(X_test,y_test)
    return X_train_sm,X_test_sm,y_train_sm,y_test_sm


def under_sampled(df):
    df_negative = df[df.Class==0]
    df_hypothyroid = df[df.Class==1]
    df_hyperthyroid = df[df.Class==2]
    df_sickeuthyroid = df[df.Class==3]

    df_negative_downsampled = resample(df_negative,replace=False,n_samples=450,random_state=1)
    df_hypothyroid_downsampled = resample(df_hypothyroid,replace=False,n_samples=450,random_state=1)
    df_hyperthyroid_downsampled = resample(df_hyperthyroid,replace=False,n_samples=450,random_state=1)
    
    df_downsampled = pd.concat([df_negative_downsampled,df_hypothyroid_downsampled,df_hyperthyroid,df_sickeuthyroid])
    return df_downsampled

In [19]:
data = df.copy()

#### List of selected features
selected_features = ['age', 'sex', 'on_thyroxine', 'query_hypothyroid', 'query_hyperthyroid',
       'psych', 'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured',
       'TT4', 'T4U_measured', 'FTI_measured', 'FTI']
all_data = ['age', 'sex', 'on_thyroxine', 'query_hypothyroid', 'query_hyperthyroid',
       'psych', 'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured',
       'TT4', 'T4U_measured', 'FTI_measured', 'FTI','Class']
data = scaled_df(df[all_data])
df_downsampled = under_sampled(data[all_data])
X_train_final, X_test_final, y_train_final, y_test_final = traintest_data(df_downsampled,selected_features,'Class')
rfc = RandomForestClassifier(class_weight = 'balanced',random_state = 10)
rfc.fit(X_train_final,y_train_final)
y_pred = rfc.predict(X_test_final)

precision_score,recall_score,f_score,support = metrics.precision_recall_fscore_support(y_test_final,y_pred,average='macro')
precision_score,recall_score,f_score

(0.9749543147798961, 0.9741421568627451, 0.974278417288734)

# model pickle

In [20]:
filepath = os.path.join("../model_scaler_picklefile","thyroid_model.pkl")
with gzip.open(filepath, "wb") as f:
    pickled = pickle.dumps(rfc)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)

In [21]:
#### List of selected features
selected_features = ['age', 'sex', 'on_thyroxine', 'query_hypothyroid', 'query_hyperthyroid',
       'psych', 'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured',
       'TT4', 'T4U_measured', 'FTI_measured', 'FTI']

In [22]:
df[selected_features].iloc[1,:].values

array([ 7.00000000e+01,  1.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  1.60000000e-01,
        1.00000000e+00,  1.90000000e+00,  1.00000000e+00,  1.75000000e+02,
        0.00000000e+00,  0.00000000e+00, -1.79879763e+01])