In [79]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer ,SimpleImputer
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report ,accuracy_score
from sklearn.ensemble import RandomForestClassifier
import ydf
# from sklearn.ensemble import ExtraTreesRegressor

In [80]:
names = ["age","sex","on_thyroxine","query_on_thyroxine","on_antihyroid_meds","sick","pregnant","thyroid_surgery","I131_treatment","query_hypothyroid","query hyperthyroid","lithium","goitre","tumor","hypopituitary","psych","TSH_measured","TSH","T3_measured","T3","TT4_measured","TT4","T4U_measured","T4U","FTI_measured","FTI","TBG_measured","TBG","referral_source","target"]
df = pd.read_csv("data/thyroid0387.data",names=names)
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antihyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
0,29,F,f,f,f,f,f,f,f,t,...,f,?,f,?,f,?,f,?,other,-[840801013]
1,29,F,f,f,f,f,f,f,f,f,...,t,128,f,?,f,?,f,?,other,-[840801014]
2,41,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,11,other,-[840801042]
3,36,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,26,other,-[840803046]
4,32,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,36,other,S[840803047]


In [81]:
# tidy the target column
df['patient_id'] = df["target"].apply(lambda x: x.split("[")[1].strip(']'))
df['target'] = df["target"].apply(lambda x: x.split("[")[0])

In [82]:
# replacing ? with np.nan
df.replace({"?":np.nan},inplace=True)

In [83]:
# converting object to float
num_cols = ["TSH","T3","TT4","T4U","FTI","TBG"]
for i in num_cols:
    df[i] = df[i].astype(float)

In [84]:
# age cannot be 65526 
# capping age to 100 years
df = df[df["age"] <= 100]

In [85]:
# Remove reduntant columns
df.drop(['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured','TBG_measured','referral_source','patient_id',"TBG"],axis=1, inplace=True)

In [86]:
# Selecting a subset of target which can be classified as Hyper , hypo or Euthyroid (Negative) state
df = df[df['target'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'AK', 'C|I', 'H|K', 'GK', 'FK', 'GI', 'GKJ', 'D|R', '-'])]
# mapping the target column
mapping = {'-':"Negative",
           'A':'Hyperthyroid','AK':"Hyperthyroid",'B':"Hyperthyroid", 'C':"Hyperthyroid", 'C|I': 'Hyperthyroid', 'D':"Hyperthyroid", 'D|R':"Hyperthyroid",
           'E': "Hypothyroid", 'F': "Hypothyroid", 'FK': "Hypothyroid", "G": "Hypothyroid", "GK": "Hypothyroid", "GI": "Hypothyroid", 'GKJ': 'Hypothyroid', 'H|K': 'Hypothyroid',
          }
df['target'] = df['target'].map(mapping)

In [87]:
# impute some missing values of sex (total = 254 missing) using pregnancy
df["sex"] = np.where((df["sex"].isnull()) & (df["pregnant"]=="t"),'F',df["sex"])

In [88]:
# EDA 
cat_cols = df.select_dtypes(include="object").columns
cat_cols =  cat_cols.drop("target")
num_cols = df.select_dtypes(exclude="object").columns
print(cat_cols)
print(num_cols)

Index(['sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antihyroid_meds',
       'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment',
       'query_hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor',
       'hypopituitary', 'psych'],
      dtype='object')
Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI'], dtype='object')


## **Outliers are not removed as they may be important for the model**

In [89]:
df = df.replace({"t":1,"f":0})


In [90]:
# Mapping sex to 0 for female and 1 for male
df["sex"] = df["sex"].map({"F":0,"M":1})

In [91]:
df["target"] = df.target.map({ 'Negative': 0, 'Hypothyroid': 1,'Hyperthyroid': 2})

In [92]:
df.isnull().sum()

age                      0
sex                    252
on_thyroxine             0
query_on_thyroxine       0
on_antihyroid_meds       0
sick                     0
pregnant                 0
thyroid_surgery          0
I131_treatment           0
query_hypothyroid        0
query hyperthyroid       0
lithium                  0
goitre                   0
tumor                    0
hypopituitary            0
psych                    0
TSH                    724
T3                    2208
TT4                    354
T4U                    681
FTI                    674
target                   0
dtype: int64

In [93]:
# # imputer
# imputer = IterativeImputer()
# impute_df = imputer.fit_transform(df)
# impute_df = pd.DataFrame(impute_df, index=df.index, columns=df.columns)
# df.update(impute_df)

In [94]:
X = df.drop("target",axis=1)
y = df["target"]

In [95]:
## Numerical Pipeline
num_pipeline=Pipeline(
steps=[
    ('imputer',IterativeImputer(random_state=42)),
    ('scaler', StandardScaler()),
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
steps=[
    # ('imputer', SimpleImputer(strategy='most_frequent')),
    ('imputer', IterativeImputer(random_state=42)),
    # ('ordinalencoder',OneHotEncoder(drop='first',handle_unknown='ignore')),
    # ('scaler',StandardScaler(with_mean=False))
    ]

)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)
])

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((6140, 21), (1535, 21), (6140,), (1535,))

In [98]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [99]:
X_train.shape , X_test.shape

((6140, 21), (1535, 21))

In [100]:
y_train.value_counts()

target
0    5415
1     534
2     191
Name: count, dtype: int64

In [101]:
resampler = SMOTETomek(random_state=42)
X_train , y_train = resampler.fit_resample(X_train,y_train)
X_test , y_test = resampler.fit_resample(X_test , y_test)

In [102]:
y_test.value_counts()

target
2    1352
0    1348
1    1348
Name: count, dtype: int64

In [103]:
model = XGBClassifier(objective = "mulit:softmax",
                      num_class = 3,
                      early_stopping_rounds = 10,
                      eval_metric = ["merror","mlogloss"],
                      seed = 42)
model.fit(X_train , y_train,verbose=0 ,eval_set=[(X_test,y_test)])
y_pred = model.predict(X_test)

In [104]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1348
           1       0.97      0.98      0.98      1348
           2       0.99      0.98      0.99      1352

    accuracy                           0.98      4048
   macro avg       0.98      0.98      0.98      4048
weighted avg       0.98      0.98      0.98      4048



In [105]:
print(accuracy_score(y_test,y_pred))

0.9755434782608695


In [106]:
model_1 = RandomForestClassifier(random_state=42)
model_1.fit(X_train, y_train)
y_pred = model_1.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9666501976284585


## Trying ydf

In [110]:
# split dataset into training and testing
train_df , test_df = train_test_split(df,test_size=0.3,random_state=42)
len(train_df) , len(test_df)

(5372, 2303)

In [111]:
model = ydf.GradientBoostedTreesLearner(label="target").train(train_df)

Train model on 5372 examples
Model trained in 0:00:00.755030


In [113]:
model.evaluate(test_df)

Label \ Pred,0,1,2
0,2010,3,14
1,9,200,0
2,10,1,56


In [116]:
model.analyze(test_df).to_file("analysis.html")

### **Yggdrasil Decision Forests (YDF) offered the best accuracy with very minimal preprocessing and that will be used for modular coding**