In [43]:
## IMPORTS 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [44]:
df = pd.read_csv("thyroidDF.csv")
print(df.shape) # 9172 rows, 31 columns
print(df.size)
df.head()

(9172, 31)
284332


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047


In [45]:
unused_columns = ["lithium", "hypopituitary", "psych", "referral_source", "patient_id", "query_on_thyroxine"] ## lack of infomation on feature 
df.drop(columns = unused_columns, inplace=True)

In [46]:
df.isna().sum()

age                       0
sex                     307
on_thyroxine              0
on_antithyroid_meds       0
sick                      0
pregnant                  0
thyroid_surgery           0
I131_treatment            0
query_hypothyroid         0
query_hyperthyroid        0
goitre                    0
tumor                     0
TSH_measured              0
TSH                     842
T3_measured               0
T3                     2604
TT4_measured              0
TT4                     442
T4U_measured              0
T4U                     809
FTI_measured              0
FTI                     802
TBG_measured              0
TBG                    8823
target                    0
dtype: int64

In [47]:
## HANDLING NULL VALUES 
df.dropna(subset=["sex"], inplace=True) # only 307 observations do not have 
df.drop(columns=["TBG_measured", "TBG"], inplace=True) # more than 50% of observations are null
blood_work_cols = ["TSH", "T3", "TT4", "T4U", "FTI"]
for col in blood_work_cols:
    df[col] = df[col].fillna(df[col].mean())

# checking...
df.isna().sum()

age                    0
sex                    0
on_thyroxine           0
on_antithyroid_meds    0
sick                   0
pregnant               0
thyroid_surgery        0
I131_treatment         0
query_hypothyroid      0
query_hyperthyroid     0
goitre                 0
tumor                  0
TSH_measured           0
TSH                    0
T3_measured            0
T3                     0
TT4_measured           0
TT4                    0
T4U_measured           0
T4U                    0
FTI_measured           0
FTI                    0
target                 0
dtype: int64

In [48]:
df.dtypes

age                      int64
sex                     object
on_thyroxine            object
on_antithyroid_meds     object
sick                    object
pregnant                object
thyroid_surgery         object
I131_treatment          object
query_hypothyroid       object
query_hyperthyroid      object
goitre                  object
tumor                   object
TSH_measured            object
TSH                    float64
T3_measured             object
T3                     float64
TT4_measured            object
TT4                    float64
T4U_measured            object
T4U                    float64
FTI_measured            object
FTI                    float64
target                  object
dtype: object

In [49]:
## HANDING TYPES 
df["sex"] = (df["sex"] == "M").astype(int) 

cols = ["on_thyroxine", "on_antithyroid_meds", "sick", "pregnant", "thyroid_surgery", "I131_treatment", "query_hypothyroid", "query_hyperthyroid", "goitre", "tumor", "TSH_measured", "T3_measured", "TT4_measured", "T4U_measured", "FTI_measured"]
for col in cols:
    df[col] = (df[col] == "t").astype(int)

# checking...
df.dtypes

age                      int64
sex                      int64
on_thyroxine             int64
on_antithyroid_meds      int64
sick                     int64
pregnant                 int64
thyroid_surgery          int64
I131_treatment           int64
query_hypothyroid        int64
query_hyperthyroid       int64
goitre                   int64
tumor                    int64
TSH_measured             int64
TSH                    float64
T3_measured              int64
T3                     float64
TT4_measured             int64
TT4                    float64
T4U_measured             int64
T4U                    float64
FTI_measured             int64
FTI                    float64
target                  object
dtype: object

In [50]:
df.describe()

Unnamed: 0,age,sex,on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,...,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI
count,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,...,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0,8865.0
mean,74.363113,0.314946,0.137507,0.013085,0.037676,0.011619,0.014777,0.0185,0.069712,0.071968,...,0.908291,5.173817,0.715623,1.962311,0.952623,108.537556,0.912578,0.977436,0.913367,112.83876
std,1204.292368,0.464521,0.344401,0.113646,0.190423,0.107168,0.120667,0.134757,0.254676,0.25845,...,0.288631,22.790189,0.451143,0.741853,0.212457,36.45902,0.282469,0.188871,0.281312,35.605985
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.005,0.0,0.05,0.0,2.0,0.0,0.17,0.0,1.4
25%,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.56,0.0,1.7,1.0,88.0,1.0,0.87,1.0,95.0
50%,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.5,1.0,1.962311,1.0,106.0,1.0,0.977436,1.0,112.0
75%,68.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.7,1.0,2.1,1.0,124.0,1.0,1.05,1.0,126.0
max,65526.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,530.0,1.0,18.0,1.0,600.0,1.0,2.33,1.0,642.0


In [51]:
df[df["age"] > 100] ## we have four patients with ages over 100, with the lowest starting at 455, which is impossible

Unnamed: 0,age,sex,on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,...,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,target
2976,455,0,0,0,0,0,0,0,0,0,...,1.1,1,2.0,1,118.0,1,1.13,1,104.0,-
5710,65511,1,0,0,0,0,0,0,0,0,...,0.38,1,2.0,1,113.0,1,1.08,1,104.0,-
6392,65512,1,0,0,0,0,0,0,0,1,...,0.03,1,2.8,1,112.0,1,0.84,1,133.0,-
8105,65526,0,0,0,0,0,0,0,0,0,...,1.5,0,1.962311,1,132.0,1,1.02,1,129.0,-


In [52]:
df = df[df["age"] <= 100]

In [53]:
df['target'].value_counts()

target
-      6555
K       425
G       343
I       329
F       226
R       183
A       141
L       114
M       109
N       107
S        83
GK       48
AK       45
J        27
B        19
MK       16
O        14
Q        14
C|I      11
KJ       10
GI       10
H|K       8
C         6
FK        5
P         5
MI        2
LJ        1
GKJ       1
OI        1
D|R       1
D         1
E         1
Name: count, dtype: int64

In [54]:
def condition(target):
    if "|" in target:
        diagnosis = target[-1] ## diagnosis of the form "X|Y" is interpreted as "consistent with X, but more likely Y"
    else:
        diagnosis = target[0] ## multiple diagnoses 

    hyperthyroid = ['A', 'B', 'C', 'D']
    hypothyroid = ['E', 'F', 'G', 'H']

    if diagnosis in hyperthyroid:
        return "Hyperthyroidism"
    elif diagnosis in hypothyroid:
        return "Hypothyroidism"
    else:
        return "Normal"

df["target"] = df["target"].apply(condition)
df["target"].value_counts()

target
Normal             8015
Hypothyroidism      634
Hyperthyroidism     212
Name: count, dtype: int64

In [55]:
unused_features = ["TSH_measured", "T3_measured", "TT4_measured", "T4U_measured", "FTI_measured"]
df.drop(columns = unused_features, inplace = True)

x = df.drop(columns = "target")
y = df["target"]

## SPLITTING THE DATASET INTO TRAINING AND TESTING DATA
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=47)

In [56]:
df.dtypes

age                      int64
sex                      int64
on_thyroxine             int64
on_antithyroid_meds      int64
sick                     int64
pregnant                 int64
thyroid_surgery          int64
I131_treatment           int64
query_hypothyroid        int64
query_hyperthyroid       int64
goitre                   int64
tumor                    int64
TSH                    float64
T3                     float64
TT4                    float64
T4U                    float64
FTI                    float64
target                  object
dtype: object

In [57]:
## STANDARDIZING THE DATASET
scalar = StandardScaler()
numeric_features = ["TSH", "T3", "TT4", "T4U", "FTI"] ## only standardizing numeric features 
x_train_scaled = x_train.copy()
x_train_scaled[numeric_features] = scalar.fit_transform(x_train[numeric_features])

x_test_scaled = x_test.copy()
x_test_scaled[numeric_features] = scalar.transform(x_test[numeric_features])


In [58]:
from sklearn.linear_model import LogisticRegression

## MULTI CLASS LOGISTICAL REGRESSION
model = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=200)
model.fit(x_train_scaled, y_train)

y_pred_lr = model.predict(x_test_scaled)

accuracy= accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))




Accuracy: 0.94

Classification Report:
                  precision    recall  f1-score   support

Hyperthyroidism       0.78      0.42      0.55        43
 Hypothyroidism       0.84      0.50      0.63       115
         Normal       0.95      0.99      0.97      1615

       accuracy                           0.94      1773
      macro avg       0.86      0.64      0.72      1773
   weighted avg       0.94      0.94      0.94      1773



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
from sklearn.tree import DecisionTreeClassifier

## DECISION TREE
dt_model = DecisionTreeClassifier(random_state=47)
dt_model.fit(x_train_scaled, y_train)

y_pred_dt = dt_model.predict(x_test_scaled)

accuracy= accuracy_score(y_test, y_pred_dt)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))



Accuracy: 0.98

Classification Report:
                  precision    recall  f1-score   support

Hyperthyroidism       0.67      0.65      0.66        43
 Hypothyroidism       0.96      0.97      0.97       115
         Normal       0.99      0.99      0.99      1615

       accuracy                           0.98      1773
      macro avg       0.87      0.87      0.87      1773
   weighted avg       0.98      0.98      0.98      1773



In [60]:
from sklearn.ensemble import RandomForestClassifier

## RANDOM FOREST
rf_model = RandomForestClassifier(max_depth=8, random_state=47)
rf_model.fit(x_train_scaled, y_train)

y_pred_rf = rf_model.predict(x_test_scaled)

accuracy= accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))



Accuracy: 0.98

Classification Report:
                  precision    recall  f1-score   support

Hyperthyroidism       0.83      0.44      0.58        43
 Hypothyroidism       0.93      0.99      0.96       115
         Normal       0.98      0.99      0.99      1615

       accuracy                           0.98      1773
      macro avg       0.91      0.81      0.84      1773
   weighted avg       0.98      0.98      0.98      1773



In [61]:
from sklearn.svm import SVC

## SVM MODEL
svm_model = SVC(kernel='linear', decision_function_shape='ovr', random_state=42)
svm_model.fit(x_train_scaled, y_train)

y_pred_svm = svm_model.predict(x_test_scaled)

accuracy= accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))




Accuracy: 0.95

Classification Report:
                  precision    recall  f1-score   support

Hyperthyroidism       0.81      0.40      0.53        43
 Hypothyroidism       0.85      0.56      0.67       115
         Normal       0.95      0.99      0.97      1615

       accuracy                           0.95      1773
      macro avg       0.87      0.65      0.73      1773
   weighted avg       0.94      0.95      0.94      1773



In [None]:
from sklearn.neural_network import MLPClassifier
import pickle

## NEURAL NETWORK
nn_model = MLPClassifier(random_state=47, max_iter=500)
nn_model.fit(x_train, y_train)

y_pred_nn = nn_model.predict(x_test)

accuracy= accuracy_score(y_test, y_pred_nn)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_nn))

'''
## SAVING MODEL AS PICKLE FILE
filename = 'model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(nn_model, file)
'''



Accuracy: 0.97

Classification Report:
                  precision    recall  f1-score   support

Hyperthyroidism       0.86      0.42      0.56        43
 Hypothyroidism       0.87      0.84      0.85       115
         Normal       0.97      0.99      0.98      1615

       accuracy                           0.97      1773
      macro avg       0.90      0.75      0.80      1773
   weighted avg       0.96      0.97      0.96      1773



In [63]:
print(x_test)
print(y_test)

      age  sex  on_thyroxine  on_antithyroid_meds  sick  pregnant  \
1355   28    0             0                    0     0         0   
5127   68    0             1                    0     0         0   
7771   51    0             0                    0     0         0   
3696   72    0             1                    0     0         0   
9007   73    0             0                    0     0         0   
...   ...  ...           ...                  ...   ...       ...   
6259   58    1             1                    0     0         0   
9145   80    1             0                    0     0         0   
9078   67    1             0                    0     0         0   
4311   71    0             1                    0     0         0   
8057   59    0             1                    0     0         0   

      thyroid_surgery  I131_treatment  query_hypothyroid  query_hyperthyroid  \
1355                0               0                  0                   0   
5127       