#### Importing libraries

In [247]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import pickle

#### Reading Data

In [248]:
# Load the raw dataset; na_values makes pandas parse every "?" cell as NaN.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# cannot run on another machine without editing it.
DATASET_PATH='C:/Users/User/Documents/Medical_Diagnosis_System/project/Datasets/hypothyroid.csv'
data=pd.read_csv(
    DATASET_PATH,
    na_values=["?"],
)

In [249]:
# preview the first five rows of the freshly loaded frame
data.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,...,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41.0,F,f,f,f,f,...,t,109.0,f,,SVHC,P
1,23.0,F,f,f,f,f,...,f,,f,,other,P
2,46.0,M,f,f,f,f,...,t,120.0,f,,other,P
3,70.0,F,t,f,f,f,...,f,,f,,other,P
4,70.0,F,f,f,f,f,...,t,70.0,f,,SVI,P


#### Data Analysis

In [250]:
# (number of rows, number of columns) of the raw frame
data.shape

(3772, 30)

In [251]:
# per-column non-null counts and dtypes; shows which columns contain missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        3771 non-null   float64
 1   sex                        3622 non-null   object 
 2   on thyroxine               3772 non-null   object 
 3   query on thyroxine         3772 non-null   object 
 4   on antithyroid medication  3772 non-null   object 
 5   sick                       3772 non-null   object 
 6   pregnant                   3772 non-null   object 
 7   thyroid surgery            3772 non-null   object 
 8   I131 treatment             3772 non-null   object 
 9   query hypothyroid          3772 non-null   object 
 10  query hyperthyroid         3772 non-null   object 
 11  lithium                    3772 non-null   object 
 12  goitre                     3772 non-null   object 
 13  tumor                      3772 non-null   objec

In [252]:
# Summary statistics for the numeric columns.
# NOTE(review): the output reports an age maximum of 455 and a TBG count of 0.
# The age value looks like a data-entry error and no visible cell corrects it - confirm.
data.describe()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG
count,3771.0,3403.0,3003.0,3541.0,3385.0,3387.0,0.0
mean,51.735879,5.086766,2.0135,108.319345,0.995,110.469649,
std,20.084958,24.52147,0.827434,35.604248,0.195457,33.089698,
min,1.0,0.005,0.05,2.0,0.25,2.0,
25%,36.0,0.5,1.6,88.0,0.88,93.0,
50%,54.0,1.4,2.0,103.0,0.98,107.0,
75%,67.0,2.7,2.4,124.0,1.08,124.0,
max,455.0,530.0,10.6,430.0,2.32,395.0,


In [253]:
# class balance of the target column before encoding
data["binaryClass"].value_counts()

binaryClass
P    3481
N     291
Name: count, dtype: int64

In [254]:
# TBG has no recorded values at all, so the column is removed
data=data.drop(columns=["TBG"])
data

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,...,T4U,FTI measured,FTI,TBG measured,referral source,binaryClass
0,41.0,F,f,f,f,f,...,1.14,t,109.0,f,SVHC,P
1,23.0,F,f,f,f,f,...,,f,,f,other,P
2,46.0,M,f,f,f,f,...,0.91,t,120.0,f,other,P
3,70.0,F,t,f,f,f,...,,f,,f,other,P
4,70.0,F,f,f,f,f,...,0.87,t,70.0,f,SVI,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,F,f,f,f,f,...,,f,,f,other,P
3768,68.0,F,f,f,f,f,...,1.08,t,114.0,f,SVI,P
3769,74.0,F,f,f,f,f,...,1.07,t,105.0,f,other,P
3770,72.0,M,f,f,f,f,...,0.94,t,87.0,f,SVI,P


In [255]:
# count the missing entries in each column
data.isna().sum()

age                            1
sex                          150
on thyroxine                   0
query on thyroxine             0
on antithyroid medication      0
sick                           0
pregnant                       0
thyroid surgery                0
I131 treatment                 0
query hypothyroid              0
query hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH measured                   0
TSH                          369
T3 measured                    0
T3                           769
TT4 measured                   0
TT4                          231
T4U measured                   0
T4U                          387
FTI measured                   0
FTI                          385
TBG measured                   0
referral source                0
binaryClass                    0
dtype: int64

#### Data Preprocessing

In [256]:
# Encode the target as integers: "P" -> 0 and "N" -> 1.
data["binaryClass"]=data["binaryClass"].map({"P":0,"N":1})

# Encode the remaining categorical codes: "t"/"f" -> 1/0 and "M"/"F" -> 0/1.
# A frame-wide DataFrame.replace() relied on pandas silently downcasting the
# object columns it touched, which is deprecated and raises a FutureWarning.
# Mapping the non-numeric columns value by value produces the same integer /
# float columns without depending on that behaviour. Values missing from the
# table (NaN, the "referral source" labels) pass through unchanged.
value_codes={"t":1,"f":0,"M":0,"F":1}
text_columns=data.select_dtypes(exclude="number").columns
data[text_columns]=data[text_columns].apply(
    lambda column: column.map(lambda value: value_codes.get(value,value))
)
data

  data=data.replace({"t":1,"f":0,"M":0,"F":1})


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,...,T4U,FTI measured,FTI,TBG measured,referral source,binaryClass
0,41.0,1.0,0,0,0,0,...,1.14,1,109.0,0,SVHC,0
1,23.0,1.0,0,0,0,0,...,,0,,0,other,0
2,46.0,0.0,0,0,0,0,...,0.91,1,120.0,0,other,0
3,70.0,1.0,1,0,0,0,...,,0,,0,other,0
4,70.0,1.0,0,0,0,0,...,0.87,1,70.0,0,SVI,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,1.0,0,0,0,0,...,,0,,0,other,0
3768,68.0,1.0,0,0,0,0,...,1.08,1,114.0,0,SVI,0
3769,74.0,1.0,0,0,0,0,...,1.07,1,105.0,0,other,0
3770,72.0,0.0,0,0,0,0,...,0.94,1,87.0,0,SVI,0


In [257]:
# class balance of the target column after encoding (labels are now 0 and 1)
data["binaryClass"].value_counts()

binaryClass
0    3481
1     291
Name: count, dtype: int64

In [258]:
# remove the "referral source" column, which is not wanted as a feature
data=data.drop(columns=["referral source"])
data

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,...,T4U measured,T4U,FTI measured,FTI,TBG measured,binaryClass
0,41.0,1.0,0,0,0,0,...,1,1.14,1,109.0,0,0
1,23.0,1.0,0,0,0,0,...,0,,0,,0,0
2,46.0,0.0,0,0,0,0,...,1,0.91,1,120.0,0,0
3,70.0,1.0,1,0,0,0,...,0,,0,,0,0
4,70.0,1.0,0,0,0,0,...,1,0.87,1,70.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,1.0,0,0,0,0,...,0,,0,,0,0
3768,68.0,1.0,0,0,0,0,...,1,1.08,1,114.0,0,0
3769,74.0,1.0,0,0,0,0,...,1,1.07,1,105.0,0,0
3770,72.0,0.0,0,0,0,0,...,1,0.94,1,87.0,0,0


In [259]:
# inspect the column dtypes after encoding
data.dtypes

age                          float64
sex                          float64
on thyroxine                   int64
query on thyroxine             int64
on antithyroid medication      int64
sick                           int64
pregnant                       int64
thyroid surgery                int64
I131 treatment                 int64
query hypothyroid              int64
query hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH measured                   int64
TSH                          float64
T3 measured                    int64
T3                           float64
TT4 measured                   int64
TT4                          float64
T4U measured                   int64
T4U                          float64
FTI measured                   int64
FTI                          float64
TBG measured                   int64
b

In [260]:
# Fill the missing values with each column's mean in a single SimpleImputer
# pass. The imputer computes one mean per column, so the result matches
# imputing the columns one at a time.
# NOTE(review): the means are computed on the full dataset before the
# train/test split, so test rows influence the fill values. "sex" is a 0/1
# code, so its mean fill is a fraction rather than a valid category.
columns_to_impute=["age","sex","TSH","T3","TT4","T4U","FTI"]
imputer=SimpleImputer(strategy='mean')
data[columns_to_impute]=imputer.fit_transform(data[columns_to_impute])

In [261]:
# confirm that imputation left no missing values behind
data.isna().sum()

age                          0
sex                          0
on thyroxine                 0
query on thyroxine           0
on antithyroid medication    0
sick                         0
pregnant                     0
thyroid surgery              0
I131 treatment               0
query hypothyroid            0
query hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH measured                 0
TSH                          0
T3 measured                  0
T3                           0
TT4 measured                 0
TT4                          0
T4U measured                 0
T4U                          0
FTI measured                 0
FTI                          0
TBG measured                 0
binaryClass                  0
dtype: int64

#### Splitting features and target variable

In [262]:
# target vector first, then every other column as the feature matrix
Y=data["binaryClass"]
X=data.drop(columns=["binaryClass"])

#### Splitting training and testing data

In [274]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (3481 vs 291 rows).
X_train,X_test,Y_train,Y_test=train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)

In [275]:
# Features left out of the model. The list is defined once and applied to both
# splits so the train and test matrices always end up with the same columns.
dropped_features=[
    'FTI','FTI measured','T4U measured','TT4 measured',
    'query on thyroxine','on antithyroid medication','sick','pregnant',
    'thyroid surgery','I131 treatment','query hypothyroid','query hyperthyroid',
    'lithium','goitre','tumor','hypopituitary','psych',
    'TSH measured','T4U','TBG measured',
]
X_train=X_train.drop(columns=dropped_features)
X_test=X_test.drop(columns=dropped_features)

#### Model creation

In [276]:
# logistic regression classifier; max_iter bounds the number of solver iterations
tmodel=LogisticRegression(max_iter=712)
tmodel.fit(X=X_train,y=Y_train)

In [277]:
# Predict on both splits first, then score and report them.
# NOTE(review): accuracy is hard to interpret on these imbalanced classes;
# always predicting the majority class would already score about 92%
# (3481 of 3772 rows).
Train_data_predict=tmodel.predict(X_train)
Test_data_predict=tmodel.predict(X_test)

Train_data_accuracy=accuracy_score(y_true=Y_train,y_pred=Train_data_predict)
Test_data_accuracy=accuracy_score(y_true=Y_test,y_pred=Test_data_predict)

print("Train data accuracy:",Train_data_accuracy)
print("Test data accuracy:",Test_data_accuracy)

Train data accuracy: 0.957905203844879
Test data accuracy: 0.9668874172185431


#### Predictive System

In [267]:
# Feature values for one patient, in the order of the training columns:
# age, sex, on thyroxine, TSH, T3 measured, T3, TT4.
# Named patient_values rather than `input` so the built-in input() is not shadowed.
patient_values=(44,0,0,45,1,1.4,39)
# Wrap the values in a one-row DataFrame that carries the training column
# names. The model was fitted on a DataFrame, and predicting on a bare NumPy
# array makes scikit-learn warn that the input has no valid feature names.
# The constructor also fails loudly if the number of values does not match
# the number of training columns.
patient_frame=pd.DataFrame([patient_values],columns=X_train.columns)
prediction=tmodel.predict(patient_frame)
# Label 0 is the encoded "P" class and 1 the encoded "N" class.
# NOTE(review): the messages say "HyperThyroid" but the data file is
# hypothyroid.csv - confirm the intended wording.
if prediction[0]==0:
    print("The person does not have HyperThyroid disease.")
else:
    print("The person has HyperThyroid disease.")

The person has HyperThyroid disease.




#### Saving the trained model

In [269]:
# Persist the trained model. The `with` blocks close each file handle
# deterministically, so the dump is fully flushed to disk before it is read
# back and no handle is left open.
MODEL_FILE='Thyroid_Disease_Prediction.sav'
with open(MODEL_FILE,'wb') as model_file:
    pickle.dump(tmodel,model_file)
# loading the model (only unpickle files you created yourself: pickle.load can
# execute arbitrary code from an untrusted file)
with open(MODEL_FILE,'rb') as model_file:
    loaded_model=pickle.load(model_file)
# list the feature columns, in order, that the saved model expects as input
for column in X_train.columns:
    print(column)

age
sex
on thyroxine
TSH
T3 measured
T3
TT4
