In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn import tree
from sklearn.model_selection import GridSearchCV


In [6]:
!pip install pandas_profiling



In [7]:
!pip install imblearn



In [8]:
!pip install graphviz



In [9]:
pwd

'/content'

In [10]:
df = pd.read_csv("hypothyroid.csv")

In [11]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,41,23,46,70,70
sex,F,F,M,F,F
on_thyroxine,f,f,f,t,f
query_on_thyroxine,f,f,f,f,f
on_antithyroid_medication,f,f,f,f,f
sick,f,f,f,f,f
pregnant,f,f,f,f,f
thyroid_surgery,f,f,f,f,f
I131_treatment,f,f,f,f,f
query_hypothyroid,f,f,f,f,f


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on_thyroxine               3772 non-null   object
 3   query_on_thyroxine         3772 non-null   object
 4   on_antithyroid_medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid_surgery            3772 non-null   object
 8   I131_treatment             3772 non-null   object
 9   query_hypothyroid          3772 non-null   object
 10  query_hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [13]:
#No missing Values
#Highly imbalance dataset
#datset consist ? need to replace it with NaN then use KNN imputer
#Features are categorical variable so need to perform one hot encoding
#Target variable is one having multiple categories os use LabelEncoding

In [14]:
df.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured', 'TSH',
       'T3_measured', 'T3', 'TT4_measured', 'TT4', 'T4U_measured', 'T4U',
       'FTI_measured', 'FTI', 'TBG_measured', 'TBG', 'referral_source',
       'Class'],
      dtype='object')

In [15]:
col = ['sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',
       'T3_measured', 'TT4_measured', 'T4U_measured',
       'FTI_measured', 'TBG_measured', 'referral_source',]


# Replacing ? with NaN

In [16]:
for i in df.columns:
    count = df[i][df[i] == "?"].count()
    if count!=0:
        df[i] = df[i].replace("?", np.NaN)

In [17]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.1,t,2,t,102,f,,f,,f,,other,negative
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.98,f,,t,109,t,0.91,t,120,f,,other,negative
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.16,t,1.9,t,175,f,,f,,f,,other,negative
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.72,t,1.2,t,61,t,0.87,t,70,f,,SVI,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,f,f,f,t,f,f,f,,f,,f,,f,,f,,f,,other,negative
3768,68,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1,t,2.1,t,124,t,1.08,t,114,f,,SVI,negative
3769,74,F,f,f,f,f,f,f,f,f,t,f,f,f,f,f,t,5.1,t,1.8,t,112,t,1.07,t,105,f,,other,negative
3770,72,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,2,t,82,t,0.94,t,87,f,,SVI,negative


In [18]:
df["TBG"].nunique()

0

In TBG column all values are missing so dropping this column.

In [19]:
df.drop("TBG", axis = 1, inplace = True)

In [20]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.1,t,2,t,102,f,,f,,f,other,negative
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.98,f,,t,109,t,0.91,t,120,f,other,negative
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.16,t,1.9,t,175,f,,f,,f,other,negative
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.72,t,1.2,t,61,t,0.87,t,70,f,SVI,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,f,f,f,t,f,f,f,,f,,f,,f,,f,,f,other,negative
3768,68,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1,t,2.1,t,124,t,1.08,t,114,f,SVI,negative
3769,74,F,f,f,f,f,f,f,f,f,t,f,f,f,f,f,t,5.1,t,1.8,t,112,t,1.07,t,105,f,other,negative
3770,72,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,2,t,82,t,0.94,t,87,f,SVI,negative


In [21]:
df[["T3_measured", "T3", "TT4_measured", "TT4"]].head(10)

Unnamed: 0,T3_measured,T3,TT4_measured,TT4
0,t,2.5,t,125
1,t,2.0,t,102
2,f,,t,109
3,t,1.9,t,175
4,t,1.2,t,61
5,f,,t,183
6,f,,t,72
7,t,0.6,t,80
8,t,2.2,t,123
9,t,1.6,t,83


So In T3 measured column if there is yes then T3 column have entry otherwise there is NaN and same in TT4_measured. So T3_measured, TT4_measured just indicate whether there is entry in next column or not. That is why i will drop them.

In [22]:
df = df.drop(['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured','TBG_measured'], axis = 1)

In [23]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.3,2.5,125,1.14,109,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,4.1,2,102,,,other,negative
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.98,,109,0.91,120,other,negative
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,0.16,1.9,175,,,other,negative
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.72,1.2,61,0.87,70,SVI,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,f,f,f,t,f,f,,,,,,other,negative
3768,68,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1,2.1,124,1.08,114,SVI,negative
3769,74,F,f,f,f,f,f,f,f,f,t,f,f,f,f,f,5.1,1.8,112,1.07,105,other,negative
3770,72,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.7,2,82,0.94,87,SVI,negative


In [24]:
df.isnull().sum()

age                            1
sex                          150
on_thyroxine                   0
query_on_thyroxine             0
on_antithyroid_medication      0
sick                           0
pregnant                       0
thyroid_surgery                0
I131_treatment                 0
query_hypothyroid              0
query_hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH                          369
T3                           769
TT4                          231
T4U                          387
FTI                          385
referral_source                0
Class                          0
dtype: int64

# One hot Encoding by get_dummies

In [25]:
cols = ['on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych']
for col in cols:
    df[col] = pd.get_dummies(df[col])

In [26]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,Class
0,41,F,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1.3,2.5,125,1.14,109,SVHC,negative
1,23,F,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4.1,2,102,,,other,negative
2,46,M,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.98,,109,0.91,120,other,negative
3,70,F,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0.16,1.9,175,,,other,negative
4,70,F,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.72,1.2,61,0.87,70,SVI,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,1,1,1,1,1,1,1,1,1,1,1,0,1,1,,,,,,other,negative
3768,68,F,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.1,124,1.08,114,SVI,negative
3769,74,F,1,1,1,1,1,1,1,1,0,1,1,1,1,1,5.1,1.8,112,1.07,105,other,negative
3770,72,M,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.7,2,82,0.94,87,SVI,negative


# Handling sex columns

In [27]:
df["sex"] = df["sex"].map({"F" :0, "M":1})
df


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,Class
0,41,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1.3,2.5,125,1.14,109,SVHC,negative
1,23,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4.1,2,102,,,other,negative
2,46,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.98,,109,0.91,120,other,negative
3,70,0.0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0.16,1.9,175,,,other,negative
4,70,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.72,1.2,61,0.87,70,SVI,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,0.0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,,,,,,other,negative
3768,68,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.1,124,1.08,114,SVI,negative
3769,74,0.0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,5.1,1.8,112,1.07,105,other,negative
3770,72,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.7,2,82,0.94,87,SVI,negative


# Dropping the referral source column

In [28]:
df = df.drop("referral_source", axis = 1)

In [29]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,Class
0,41,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1.3,2.5,125,1.14,109,negative
1,23,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4.1,2,102,,,negative
2,46,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.98,,109,0.91,120,negative
3,70,0.0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0.16,1.9,175,,,negative
4,70,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.72,1.2,61,0.87,70,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,0.0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,,,,,,negative
3768,68,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.1,124,1.08,114,negative
3769,74,0.0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,5.1,1.8,112,1.07,105,negative
3770,72,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.7,2,82,0.94,87,negative


In [30]:
df["Class"].value_counts()

negative                   3481
compensated_hypothyroid     194
primary_hypothyroid          95
secondary_hypothyroid         2
Name: Class, dtype: int64

# Converting the class by using Label Encoder

In [31]:
encode = LabelEncoder()

In [32]:
df["Class"] = encode.fit_transform(df["Class"])

In [33]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,Class
0,41,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1.3,2.5,125,1.14,109,1
1,23,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4.1,2,102,,,1
2,46,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.98,,109,0.91,120,1
3,70,0.0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0.16,1.9,175,,,1
4,70,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.72,1.2,61,0.87,70,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,0.0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,,,,,,1
3768,68,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.1,124,1.08,114,1
3769,74,0.0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,5.1,1.8,112,1.07,105,1
3770,72,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.7,2,82,0.94,87,1


In [34]:
df["Class"].value_counts()

1    3481
0     194
2      95
3       2
Name: Class, dtype: int64

# Using KNN imputer to fill missing values

In [35]:
impute = KNNImputer(n_neighbors = 3, missing_values = np.nan,weights='uniform',metric='nan_euclidean')

In [36]:
new_array = impute.fit_transform(df)

In [37]:
new_df = pd.DataFrame(data = np.round(new_array), columns=df.columns )

In [38]:
new_df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,Class
0,41.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,125.0,1.0,109.0,1.0
1,23.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,2.0,102.0,1.0,108.0,1.0
2,46.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,109.0,1.0,120.0,1.0
3,70.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,175.0,1.0,177.0,1.0
4,70.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,61.0,1.0,70.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,3.0,112.0,1.0,117.0,1.0
3768,68.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,124.0,1.0,114.0,1.0
3769,74.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,5.0,2.0,112.0,1.0,105.0,1.0
3770,72.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,82.0,1.0,87.0,1.0


# Plotting the distribution curve

In [40]:
columns = ['age','TSH','T3','TT4','T4U','FTI']
plotnumber = 1
plt.figure(figsize=(10,15),facecolor='white')
for col in columns:
    ax = plt.subplot(3,2,plotnumber)
    sns.distplot(new_df[col])
    plt.xlabel(col)
    plotnumber +=1



Do some trasformation in the data. 

In [41]:
columns = ['age','TSH','T3','TT4','T4U','FTI']
plotnumber = 1
plt.figure(figsize=(10,15),facecolor='white')
for col in columns:
    ax = plt.subplot(3,2,plotnumber)
    sns.distplot(np.log(new_df[col]+1))
    plt.xlabel(col)
    plotnumber +=1



After doing the transformation TSH has weird trend so Drop this column.

In [42]:
new_df = new_df.drop("TSH", axis = 1)

In [43]:
new_df.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'T3', 'TT4', 'T4U', 'FTI',
       'Class'],
      dtype='object')

# Handling Imbalance data

In [44]:
sns.countplot(new_df["Class"])



<matplotlib.axes._subplots.AxesSubplot at 0x7f30402a3e90>

Data is highly imbalance so we will ue SMOTE(Synthetic Minority Oversampling technique) tp create synthetic dataset.

In [45]:
from imblearn.over_sampling import SMOTE



In [46]:
x = new_df.drop("Class", axis = 1)
y = new_df["Class"]
oversample = SMOTE(random_state = 42, k_neighbors = 1)
x_sampled, y_sampled = oversample.fit_resample(x, y)



In [47]:
x_sampled.shape

(13924, 20)

In [48]:
y_sampled.shape

(13924,)

In [49]:
sns.countplot(y_sampled)



<matplotlib.axes._subplots.AxesSubplot at 0x7f30402a3e90>

# Model Building

In [50]:
x_sampled = pd.DataFrame(x_sampled, columns = ['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'T3', 'TT4', 'T4U', 'FTI'])

In [51]:
y_sampled = pd.DataFrame(y_sampled, columns = ["CLass"])
y_sampled

Unnamed: 0,CLass
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
13919,3.0
13920,3.0
13921,3.0
13922,3.0


In [52]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_sampled, y_sampled, test_size = 0.2, random_state = 42)

In [53]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(11139, 20)
(11139, 1)
(2785, 20)
(2785, 1)


In [54]:
model1 = LogisticRegression(multi_class = 'multinomial', penalty = 'l2', solver = 'lbfgs', verbose = 1, max_iter = 1000)

In [55]:
model1.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.0s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)

In [56]:
score = model1.score(x_test, y_test)
print("Test Accuracy Score", score)

Test Accuracy Score 0.8064631956912028


In [57]:
y_pred = model1.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8064631956912028

In [58]:
y_pred

array([3., 2., 3., ..., 3., 2., 0.])

In [59]:
confusion_matrix(y_test, y_pred)

array([[526, 169,  11,  13],
       [235, 440,   7,   7],
       [  5,  25, 643,  14],
       [  0,   0,  53, 637]])

In [60]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.69      0.73      0.71       719
         1.0       0.69      0.64      0.67       689
         2.0       0.90      0.94      0.92       687
         3.0       0.95      0.92      0.94       690

    accuracy                           0.81      2785
   macro avg       0.81      0.81      0.81      2785
weighted avg       0.81      0.81      0.81      2785



In [61]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model1, x_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores', scores)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.4s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done

Cross-Validation Accuracy Scores [0.83034111 0.80789946 0.82675045 0.83123878 0.80969479 0.82854578
 0.82046679 0.83752244 0.82226212 0.82659479]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.3s finished


In [62]:
scores

array([0.83034111, 0.80789946, 0.82675045, 0.83123878, 0.80969479,
       0.82854578, 0.82046679, 0.83752244, 0.82226212, 0.82659479])

In [63]:
scores.mean()

0.824131651237779

In [64]:
from sklearn.tree import DecisionTreeClassifier

model2 = DecisionTreeClassifier()

In [65]:
path = model2.cost_complexity_pruning_path(x_train, y_train)

In [66]:
path

{'ccp_alphas': array([0.00000000e+00, 4.47817155e-05, 5.44088882e-05, 5.78547845e-05,
        5.88178844e-05, 5.88353741e-05, 5.89823890e-05, 5.91199017e-05,
        5.95701052e-05, 5.98188789e-05, 5.98497771e-05, 5.98497771e-05,
        6.58241337e-05, 6.66877094e-05, 6.99782009e-05, 7.10716103e-05,
        7.18197325e-05, 7.18197325e-05, 7.48122213e-05, 7.59631786e-05,
        7.69497134e-05, 7.78047102e-05, 7.85528324e-05, 7.97997027e-05,
        8.07971990e-05, 8.16133324e-05, 8.35833093e-05, 8.37896879e-05,
        8.37896879e-05, 8.44938029e-05, 8.44938029e-05, 8.47871842e-05,
        8.47871842e-05, 8.52859323e-05, 8.54996815e-05, 8.60340545e-05,
        8.65684275e-05, 8.67109270e-05, 8.67419078e-05, 8.69692073e-05,
        8.70123682e-05, 8.70542212e-05, 8.75302989e-05, 8.80482297e-05,
        8.82530611e-05, 8.82784212e-05, 8.85277952e-05, 8.87544989e-05,
        8.89030669e-05, 8.90108485e-05, 8.91639536e-05, 8.93280255e-05,
        8.97746656e-05, 8.97746656e-05, 9.57596433

In [67]:
ccp_alpha = path["ccp_alphas"]

In [68]:
dt_model2 = []

for ccp in ccp_alpha:
    dt_m = DecisionTreeClassifier(ccp_alpha = ccp)
    dt_m.fit(x_train, y_train)
    dt_model2.append(dt_m)

In [69]:
print(len(dt_model2))
dt_model2

199


[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=4.478171554105035e-05, class_weight=None,
                        criterion='gini', max_depth=None, max_features=None,
                        max_leaf_nodes=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        presort='deprecated', random_state=None,
                        splitter='best'),
 DecisionTreeClassifier(ccp_alpha=5.4408888235982235e-05, class_weight=None,
             

In [70]:
train_score = [model.score(x_train, y_train) for model in dt_model2]

In [71]:
test_score =  [model.score(x_test, y_test) for model in dt_model2]

In [72]:
fig, ax = plt.subplots()
ax.plot(ccp_alpha, train_score,'r-+', label = "Train_score", drawstyle='steps-post')
ax.plot(ccp_alpha, test_score, 'b-+', label = "Test_score", drawstyle = 'steps-post')
plt.xlabel("Alpha")
plt.ylabel("Accuracy")
ax.legend()

<matplotlib.legend.Legend at 0x7f303ff2c2d0>

In [73]:
dt_model_ccp = DecisionTreeClassifier(ccp_alpha = .01, random_state = 0)
dt_model_ccp.fit(x_train, y_train)
dt_model_ccp.score(x_test, y_test)

0.8506283662477558

In [74]:
plt.figure(figsize = (20, 20))
tree.plot_tree(dt_model_ccp, filled = True)

[Text(472.15384615384613, 978.48, 'X[19] <= 63.673\ngini = 0.75\nsamples = 11139\nvalue = [2762, 2792, 2794, 2791]'),
 Text(171.69230769230768, 761.0400000000001, 'X[19] <= 53.972\ngini = 0.507\nsamples = 5542\nvalue = [8, 31, 2712, 2791]'),
 Text(85.84615384615384, 543.6, 'gini = 0.013\nsamples = 2163\nvalue = [0, 14, 2149, 0]'),
 Text(257.53846153846155, 543.6, 'X[17] <= 49.0\ngini = 0.29\nsamples = 3379\nvalue = [8, 17, 563, 2791]'),
 Text(171.69230769230768, 326.1600000000001, 'gini = 0.016\nsamples = 2813\nvalue = [0, 1, 21, 2791]'),
 Text(343.38461538461536, 326.1600000000001, 'gini = 0.082\nsamples = 566\nvalue = [8, 16, 542, 0]'),
 Text(772.6153846153845, 761.0400000000001, 'X[19] <= 97.997\ngini = 0.514\nsamples = 5597\nvalue = [2754, 2761, 82, 0]'),
 Text(600.9230769230769, 543.6, 'X[1] <= 1.0\ngini = 0.424\nsamples = 2560\nvalue = [1783, 770, 7, 0]'),
 Text(515.0769230769231, 326.1600000000001, 'gini = 0.367\nsamples = 2229\nvalue = [1692, 530, 7, 0]'),
 Text(686.76923076923

In [75]:
model2.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [76]:
grid_param = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': range(2, 10 , 1),
    'min_samples_split': range(2, 5, 1),
    'min_samples_leaf': range(1, 10, 1), 
    'ccp_alpha': np.random.rand(10)
}

In [77]:
grid_ccp = GridSearchCV(dt_model_ccp, param_grid = grid_param,cv = 5, n_jobs = -1 )

In [78]:
grid_ccp.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.01, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=0, splitter='best'),
             iid='depr...
             param_grid={'ccp_alpha': array([0.7929826 , 0.62208338, 0.06725439, 0.06649525, 0.82780235,
       0.57929064, 0.62711947, 0.490442

In [79]:
grid_ccp.best_params_

{'ccp_alpha': 0.0672543872443554,
 'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [99]:
model3 = DecisionTreeClassifier(ccp_alpha = 0.06, criterion = 'gini', max_depth = 3, min_samples_leaf=1, min_samples_split=2, splitter='best')

In [100]:
model3.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.06, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [101]:
model3.predict(x_test)

array([3., 2., 3., ..., 3., 2., 1.])

In [102]:
model3.score(x_test, y_test)

0.7249551166965889

In [104]:
from sklearn.ensemble import RandomForestClassifier

In [105]:
model4 = RandomForestClassifier()

In [107]:
model4.fit(x_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [108]:
model4.predict(x_test)

array([3., 2., 3., ..., 3., 2., 0.])

In [109]:
model4.score(x_test, y_test)

0.984560143626571

In [110]:
new_df.loc[0]

age                           41.0
sex                            0.0
on_thyroxine                   1.0
query_on_thyroxine             1.0
on_antithyroid_medication      1.0
sick                           1.0
pregnant                       1.0
thyroid_surgery                1.0
I131_treatment                 1.0
query_hypothyroid              1.0
query_hyperthyroid             1.0
lithium                        1.0
goitre                         1.0
tumor                          1.0
hypopituitary                  1.0
psych                          1.0
T3                             2.0
TT4                          125.0
T4U                            1.0
FTI                          109.0
Class                          1.0
Name: 0, dtype: float64

In [111]:
model4.predict([[41.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,1.0, 1.0, 1.0,1.0, 1.0, 1.0,1.0, 2.0,125.0, 1.0, 109.0 ]])

array([1.])

In [112]:
import pickle

pickle.dump(model4, open("Thyroid_model.pkl", "wb"))