In [3]:
import pandas as pd

# Read the hypothyroid CSV from the notebook's working directory.
# Point the path elsewhere if the file lives in a different location.
df = pd.read_csv(filepath_or_buffer="hypothyroid.csv")

# Preview the first five rows to sanity-check the load.
df.head(5)


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P


In [4]:
# Print a schema summary of the raw frame: index range, column names,
# non-null counts and dtypes. Every column is read as `object` at this point
# because missing entries are encoded as the string "?" rather than NaN, so
# the non-null counts here do not reflect true missingness.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on thyroxine               3772 non-null   object
 3   query on thyroxine         3772 non-null   object
 4   on antithyroid medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid surgery            3772 non-null   object
 8   I131 treatment             3772 non-null   object
 9   query hypothyroid          3772 non-null   object
 10  query hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [5]:
import numpy as np

# The raw file marks missing entries with the string "?". Convert those to NaN
# *before* encoding so every encoded column holds only numbers (plus NaN) and
# can be given a proper numeric dtype.
df = df.replace("?", np.nan)

yes_no_cols = ["on thyroxine", "query on thyroxine", "on antithyroid medication", "sick", "pregnant",
               "thyroid surgery", "I131 treatment", "query hypothyroid", "query hyperthyroid", "lithium",
               "goitre", "tumor", "hypopituitary", "psych", "TSH measured", "T3 measured", "TT4 measured",
               "T4U measured", "FTI measured", "TBG measured"]

# Encode the categorical columns as 0/1. infer_objects() makes the conversion
# from object to a numeric dtype explicit instead of relying on replace()'s
# deprecated silent downcasting (which newer pandas versions no longer do).
df["sex"] = df["sex"].replace({"M": 0, "F": 1}).infer_objects()
df[yes_no_cols] = df[yes_no_cols].replace({"t": 1, "f": 0}).infer_objects()
df["binaryClass"] = df["binaryClass"].replace({"N": 0, "P": 1}).infer_objects()

# Parse the measurement columns as numbers; anything unparseable becomes NaN.
num_cols = ["age", "TSH", "T3", "TT4", "T4U", "FTI", "TBG"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Fill missing numeric values with each column's mean.
# Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
# that form operates on a temporary selection, is deprecated, and does nothing
# under pandas Copy-on-Write, which would leave the NaNs in place.
# A column with no observed values (TBG appears to be one) has a NaN mean and
# therefore stays NaN.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split performed later in the notebook, so test-set statistics
# leak into the training features. Consider imputing inside a pipeline fitted
# on the training split only.
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# "sex" is intentionally not imputed here; it still contains NaN after this cell.
df = df.drop(columns=["referral source"])


In [6]:
# Preview the first rows again to verify the 0/1 encoding and the mean
# imputation applied by the preprocessing cell.
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,binaryClass
0,41.0,1.0,0,0,0,0,0,0,0,0,...,2.5,1,125.0,1,1.14,1,109.0,0,,1
1,23.0,1.0,0,0,0,0,0,0,0,0,...,2.0,1,102.0,0,0.995,0,110.469649,0,,1
2,46.0,0.0,0,0,0,0,0,0,0,0,...,2.0135,1,109.0,1,0.91,1,120.0,0,,1
3,70.0,1.0,1,0,0,0,0,0,0,0,...,1.9,1,175.0,0,0.995,0,110.469649,0,,1
4,70.0,1.0,0,0,0,0,0,0,0,0,...,1.2,1,61.0,1,0.87,1,70.0,0,,1


In [7]:
# Drop irrelevant or redundant columns
drop_cols = ["query on thyroxine", "on antithyroid medication", "sick", "pregnant", "thyroid surgery", 
             "I131 treatment", "query hypothyroid", "query hyperthyroid", "lithium", "goitre", 
             "tumor", "hypopituitary", "psych", "TSH measured", "T3 measured", "TT4 measured", 
             "T4U measured", "FTI measured", "TBG measured", "TBG"]

df.drop(columns=drop_cols, inplace=True)

# Check remaining columns
print(df.columns)


Index(['age', 'sex', 'on thyroxine', 'TSH', 'T3', 'TT4', 'T4U', 'FTI',
       'binaryClass'],
      dtype='object')


In [8]:
# Count missing values per column. The numeric measurement columns were
# mean-imputed earlier, but "sex" was not, so it can still report missing
# entries at this point.
print(df.isnull().sum())

# DataFrame.info() writes its report directly to stdout and returns None, so
# call it bare; wrapping it in print() would append a stray "None" line.
df.info()


age               0
sex             150
on thyroxine      0
TSH               0
T3                0
TT4               0
T4U               0
FTI               0
binaryClass       0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           3772 non-null   float64
 1   sex           3622 non-null   float64
 2   on thyroxine  3772 non-null   int64  
 3   TSH           3772 non-null   float64
 4   T3            3772 non-null   float64
 5   TT4           3772 non-null   float64
 6   T4U           3772 non-null   float64
 7   FTI           3772 non-null   float64
 8   binaryClass   3772 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 265.3 KB
None


In [9]:
# Fill missing values in "sex" with the mode (most frequent value).
# The result is assigned back to the column: calling
# `df["sex"].fillna(..., inplace=True)` acts on a temporary selection, is
# deprecated, and is a silent no-op under pandas Copy-on-Write.
sex_mode = df["sex"].mode()[0]
df["sex"] = df["sex"].fillna(sex_mode)

# Check that no missing values remain in any column.
print(df.isnull().sum())


age             0
sex             0
on thyroxine    0
TSH             0
T3              0
TT4             0
T4U             0
FTI             0
binaryClass     0
dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# Target: the encoded "binaryClass" column (N -> 0, P -> 1).
# NOTE(review): class 1 makes up roughly 92% of the rows (697 of 755 in the
# test split), which is unusual for a disease label. Confirm which of "N"/"P"
# actually denotes thyroid disease before interpreting predictions.
y = df["binaryClass"]

# Features: every remaining column except the target.
X = df.drop(columns=["binaryClass"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each split.
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)



Training set size: (3017, 8)
Testing set size: (755, 8)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split (fit() returns the fitted estimator itself).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_rf = rf_model.predict(X_test)

# Overall accuracy. The classes are heavily imbalanced, so read this together
# with the per-class precision/recall printed below.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Model Accuracy:", accuracy_rf)

# Per-class precision, recall, F1 and support.
print(classification_report(y_test, y_pred_rf))


Random Forest Model Accuracy: 0.9973509933774835
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        58
           1       1.00      1.00      1.00       697

    accuracy                           1.00       755
   macro avg       0.98      1.00      0.99       755
weighted avg       1.00      1.00      1.00       755



In [12]:
import joblib

# Persist the fitted random forest to the working directory so it can be
# reloaded later with joblib.load().
joblib.dump(value=rf_model, filename="thyroid_model.joblib")

print("Thyroid disease model saved successfully!")


Thyroid disease model saved successfully!
