In [None]:
import pandas as pd#import the pandas module becuase our dataset is a csv file
import numpy as np #import the numpy module becuase our dataset is a csv file
import matplotlib.pyplot as plt#import the matplot library to plot the heatmap to see the correlation
import seaborn as sns#import the seaborn library to plot the heatmap to see the correlation

In [None]:
# Column names applied positionally when the header-less CSV is read.
# One name per line so a diff against the data dictionary is easy to read.
fcols = [
    "age",
    "sex",
    "on_thyroxine",
    "query_on_thyroxine",
    "on_antithyroid_medication",
    "sick",
    "pregnant",
    "thyroid_surgery",
    "I131_treatment",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
    "TSH measured",
    "TSH",
    "T3_measured",
    "T3",
    "TT4_measured",
    "TT4",
    "T4U_measured",
    "T4U",
    "FTI_measured",
    "FTI",
    "TBG_measured",
    "TBG",
    "target",
    "source_id",  # name given to the extra trailing column
]

In [None]:
# Load the raw data:
#   - header=None : no row of the file is parsed as a header
#   - names=fcols : column names are taken from the `fcols` list instead
#   - na_values   : the literal '?' marker is read as NaN
dataframe = pd.read_csv(
    "thyroid.csv",
    header=None,
    names=fcols,
    na_values=['?'],
)

In [None]:
dataframe # display the DataFrame that was just loaded

In [None]:
# Remove the extra trailing "source_id" column.
# The result is re-bound instead of mutated in place, and errors="ignore" keeps
# the cell safe to re-run: a second run would otherwise raise KeyError because
# the column is already gone.
dataframe = dataframe.drop(columns="source_id", errors="ignore")

In [None]:
# Column names without "source_id" (29 entries), i.e. the columns that remain
# after that column has been dropped from the frame.
fcols = [
    "age", "sex", "on_thyroxine", "query_on_thyroxine", "on_antithyroid_medication",
    "sick", "pregnant", "thyroid_surgery", "I131_treatment", "query_hypothyroid",
    "query_hyperthyroid", "lithium", "goitre", "tumor", "hypopituitary", "psych",
    "TSH measured", "TSH", "T3_measured", "T3", "TT4_measured", "TT4",
    "T4U_measured", "T4U", "FTI_measured", "FTI", "TBG_measured", "TBG",
    "target",
]

In [None]:
dataframe.head()  # show only the first 5 rows to check the frame after dropping "source_id"

In [None]:
# Reduce every raw target label to its first run of letters (for example
# "C|I" becomes "C"); labels that contain no letter at all become 'Z'.
# 'Z' is not a type of thyroid disease - it marks the "no diagnosis" class.
#
# The frame in this notebook is named `dataframe`; the previously used `df`
# was never defined and raised NameError. str.extract yields NaN where the
# pattern does not match, so this also works when no label contains a letter
# (indexing column 1 of a str.split result raised KeyError in that case).
create = dataframe['target'].str.extract('([A-Za-z]+)', expand=False)  # first alphabetic run, NaN if none
target = create.fillna('Z')  # labels without letters -> 'Z'
dataframe['target'] = target  # write the cleaned labels back into the target column

In [None]:
# Keep only the first alphabetic run of each target label. re-style split with
# a capture group puts that run in column 1; rows with no letters have None
# there, which is mapped to the placeholder class 'Z'.
target_clean = (
    dataframe['target']
    .astype(str)
    .str.split('([A-Za-z]+)', expand=True)[1]
)
dataframe['target'] = target_clean.replace({None: 'Z'})

In [None]:
# Flag columns expected to hold the strings 't' / 'f'.
binary_cols = [
    "on_thyroxine",
    "query_on_thyroxine",
    "on_antithyroid_medication",
    "sick",
    "pregnant",
    "thyroid_surgery",
    "I131_treatment",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
]

# Recode 't' -> 1 and 'f' -> 0 one column at a time; any other value is left unchanged.
tf_to_int = {'t': 1, 'f': 0}
for flag_col in binary_cols:
    dataframe[flag_col] = dataframe[flag_col].replace(tf_to_int)

In [None]:
# List the distinct class labels currently present in the target column.
dataframe.target.unique()

In [None]:


# Define the list of binary columns that contain 't' and 'f'
binary_cols = [
    "on_thyroxine", "query_on_thyroxine", "on_antithyroid_medication",
    "sick", "pregnant", "thyroid_surgery", "I131_treatment",
    "query_hypothyroid", "query_hyperthyroid", "lithium", "goitre",
    "tumor", "hypopituitary", "psych"
]

# Apply the numerical encoding ('t' -> 1, 'f' -> 0).
# The frame in this notebook is named `dataframe`; the previously used `df`
# was never defined and raised NameError. replace() leaves values that are
# neither 't' nor 'f' untouched, so running this on already-encoded columns
# changes nothing.
for col in binary_cols:
    dataframe[col] = dataframe[col].replace({'t': 1, 'f': 0})
    


In [None]:
# Encode sex numerically (F -> 2, M -> 1) and fill missing entries with the
# mode (most frequent value), which suits a categorical feature.
#
# Each result is assigned back to the column instead of calling
# replace/fillna with inplace=True on `dataframe.sex`: that form is chained
# assignment, which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled (leaving 'F'/'M' strings that break astype(int)).
sex_encoded = dataframe['sex'].replace({'F': 2, 'M': 1})
mode_val = sex_encoded.mode()[0]  # most frequent encoded value
dataframe['sex'] = sex_encoded.fillna(mode_val).astype(int)  # no NaN left, so the int cast is safe

# Drop the "... measured" flag columns (TT4 itself is kept) together with the
# 'TBG' value column, which the original author notes has too many missing
# values (>95%).
measured_cols = [
    'TSH measured', 'T3_measured',
    'TT4_measured', 'T4U_measured', 'FTI_measured', 'TBG_measured'
]
# A single re-binding drop with errors="ignore" keeps the cell re-runnable:
# the in-place drops raised KeyError on a second run because the columns
# were already gone.
dataframe = dataframe.drop(columns=measured_cols + ['TBG'], errors="ignore")



In [None]:
# Sex encoding and imputation.
# '?' markers were already converted to NaN when the CSV was read.

# Map the category codes to integers: F -> 2, M -> 1.
sex_codes = dataframe['sex'].replace({'F': 2, 'M': 1})

# Impute with the mode (most frequent value) rather than the mean, because
# sex is a categorical feature.
mode_val = sex_codes.mode()[0]

# Fill the gaps, then cast to int once no NaN remains.
dataframe['sex'] = sex_codes.fillna(mode_val).astype(int)

# The 'measured' flag columns and 'TBG' are dropped in the preceding cell, so
# the drop is not repeated here (repeating it raised KeyError).

In [None]:
dataframe.isna().sum()  # number of missing values in each column

In [None]:
# Display the DataFrame as it stands after the encoding and column-drop cells.
dataframe

In [None]:
dataframe.isna().sum()  # per-column missing-value counts, checked again before imputation

In [None]:
from sklearn.impute import KNNImputer # imputer used below to fill the missing lab values
knnimp = KNNImputer(n_neighbors=3) # create a KNNImputer instance with n_neighbors=3

In [None]:
# Lab-result columns whose missing values are filled by the KNN imputer.
cols = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']

# Impute all lab columns in ONE call so that a row's observed lab values are
# used to find its nearest neighbours. Fitting the imputer on a single column
# at a time gives it no observed feature to measure distance on for the rows
# that need filling, so it degenerates to plain column-mean imputation.
# NOTE(review): the imputer is fitted on the full frame before the train/test
# split, so test rows influence the imputed values - confirm this is acceptable.
dataframe[cols] = knnimp.fit_transform(dataframe[cols])

In [None]:
dataframe.isna().sum()  # re-count missing values per column after the imputation step

In [None]:
# Split the cleaned frame into features and label.
# The frame in this notebook is named `dataframe`; the previously used `df`
# was never defined and raised NameError.
df2 = dataframe.drop(columns='target')  # feature matrix: every column except the label
y = dataframe['target']  # class label to predict

In [None]:
# Display the feature matrix (all columns except the target).
df2

In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated
# heatmap on a 20 x 20 inch figure.
feature_corr = df2.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(feature_corr, annot=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 33% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(df2, y, test_size=0.33, random_state=42)

# Work on explicit copies: the split returns row subsets of df2, and assigning
# columns into such subsets can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Continuous columns that get standardized.
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

scaler = StandardScaler()

# Learn the mean/std from the training rows only, then apply that same
# transformation to both splits so no test-set statistics leak into training.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [None]:
# Standardize the continuous features of the existing train/test split.
# NOTE(review): the cell that performs the split already standardizes these
# same columns, so this cell looks redundant - confirm before removing.

from sklearn.preprocessing import StandardScaler

# Columns with large, continuous ranges.
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

scaler = StandardScaler()

# Fit on the training rows only and transform them in the same call; the test
# rows are then transformed with the statistics learned from training, which
# avoids leaking test-set information.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# --- NOW CONTINUE TO K-NN TRAINING (CELL 65) ---

In [None]:
# Uses X_train / X_test produced by the train_test_split call in an earlier cell.
# NOTE(review): earlier cells already standardize these same columns, so this
# cell looks redundant - confirm before removing.

from sklearn.preprocessing import StandardScaler

# Age and the lab results, which are on widely different scales.
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

scaler = StandardScaler()

# Step 1: learn mean/std from the training rows only.
scaler.fit(X_train[numerical_cols])

# Step 2: apply the learned scaling to both splits.
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# --- NOW PROCEED TO THE NEXT CELL (Likely Decision Tree or K-NN training) ---

# Model Selection

In [None]:
from sklearn.metrics import accuracy_score#importing the accuracy score from the sklearn metrics

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (max_depth=3). random_state is pinned because
# scikit-learn permutes the features at each split, so an unseeded tree can
# differ from one run to the next.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = tree.fit(X_train, y_train)  # fit on the training split only; fit() returns the estimator itself
y_pred = clf.predict(X_test)  # predicted labels for the held-out test rows
# accuracy_score's signature is (y_true, y_pred): true labels go first.
accuracy_score(y_test, y_pred)

# K-NN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes among the 3 closest training rows.
neigh = KNeighborsClassifier(n_neighbors=3)
knnclf = neigh.fit(X_train, y_train)  # fit on the training split only; fit() returns the estimator itself
y_pred = knnclf.predict(X_test)  # predicted labels for the held-out test rows (overwrites the earlier y_pred)
# accuracy_score's signature is (y_true, y_pred): true labels go first.
accuracy_score(y_test, y_pred)