In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer ,SimpleImputer
from sklearn.ensemble import  ExtraTreesRegressor
from scipy.stats import zscore
from imblearn.combine import SMOTETomek
from sklearn.compose import ColumnTransformer
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split

In [2]:
names = ["age","sex","on_thyroxine","query_on_thyroxine","on_antihyroid_meds","sick","pregnant","thyroid_surgery","I131_treatment","query_hypothyroid","query hyperthyroid","lithium","goitre","tumor","hypopituitary","psych","TSH_measured","TSH","T3_measured","T3","TT4_measured","TT4","T4U_measured","T4U","FTI_measured","FTI","TBG_measured","TBG","referral_source","target"]
df = pd.read_csv("data/thyroid0387.data",names=names)
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antihyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
0,29,F,f,f,f,f,f,f,f,t,...,f,?,f,?,f,?,f,?,other,-[840801013]
1,29,F,f,f,f,f,f,f,f,f,...,t,128,f,?,f,?,f,?,other,-[840801014]
2,41,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,11,other,-[840801042]
3,36,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,26,other,-[840803046]
4,32,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,36,other,S[840803047]


In [3]:
# tidy the target column
df['patient_id'] = df["target"].apply(lambda x: x.split("[")[1].strip(']'))
df['target'] = df["target"].apply(lambda x: x.split("[")[0])

In [4]:
# replacing ? with np.nan
df.replace({"?":np.nan},inplace=True)

In [5]:
# converting object to float
num_cols = ["TSH","T3","TT4","T4U","FTI","TBG"]
for i in num_cols:
    df[i] = df[i].astype(float)

In [6]:
# age cannot be 65526 
# capping age to 100 years
df = df[df["age"] <= 100]

In [7]:
# Remove reduntant columns
df.drop(['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured','TBG_measured','referral_source','patient_id',"TBG"],axis=1, inplace=True)

In [8]:
# Selecting a subset of target which can be classified as Hyper , hypo or Euthyroid (Negative) state
df = df[df['target'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'AK', 'C|I', 'H|K', 'GK', 'FK', 'GI', 'GKJ', 'D|R', '-'])]
# mapping the target column
mapping = {'-':"Negative",
           'A':'Hyperthyroid','AK':"Hyperthyroid",'B':"Hyperthyroid", 'C':"Hyperthyroid", 'C|I': 'Hyperthyroid', 'D':"Hyperthyroid", 'D|R':"Hyperthyroid",
           'E': "Hypothyroid", 'F': "Hypothyroid", 'FK': "Hypothyroid", "G": "Hypothyroid", "GK": "Hypothyroid", "GI": "Hypothyroid", 'GKJ': 'Hypothyroid', 'H|K': 'Hypothyroid',
          }
df['target'] = df['target'].map(mapping)

In [9]:
# impute some missing values of sex (total = 254 missing) using pregnancy
df["sex"] = np.where((df["sex"].isnull()) & (df["pregnant"]=="t"),'F',df["sex"])

In [10]:
# EDA 
cat_cols = df.select_dtypes(include="object").columns
cat_cols =  cat_cols.drop("target")
num_cols = df.select_dtypes(exclude="object").columns
print(cat_cols)
print(num_cols)

Index(['sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antihyroid_meds',
       'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment',
       'query_hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor',
       'hypopituitary', 'psych'],
      dtype='object')
Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI'], dtype='object')


## **Outliers are not removed as they may be important for the model**

In [11]:
df = df.replace({"t":1,"f":0})


In [12]:
# Mapping sex to 0 for female and 1 for male
df["sex"] = df["sex"].map({"F":0,"M":1})

In [13]:
X = df.drop("target",axis=1)
y = df["target"]

In [16]:
resampler = SMOTETomek(random_state=42)
X_res , y_res = resampler.fit_resample(X,y)

ValueError: Input X contains NaN.
SMOTETomek does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
## Numerical Pipeline
num_pipeline=Pipeline(
steps=[
    ('imputer',IterativeImputer(estimator=ExtraTreesRegressor()),
    ('scaler', StandardScaler()),
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('ordinalencoder',OneHotEncoder(drop='first',handle_unknown='ignore')),
    # ('scaler',StandardScaler(with_mean=False))
    ]

)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)
])