In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
dir_path_post = '../cleaned_processed_data'
df = pd.read_csv(os.path.join(dir_path_post,'stage01_cleaned_data.csv'))
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative
1,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative
2,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative
3,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative
4,18,F,t,f,f,f,f,f,f,f,...,t,183,t,1.3,t,141,f,?,other,negative


# unique values of the data set

In [3]:
for col in df.columns:
    print(col)
    print("*****************************")
    print(df[col].value_counts())

age
*****************************
?        893
60       446
59       436
62       420
72       406
        ... 
65511      1
65526      1
9.0        1
6.0        1
65512      1
Name: age, Length: 200, dtype: int64
sex
*****************************
F    18209
M     8017
?      620
Name: sex, dtype: int64
on_thyroxine
*****************************
f    17011
0     6259
t     2637
1      939
Name: on_thyroxine, dtype: int64
query_on_thyroxine
*****************************
f    19331
0     7087
t      317
1      111
Name: query_on_thyroxine, dtype: int64
on_antithyroid_medication
*****************************
f    19401
0     7106
t      247
1       92
Name: on_antithyroid_medication, dtype: int64
sick
*****************************
f    18944
0     6922
t      704
1      276
Name: sick, dtype: int64
pregnant
*****************************
f    19354
0     7120
t      294
1       78
Name: pregnant, dtype: int64
thyroid_surgery
*****************************
f    19251
0     7097
t      397
1 

In [4]:
## there are "?" entries which denotes missingvalues or nan values replacing them
df=df.replace({"?":np.NAN})
df.isna().sum()

age                            893
sex                            620
on_thyroxine                     0
query_on_thyroxine               0
on_antithyroid_medication        0
sick                             0
pregnant                         0
thyroid_surgery                  0
I131_treatment                6324
query_hypothyroid                0
query_hyperthyroid               0
lithium                          0
goitre                           0
tumor                            0
hypopituitary                 6324
psych                         6324
TSH_measured                     0
TSH                           2150
T3_measured                      0
T3                            4812
TT4_measured                     0
TT4                           1166
T4U_measured                     0
T4U                           1708
FTI_measured                     0
FTI                           1697
TBG_measured                  7198
TBG                          25985
referral_source     

In [5]:
## Dropping the rows with more than 10 nan values and columns with more than 7000 nans 
def dropna_thresh(dataframe):
    dataframe.dropna(thresh=20, inplace =True)
    column_drop = []
    for col in dataframe.columns:
        if dataframe[col].isna().sum() > 7000:
            column_drop.append(col)
    dataframe = dataframe.drop(columns = column_drop)
    return dataframe
df = dropna_thresh(df)
df.isna().sum()

age                           889
sex                           616
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment               6320
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                6320
psych                        6320
TSH_measured                    0
TSH                          2146
T3_measured                     0
T3                           4808
TT4_measured                    0
TT4                          1162
T4U_measured                    0
T4U                          1704
FTI_measured                    0
FTI                          1693
Class                           0
dtype: int64

In [6]:
### Encoding into numerical values 
def encode(dataframe):
    """ 
    1. sex column has 'F' and 'M' entries
    2. on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant
thyroid_surgery,I131_treatment,query_hypothyroid,lithium,goitre,tumor,hypopituitary
psych has 'f' and 't entries'
3. TSH_measured,T3_measured,TT4_measured,T4U_measured,FTI_measured has 'f','n','t','y' entries
    """
    dataframe['sex'] = dataframe['sex'].replace({'F':1, 'M':0})
    dataframe = dataframe.replace({"t":1,"f":0})
    dataframe = dataframe.replace({"y":1, "n":0})
    dataframe['Class'] = dataframe['Class'].replace({"negative":0,"hypothyroid":1,"hyperthyroid":2,"sick-euthyroid":3})
    for col in dataframe.columns:
        try:
            dataframe[col] = dataframe[col].apply(pd.to_numeric , errors = 'coerce')
        except:
            pass
    return dataframe
    

In [7]:
df = encode(df)
df.dtypes


age                          float64
sex                          float64
on_thyroxine                   int64
query_on_thyroxine             int64
on_antithyroid_medication      int64
sick                           int64
pregnant                       int64
thyroid_surgery                int64
I131_treatment               float64
query_hypothyroid              int64
query_hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                float64
psych                        float64
TSH_measured                   int64
TSH                          float64
T3_measured                    int64
T3                           float64
TT4_measured                   int64
TT4                          float64
T4U_measured                   int64
T4U                          float64
FTI_measured                   int64
FTI                          float64
Class                          int64
d

In [8]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,Class
0,23.0,1.0,0,0,0,0,0,0,0.0,0,...,4.1,1,2.0,1,102.0,0,,0,,0
1,46.0,0.0,0,0,0,0,0,0,0.0,0,...,0.98,0,,1,109.0,1,0.91,1,120.0,0
2,70.0,1.0,1,0,0,0,0,0,0.0,0,...,0.16,1,1.9,1,175.0,0,,0,,0
3,70.0,1.0,0,0,0,0,0,0,0.0,0,...,0.72,1,1.2,1,61.0,1,0.87,1,70.0,0
4,18.0,1.0,1,0,0,0,0,0,0.0,0,...,0.03,0,,1,183.0,1,1.3,1,141.0,0


In [9]:
display(df.shape)
df['Class'].value_counts()

(26842, 27)

0    24699
1     1401
2      450
3      292
Name: Class, dtype: int64

#### it seems like data is highly imbalanced . if its true then we cannot take this data forward without imputing the nan 


In [10]:
data1 = df.interpolate(method = 'spline', order = 3)
display(data1.isna().sum())

age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          1
FTI_measured                 0
FTI                          1
Class                        0
dtype: int64

In [11]:
data1.dropna(inplace =True)

In [12]:
data1.to_csv(os.path.join(dir_path_post,'stage02_preprocessed.csv'), index = False)