# Data Preprocessing

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn import set_config; set_config(display='diagram')

## Get an insight into the data

In [3]:
# Human-readable description of each feature column (name, kind, unit or levels).
# The list is later attached positionally to a per-column summary of X, so its
# order has to match the column order of X.
legend_list = [
    'Age(numerical) : age in years',
    'Blood Pressure(numerical): bp in mm/Hg',
    'Specific Gravity(nominal): sg - (1.005,1.010,1.015,1.020,1.025)',
    'Albumin(nominal): al - (0,1,2,3,4,5)',
    'Sugar(nominal): su - (0,1,2,3,4,5)',
    'Red Blood Cells(nominal): rbc - (normal,abnormal)',
    'Pus Cell (nominal): pc - (normal,abnormal)',
    'Pus Cell clumps(nominal): pcc - (present,notpresent)',
    'Bacteria(nominal): ba - (present,notpresent)',
    'Blood Glucose Random(numerical): bgr in mgs/dl',
    'Blood Urea(numerical): bu in mgs/dl',
    'Serum Creatinine(numerical): sc in mgs/dl',
    'Sodium(numerical): sod in mEq/L',
    'Potassium(numerical) pot in mEq/L',
    'Hemoglobin(numerical) hemo in gms',
    'Packed Cell Volume(numerical)',
    'White Blood Cell Count(numerical): wc in cells/cumm',
    'Red Blood Cell Count(numerical): rc in millions/cmm',
    'Hypertension(nominal) htn : (yes,no)',
    'Diabetes Mellitus(nominal): dm - (yes,no)',
    'Coronary Artery Disease(nominal): cad - (yes,no)',
    'Appetite(nominal): appet - (good,poor)',
    'Pedal Edema(nominal): pe - (yes,no)',
    'Anemia(nominal): ane - (yes,no)',
]

In [4]:
# Load the raw records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='raw_data/kidney_disease.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'raw_data/kidney_disease.csv'

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Summary statistics; with default arguments only the columns pandas parsed as numeric are included.
df.describe()

In [None]:
# Column labels as read from the CSV.
df.columns

In [None]:
## Check for duplicated records.
# The `id` column is left out of the comparison: presumably it is a per-row
# identifier (verify), in which case comparing full rows including it could
# never flag a duplicate and the check would be meaningless.
df.drop(columns='id').duplicated().value_counts()

In [None]:
## Split the raw frame into features and target:
# X holds every column except the target and the `id` column; y is the target.
X = df.drop(columns=['classification', 'id'])
y = df['classification']

In [None]:
## Inspect the raw target labels and how often each occurs (the encoding itself happens in the next cell):
y.value_counts()

In [None]:
# Encode the target as 1 (ckd) / 0 (notckd).
# The raw column holds a third label, 'ckd\t', which is 'ckd' followed by a
# stray tab character; stripping surrounding whitespace folds it (and any
# similar whitespace variant) into 'ckd' before mapping.
# y is rebuilt from df so that re-running this cell gives the same result.
y = df['classification'].str.strip().map({'ckd': 1, 'notckd': 0}).astype(int)
y.info()

In [None]:
# List the distinct raw values of `wc` to spot entries that are not clean numbers.
X.wc.unique()

In [None]:
# Dtype and non-null count of every feature column.
X.info()

In [None]:
## Overview of missing values per feature, to help interpret what the NaNs mean:
# column 0 is the NaN count, 'Legend' the readable description of the feature
# (matched by position) and 'type' the pandas dtype of the column.
X_null_legend = X.isnull().sum().to_frame()
X_null_legend['Legend'] = legend_list
X_null_legend['type'] = X.dtypes
X_null_legend

## Numerical Features

In [None]:
## visualisation of numerical features:

def X_num_visualiation(X):
    """Plot a histogram, a boxplot and a QQ-plot for every non-object column of X.

    One figure with three side-by-side axes is created per column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose non-object columns are visualised. The data is read from
        this argument only, not from any notebook-level frame.

    Returns
    -------
    None
        The figures are created as a side effect.
    """
    X_num = X.select_dtypes(exclude=['object'])
    for num_feature in X_num.columns:
        values = X_num[num_feature]

        fig, ax = plt.subplots(1, 3, figsize=(15, 5))

        ax[0].set_title(f"Distribution of {num_feature}")
        sns.histplot(x=values, kde=True, ax=ax[0])

        ax[1].set_title(f"Boxplot of {num_feature}")
        sns.boxplot(x=values, ax=ax[1])

        ax[2].set_title(f"QQplot of {num_feature}")
        # Missing values are dropped here so they cannot distort the sample
        # quantiles or the fitted reference line of the QQ-plot.
        qqplot(values.dropna(), line='s', ax=ax[2])


In [None]:
# Draw the three diagnostic plots for every non-object column of X.
X_num_visualiation(X)

In [None]:
def replacing_numerical_features(X):
    '''Convert the text-encoded numeric columns `pcv`, `wc` and `rc` to float, in place.

    For each of the three columns, surrounding whitespace (such as a leading
    tab) is stripped from the text values, the placeholder "?" becomes NaN and
    the column is cast to float. A column that is already numeric is only cast
    to float, so calling the function again on the same frame is safe.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns `pcv`, `wc` and `rc`. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number after cleaning.
    '''
    for column in ('pcv', 'wc', 'rc'):
        values = X[column]
        # The .str accessor only exists on text columns; skipping the text
        # cleaning for numeric columns is what makes a second call harmless.
        if not pd.api.types.is_numeric_dtype(values):
            values = values.str.strip().replace(to_replace='?', value=np.nan)
        X[column] = values.astype(float)
    return X

# Clean pcv / wc / rc. The function writes into X itself and returns it, so the
# returned frame is only displayed here, not reassigned.
replacing_numerical_features(X)

## Categorical Features

In [None]:
def replacing_binary_features(X):
    '''Encode the two-level categorical columns of X as numbers, in place.

    Text values are stripped of surrounding whitespace first, so labels such
    as a tab-prefixed "no" or a space-prefixed "yes" are treated as the plain
    label. The mappings are:

    - htn, dm, cad, pe, ane: yes -> 1, no -> 0
    - rbc, pc: abnormal -> 1, normal -> 0
    - pcc, ba: present -> 1, notpresent -> 0
    - appet: good -> 2, poor -> 1, no -> 0

    Missing values stay NaN. Every encoded column is cast to float so the
    resulting dtype does not depend on pandas' implicit downcasting. Values
    that are already numeric pass through unchanged, so a second call is safe.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the ten columns listed above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.

    Raises
    ------
    ValueError
        If a column still holds a label outside its mapping after stripping.
    '''
    encodings = (
        (('htn', 'dm', 'cad', 'pe', 'ane'), {'yes': 1, 'no': 0}),
        (('rbc', 'pc'), {'abnormal': 1, 'normal': 0}),
        (('pcc', 'ba'), {'present': 1, 'notpresent': 0}),
        (('appet',), {'good': 2, 'poor': 1, 'no': 0}),
    )
    for columns, mapping in encodings:
        for column in columns:
            # Only text cells are stripped; NaN and numbers are passed through.
            stripped = X[column].map(
                lambda cell: cell.strip() if isinstance(cell, str) else cell
            )
            # Unknown values are kept as they are so the float cast below
            # reports them instead of silently hiding them.
            encoded = stripped.map(lambda cell: mapping.get(cell, cell))
            X[column] = encoded.astype(float)
    return X

In [None]:
# Encode the binary columns. The function writes into X itself and returns it,
# so the returned frame is only displayed here, not reassigned.
replacing_binary_features(X)

## Going for the pipeline

In [None]:
# Show the missing-value overview again. When the cells are run top to bottom it
# was built before the cleaning cells, so its 'type' column reflects the raw dtypes.
X_null_legend

In [None]:
# Number of distinct non-null values per column; the feature groups below are derived from it.
X.nunique()

In [7]:
# Column groups fed to the ColumnTransformer:
#   feat_binary     - columns with exactly two distinct values
#   feat_ordered    - the graded lab columns, listed by hand
#   feat_continuous - columns with more than six distinct values

feat_binary = X.columns[X.nunique().eq(2)]
feat_ordered = ['sg', 'al', 'su']
feat_continuous = X.columns[X.nunique().gt(6)]

NameError: name 'X' is not defined

In [None]:
# Imputing: the most frequent value is used for the ordinal (and binary)
# columns, because some features have a lot of NaN values.

# Ordered features: fill gaps with the mode, then rescale to [0, 1].
ordered_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('mm_scaler', MinMaxScaler()),
])

# Binary features: fill gaps with the mode; no scaling step.
binary_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
])

# Continuous features: fill gaps with the column mean (SimpleImputer's default
# strategy, spelled out here), then rescale to [0, 1].
cont_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('mm_scaler', MinMaxScaler()),
])

# Route each column group through its own transformer.
preproc_pipe = ColumnTransformer(transformers=[
    ('ord_trans', ordered_transformer, feat_ordered),
    ('bin_trans', binary_transformer, feat_binary),
    ('cont_trans', cont_transformer, feat_continuous),
])

In [6]:
# Display the assembled transformer (shown as a diagram because of set_config(display='diagram')).
preproc_pipe

NameError: name 'preproc_pipe' is not defined

In [5]:
# Fit the preprocessing on X and transform X in the same step.
X_preproc = preproc_pipe.fit_transform(X)

NameError: name 'preproc_pipe' is not defined

In [4]:
# Wrap the transformed matrix in a DataFrame (columns get default integer
# labels) and look at the first output column.
X_preproc_df = pd.DataFrame(data=X_preproc)
X_preproc_df.loc[:, 0]

NameError: name 'X_preproc' is not defined

In [None]:
# Missing values per preprocessed column; every transformer starts with an imputer, so zeros are expected.
X_preproc_df.isnull().sum()

In [None]:
# True when the preprocessed matrix has full column rank, i.e. its rank equals its number of columns.
np.linalg.matrix_rank(X_preproc) == X_preproc.shape[1]