In [8]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling. Order matters: each step below overwrites the matplotlib
# rcParams written by the previous one, so the explicit overrides must come
# last.
#   1. seaborn defaults
#   2. the 'fivethirtyeight' stylesheet
#   3. the explicit per-key overrides
# Previously the stylesheet was applied after the overrides, so any key it
# also defines (e.g. axes.labelsize) silently replaced the value chosen here.
# The figure size formerly passed to sns.set() was dropped because it was
# immediately overwritten by the 25x10 setting below.
sns.set()
plt.style.use('fivethirtyeight')

plt.rcParams.update({
    'figure.figsize':  (25, 10),
    'axes.labelsize':  14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color':      'black',
    'axes.labelcolor': 'black',
    'xtick.color':     'black',
    'ytick.color':     'black',
})


import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# pandas display preferences, applied in one pass.
# NOTE(review): max_colwidth=0 is presumably meant to disable cell
# truncation; recent pandas documents None for "no limit" - confirm.
_PANDAS_DISPLAY_OPTIONS = {
    'display.float_format': '{:,.5f}'.format,  # thousands separator, 5 decimals
    'display.max_rows': 150,
    'display.max_columns': None,               # None = show all columns
    'display.max_colwidth': 0,
}
for _option_name, _option_value in _PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Explore datasets

In [131]:
# Load the kidney disease CSV (path is relative to the notebook's working directory).
kidney_dataset = pd.read_csv('../data/kidney_disease/kidney_disease.csv')

In [132]:
# Preview five randomly drawn rows; no random_state is set, so the rows differ on every run.
kidney_dataset.sample(5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
286,286,71.0,70.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,81.0,18.0,0.8,145.0,5.0,14.7,44,9800,6.0,no,no,no,good,no,no,notckd
360,360,35.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,105.0,39.0,0.5,135.0,3.9,14.7,43,5800,6.2,no,no,no,good,no,no,notckd
276,276,20.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,,,,137.0,4.7,14.0,41,4500,5.5,no,no,no,good,no,no,notckd
190,190,6.0,60.0,1.01,4.0,0.0,abnormal,abnormal,notpresent,present,94.0,67.0,1.0,135.0,4.9,9.9,30,16700,4.8,no,no,no,poor,no,no,ckd


In [133]:
# Summary statistics. describe() reports only columns parsed as numeric, so
# pcv, wc and rc (absent from the output) were read as text.
kidney_dataset.describe()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.48338,76.46907,1.01741,1.01695,0.45014,148.03652,57.42572,3.07245,137.52875,4.62724,12.52644
std,115.6143,17.16971,13.68364,0.00572,1.35268,1.09919,79.28171,50.50301,5.74113,10.40875,3.1939,2.91259
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [136]:
def clean_noisy_data(dataset,classes = 2):
    """Return a cleaned copy of ``dataset`` with an integer-encoded label column.

    Steps, in order:

    1. Drop the ``id`` column if present (a row identifier, not a feature).
    2. Cast every column to float where the cast succeeds.
    3. For each column that could not be cast, remove tab characters and turn
       ``'?'`` entries into NaN, then retry the float cast so that numeric
       columns which only failed because of those artefacts become numeric.
    4. Replace the last column (treated as the output variable) with 1 for
       rows equal to the first label value encountered and 0 for all others.

    The DataFrame passed in is never modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data; the last column is taken as the output variable.
    classes : int, default 2
        Number of distinct label values expected after cleaning.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, without ``id`` and with the last column encoded
        as 0/1 integers.

    Raises
    ------
    AssertionError
        If the number of distinct label values differs from ``classes``.
    """
    # Work on a copy: the column assignments below would otherwise modify the
    # caller's frame whenever there is no 'id' column to drop.
    dataset = dataset.copy()

    if 'id' in dataset.columns:
        dataset = dataset.drop(columns=['id'])  # Drop id column as it's not relevant for predictions

    # First pass: cast whatever converts cleanly to float. The output column
    # goes through this too; it is overwritten by the encoding further down.
    string_columns = []
    for c in dataset.columns:
        try:
            dataset[c] = dataset[c].astype(float)
        except ValueError:
            string_columns.append(c)

    # Second pass: strip stray tabs, mark '?' as missing, then retry the cast.
    for c in string_columns:
        cleaned = dataset[c].str.replace('\t', '', regex=False).replace('?', np.nan)
        try:
            cleaned = cleaned.astype(float)
        except ValueError:
            pass  # genuinely textual column: keep the cleaned strings
        dataset[c] = cleaned

    # Encode the output variable: the first label value seen maps to 1 and
    # every other value to 0, so the mapping depends on row order.
    output_column = dataset.columns[-1]
    outputs = dataset[output_column].unique()
    assert len(outputs) == classes, 'Error the number of output classes should be the same as the one in the dataset'
    dataset[output_column] = 1 * (dataset[output_column] == outputs[0])

    return dataset

In [138]:
# Clean the kidney data and preview five random rows; the cleaned frame is not assigned to a name.
clean_noisy_data(kidney_dataset).sample(5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
258,42.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,98.0,20.0,0.5,140.0,3.5,13.9,44,8400.0,5.5,no,no,no,good,no,no,0
288,56.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,70.0,46.0,1.2,135.0,4.9,15.9,50,11000.0,5.1,,,,good,no,no,0
124,65.0,100.0,1.015,0.0,0.0,,normal,notpresent,notpresent,90.0,98.0,2.5,,,9.1,28,5500.0,3.6,yes,no,no,good,no,no,1
177,65.0,80.0,1.015,2.0,1.0,normal,normal,present,notpresent,215.0,133.0,2.5,,,13.2,41,,,no,yes,no,good,no,no,1
341,63.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,130.0,37.0,0.9,150.0,5.0,13.4,41,7300.0,4.7,no,no,no,good,no,no,0


In [6]:
# Load the banknote authentication data (comma-separated text file, relative path).
bank_dataset = pd.read_csv('../data/banknote/data_banknote_authentication.txt',sep=',')

In [7]:
# Preview five randomly drawn rows; no random_state is set, so the rows differ on every run.
bank_dataset.sample(5)

Unnamed: 0,variance,skewness,curtosis,entropy,class
1341,-2.2625,-0.09934,2.8127,0.48662,1
54,4.9264,5.496,-2.4774,-0.50648,0
693,1.5478,9.1814,-1.6326,-1.7375,0
752,0.38478,6.5989,-0.3336,-0.56466,0
210,4.2899,9.1814,-4.6067,-4.3263,0


In [139]:
# Clean the banknote data and display the result; the returned frame is not assigned to a name.
clean_noisy_data(bank_dataset)

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.62160,8.66610,-2.80730,-0.44699,1
1,4.54590,8.16740,-2.45860,-1.46210,1
2,3.86600,-2.63830,1.92420,0.10645,1
3,3.45660,9.52280,-4.01120,-3.59440,1
4,0.32924,-4.45520,4.57180,-0.98880,1
...,...,...,...,...,...
1367,0.40614,1.34920,-1.45010,-0.55949,0
1368,-1.38870,-4.87730,6.47740,0.34179,0
1369,-3.75030,-13.45860,17.59320,-2.77710,0
1370,-3.56370,-8.38270,12.39300,-1.28230,0
