In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column labels for the 25 fields of the dataset, in file order.
header = [
    'age', 'bp', 'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba', 'bgr',
    'bu', 'sc', 'sod', 'pot', 'hemo',
    'pcv', 'wc', 'rc', 'htn', 'dm',
    'cad', 'appet', 'pe', 'ane', 'class',
]

In [58]:
# Parse the ARFF file with the CSV reader. header=None means no line of the file is
# used for column labels; the labels come from `header` instead. Afterwards keep only
# the rows in which no field was parsed as NaN.
# NOTE(review): presumably the dropna is what discards the ARFF preamble lines
# (shorter than 25 fields, hence NaN-padded) — confirm against the file.
df = (
    pd.read_csv(
        './Data/chronic_kidney_disease_full.arff',
        names=header,
        header=None,
    )
    .dropna(how='any', axis=0)
)

In [59]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
143,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
144,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
145,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
146,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
147,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [60]:
# (rows, columns) of the frame that remains after rows containing NaN were dropped.
df.shape

(399, 25)

All the raw data has been read in

In [61]:
# Print the distinct raw values of each column, one array per column in `header` order,
# to spot malformed entries.
for column_name in header:
    distinct_values = df[column_name].unique()
    print(distinct_values)

['48' '7' '62' '51' '60' '68' '24' '52' '53' '50' '63' '40' '47' '61' '21'
 '42' '75' '69' '?' '73' '70' '65' '76' '72' '82' '46' '45' '35' '54' '11'
 '59' '67' '15' '55' '44' '26' '64' '56' '5' '74' '38' '58' '71' '34' '17'
 '12' '43' '41' '57' '8' '39' '66' '81' '14' '27' '83' '30' '4' '3' '6'
 '32' '80' '49' '90' '78' '19' '2' '33' '36' '37' '23' '25' '20' '29' '28'
 '22' '79']
['80' '50' '70' '90' '?' '100' '60' '110' '140' '180' '120']
['1.020' '1.010' '1.005' '1.015' '?' '1.025']
['1' '4' '2' '3' '0' '?' '5']
['0' '3' '4' '1' '?' '2' '5']
['?' 'normal' 'abnormal']
['normal' 'abnormal' '?']
['notpresent' 'present' '?']
['notpresent' 'present' '?']
['121' '?' '423' '117' '106' '74' '100' '410' '138' '70' '490' '380' '208'
 '98' '157' '76' '99' '114' '263' '173' '95' '108' '156' '264' '123' '93'
 '107' '159' '140' '171' '270' '92' '137' '204' '79' '207' '124' '144'
 '91' '162' '246' '253' '141' '182' '86' '150' '146' '425' '112' '250'
 '360' '163' '129' '133' '102' '158' '165' '132'

According to the dataset description, a question mark (`?`) denotes a missing value. In addition, some values in the dataset contain a stray tab character (`'\t'`). My guess is that whoever entered the data occasionally typed an extra tab by accident. The following steps assume that `'\t'` carries no meaning and needs to be removed from the data.

In [62]:
def remove_tab(element):
    """Return ``element`` with every tab character removed.

    Written for element-wise application to a DataFrame, so it has to cope
    with whatever a cell may hold.

    Parameters
    ----------
    element : object
        A single cell value.

    Returns
    -------
    object
        For a ``str``, a copy of it without any tab characters (the same
        text if it contained none). Any non-string value, such as a float
        NaN, is returned unchanged.
    """
    # Non-string cells do not support the `in` / `.replace` string operations;
    # pass them through instead of raising TypeError.
    if not isinstance(element, str):
        return element
    # str.replace is already a no-op when there is no tab, so no membership test is needed.
    return element.replace('\t', '')

In [63]:
# Strip tab characters from every cell of the frame.
# The element-wise pass is done column by column with Series.map because
# DataFrame.applymap is deprecated since pandas 2.1; this form works on old and new versions.
df = df.apply(lambda column: column.map(remove_tab))

In [64]:
# Re-inspect the distinct values of each column after the tab clean-up.
for column_name in header:
    distinct_values = df[column_name].unique()
    print(distinct_values)

['48' '7' '62' '51' '60' '68' '24' '52' '53' '50' '63' '40' '47' '61' '21'
 '42' '75' '69' '?' '73' '70' '65' '76' '72' '82' '46' '45' '35' '54' '11'
 '59' '67' '15' '55' '44' '26' '64' '56' '5' '74' '38' '58' '71' '34' '17'
 '12' '43' '41' '57' '8' '39' '66' '81' '14' '27' '83' '30' '4' '3' '6'
 '32' '80' '49' '90' '78' '19' '2' '33' '36' '37' '23' '25' '20' '29' '28'
 '22' '79']
['80' '50' '70' '90' '?' '100' '60' '110' '140' '180' '120']
['1.020' '1.010' '1.005' '1.015' '?' '1.025']
['1' '4' '2' '3' '0' '?' '5']
['0' '3' '4' '1' '?' '2' '5']
['?' 'normal' 'abnormal']
['normal' 'abnormal' '?']
['notpresent' 'present' '?']
['notpresent' 'present' '?']
['121' '?' '423' '117' '106' '74' '100' '410' '138' '70' '490' '380' '208'
 '98' '157' '76' '99' '114' '263' '173' '95' '108' '156' '264' '123' '93'
 '107' '159' '140' '171' '270' '92' '137' '204' '79' '207' '124' '144'
 '91' '162' '246' '253' '141' '182' '86' '150' '146' '425' '112' '250'
 '360' '163' '129' '133' '102' '158' '165' '132'

In [65]:
# Distinct values of the 'dm' column: the ' yes' entry (leading space) must become 'yes'.
df['dm'].unique()

array(['yes', 'no', ' yes', '?'], dtype=object)

In [66]:
def remove_space(element):
    """Return ``element`` with every space character removed.

    Written for element-wise application to a DataFrame, so it has to cope
    with whatever a cell may hold. Note that all spaces are removed, not
    only leading and trailing ones.

    Parameters
    ----------
    element : object
        A single cell value.

    Returns
    -------
    object
        For a ``str``, a copy of it without any ``' '`` characters (the same
        text if it contained none). Any non-string value, such as a float
        NaN, is returned unchanged.
    """
    # Non-string cells do not support the `in` / `.replace` string operations;
    # pass them through instead of raising TypeError.
    if not isinstance(element, str):
        return element
    # str.replace is already a no-op when there is no space, so no membership test is needed.
    return element.replace(' ', '')

In [67]:
# Strip space characters from every cell of the frame.
# The element-wise pass is done column by column with Series.map because
# DataFrame.applymap is deprecated since pandas 2.1; this form works on old and new versions.
df = df.apply(lambda column: column.map(remove_space))

In [68]:
# Cells that are exactly '?' become NaN so pandas treats them as missing.
df = df.replace(to_replace='?', value=np.nan)

In [69]:
# Re-inspect the distinct values of each column after '?' was replaced by NaN.
for column_name in header:
    distinct_values = df[column_name].unique()
    print(distinct_values)

['48' '7' '62' '51' '60' '68' '24' '52' '53' '50' '63' '40' '47' '61' '21'
 '42' '75' '69' nan '73' '70' '65' '76' '72' '82' '46' '45' '35' '54' '11'
 '59' '67' '15' '55' '44' '26' '64' '56' '5' '74' '38' '58' '71' '34' '17'
 '12' '43' '41' '57' '8' '39' '66' '81' '14' '27' '83' '30' '4' '3' '6'
 '32' '80' '49' '90' '78' '19' '2' '33' '36' '37' '23' '25' '20' '29' '28'
 '22' '79']
['80' '50' '70' '90' nan '100' '60' '110' '140' '180' '120']
['1.020' '1.010' '1.005' '1.015' nan '1.025']
['1' '4' '2' '3' '0' nan '5']
['0' '3' '4' '1' nan '2' '5']
[nan 'normal' 'abnormal']
['normal' 'abnormal' nan]
['notpresent' 'present' nan]
['notpresent' 'present' nan]
['121' nan '423' '117' '106' '74' '100' '410' '138' '70' '490' '380' '208'
 '98' '157' '76' '99' '114' '263' '173' '95' '108' '156' '264' '123' '93'
 '107' '159' '140' '171' '270' '92' '137' '204' '79' '207' '124' '144'
 '91' '162' '246' '253' '141' '182' '86' '150' '146' '425' '112' '250'
 '360' '163' '129' '133' '102' '158' '165' '132'

In [70]:
# Columns that will be converted to floating-point numbers.
numerical_cols = [
    'age', 'bp',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc',
]

In [71]:
# The values are still text at this point; convert the listed columns to 64-bit floats.
df[numerical_cols] = df[numerical_cols].astype(np.float64)

In [72]:
# Every column not listed in numerical_cols is stored with pandas' categorical dtype.
categorical_cols = [
    'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba',
    'htn', 'dm', 'cad',
    'appet', 'pe', 'ane',
    'class',
]
df[categorical_cols] = df[categorical_cols].astype('category')

In [76]:
# Persist the cleaned frame as CSV.
# NOTE(review): with the default arguments to_csv also writes the row index as an unnamed
# first column, and a CSV file cannot carry the float/category dtypes assigned earlier —
# confirm that downstream readers expect both.
df.to_csv('./Data/chronic_kidney_disease_cleaned.csv')