In [337]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
from termcolor import colored

In [338]:
import pandas as pd

# Load the raw CKD dataset; the path is relative to the notebook's working directory
path = 'data/ckd-dataset-v2.csv'
df = pd.read_csv(path)

# Preview the first five rows to see how the file is laid out
df.head()


Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,...,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete
1,,,,,,,,,,,...,,,,,,,,,class,meta
2,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
3,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
4,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12


In [339]:
# NOTE: with the default axis, dropna(how='all') removes ROWS in which every value is missing;
# dropping all-missing COLUMNS would require axis=1 (axis='columns')
df = df.dropna(how='all')

In [340]:
# The first two rows are visibly not observations, so discard them and
# renumber the remaining rows starting from 0.
# Caution: every re-run of this cell removes two more rows from df.
remaining_rows = df.iloc[2:]
df = remaining_rows.reset_index(drop=True)

In [341]:
# inspect the row at position 1 (the second row) in full, without column truncation
df.iloc[1]

bp (Diastolic)                0
bp limit                      0
sg                1.009 - 1.011
al                          < 0
class                       ckd
rbc                           0
su                          < 0
pc                            0
pcc                           0
ba                            0
bgr                   112 - 154
bu                       < 48.1
sod                   133 - 138
sc                       < 3.65
pot                      < 7.31
hemo                11.3 - 12.6
pcv                 33.5 - 37.4
rbcc                4.46 - 5.05
wbcc              12120 - 14500
htn                           0
dm                            0
cad                           0
appet                         0
pe                            0
ane                           0
grf                   ≥ 227.944
stage                        s1
affected                      1
age                        < 12
Name: 1, dtype: object

In [342]:
# re-check the first five rows now that the two leading rows have been dropped
df.head()

Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
1,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
2,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12
3,1,1,1.009 - 1.011,3 - 3,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,127.281 - 152.446,s1,1,< 12
4,0,0,1.015 - 1.017,< 0,ckd,0,< 0,0,0,0,...,0,1,0,1,1,0,127.281 - 152.446,s1,1,12 - 20


In [343]:
# count missing (NaN) values per column
df.isnull().sum()

bp (Diastolic)    0
bp limit          0
sg                0
al                0
class             0
rbc               0
su                0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sod               0
sc                0
pot               0
hemo              0
pcv               0
rbcc              0
wbcc              0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
grf               0
stage             0
affected          0
age               0
dtype: int64

In [344]:
# check which dtype pandas assigned to each column
df.dtypes

bp (Diastolic)    object
bp limit          object
sg                object
al                object
class             object
rbc               object
su                object
pc                object
pcc               object
ba                object
bgr               object
bu                object
sod               object
sc                object
pot               object
hemo              object
pcv               object
rbcc              object
wbcc              object
htn               object
dm                object
cad               object
appet             object
pe                object
ane               object
grf               object
stage             object
affected          object
age               object
dtype: object

In [345]:
# output all values of the first row (selected by index label 0)
df.loc[0]

bp (Diastolic)                0
bp limit                      0
sg                1.019 - 1.021
al                        1 - 1
class                       ckd
rbc                           0
su                          < 0
pc                            0
pcc                           0
ba                            0
bgr                       < 112
bu                       < 48.1
sod                   138 - 143
sc                       < 3.65
pot                      < 7.31
hemo                11.3 - 12.6
pcv                 33.5 - 37.4
rbcc                4.46 - 5.05
wbcc                7360 - 9740
htn                           0
dm                            0
cad                           0
appet                         0
pe                            0
ane                           0
grf                   ≥ 227.944
stage                        s1
affected                      1
age                        < 12
Name: 0, dtype: object

In [346]:
# give the two column names that contain spaces a snake_case name (single in-place rename)
df.rename(
    columns={
        'bp (Diastolic)': 'bp_diastolic',
        'bp limit': 'bp_limit',
    },
    inplace=True,
)

In [347]:
# verify the column names after the rename
df.columns

Index(['bp_diastolic', 'bp_limit', 'sg', 'al', 'class', 'rbc', 'su', 'pc',
       'pcc', 'ba', 'bgr', 'bu', 'sod', 'sc', 'pot', 'hemo', 'pcv', 'rbcc',
       'wbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'grf', 'stage',
       'affected', 'age'],
      dtype='object')

In [348]:
# print how many distinct values each column holds
for name, values in df.items():
    print(name, values.nunique())

bp_diastolic 2
bp_limit 3
sg 5
al 5
class 2
rbc 2
su 6
pc 2
pcc 2
ba 2
bgr 10
bu 8
sod 9
sc 7
pot 4
hemo 10
pcv 10
rbcc 9
wbcc 9
htn 2
dm 2
cad 2
appet 2
pe 2
ane 2
grf 11
stage 5
affected 2
age 10


In [349]:
# show the unique values of every column (no column has overwhelmingly many distinct values,
# so the listing stays readable); the column name is highlighted in green
for name, values in df.items():
    print(colored(name, 'green'), values.unique())

[32mbp_diastolic[0m ['0' '1']
[32mbp_limit[0m ['0' '1' '2']
[32msg[0m ['1.019 - 1.021' '1.009 - 1.011' '1.015 - 1.017' '≥ 1.023' '< 1.007']
[32mal[0m ['1 - 1' '< 0' '≥ 4' '3 - 3' '2 - 2']
[32mclass[0m ['ckd' 'notckd']
[32mrbc[0m ['0' '1']
[32msu[0m ['< 0' '4 - 4' '2 - 2' '3 - 4' '1 - 2' '≥ 4']
[32mpc[0m ['0' '1']
[32mpcc[0m ['0' '1']
[32mba[0m ['0' '1']
[32mbgr[0m ['< 112' '112 - 154' '154 - 196' '406 - 448' '238 - 280' '196 - 238'
 '≥ 448' '280 - 322' '364 - 406' '322 - 364']
[32mbu[0m ['< 48.1' '48.1 - 86.2' '200.5 - 238.6' '124.3 - 162.4' '86.2 - 124.3'
 '162.4 - 200.5' '≥ 352.9' '238.6 - 276.7']
[32msod[0m ['138 - 143' '133 - 138' '123 - 128' '143 - 148' '148 - 153' '< 118'
 '128 - 133' '118 - 123' '≥ 158']
[32msc[0m ['< 3.65' '3.65 - 6.8' '16.25 - 19.4' '6.8 - 9.95' '13.1 - 16.25'
 '9.95 - 13.1' '≥ 28.85']
[32mpot[0m ['< 7.31' '≥ 42.59' '7.31 - 11.72' '38.18 - 42.59']
[32mhemo[0m ['11.3 - 12.6' '8.7 - 10' '13.9 - 15.2' '≥ 16.5' '10 - 11.3' '7.4 - 8.

In [350]:
# keep an independent copy of df as it is at this point, before the following cells overwrite its columns
df_copy = df.copy()

In [351]:
# These columns already contain plain numeric strings, so parse them first, column by column.
# errors='raise' makes an unparseable value fail loudly instead of silently becoming NaN.
to_numeric_1 = ['bp_diastolic', 'bp_limit', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad']
df[to_numeric_1] = df[to_numeric_1].apply(pd.to_numeric, errors='raise')

In [352]:
# Encode the label columns as integers: class ckd -> 1, notckd -> 0; stage 's1'..'s5' -> 1..5.
# Series.map yields NaN for any value that is absent from the mapping.
df['class'] = df['class'].map(dict(ckd=1, notckd=0))
df['stage'] = df['stage'].map({f's{level}': level for level in range(1, 6)})

In [353]:
def process_range(val):
    """Collapse one binned cell of the dataset into a single float.

    Supported inputs:
      * a plain number or numeric string (``'27'``, ``0``)  -> that number
      * a closed range ``'low - high'``                      -> midpoint of the range
      * an open-ended bin ``'< x'`` / ``'≥ x'``              -> the boundary ``x``
        (for ``'< x'`` that is the bin's upper boundary, for ``'≥ x'`` its
        lower boundary; ``'≤'``, ``'>'``, ``'<='`` and ``'>='`` are accepted too)

    Anything else (unrecognised text, ``None``, other non-string objects)
    yields ``np.nan`` instead of raising.

    Args:
        val: the raw cell value, normally a string.

    Returns:
        float: the representative value, or ``np.nan`` when ``val`` cannot be
        interpreted.
    """
    import re  # local import keeps this cell self-contained

    try:
        # Already a valid number such as '27' (also covers ints/floats and 'nan')
        return float(val)
    except (TypeError, ValueError):
        # TypeError: None and other non-numeric objects; ValueError: non-numeric text
        pass

    if not isinstance(val, str):
        return np.nan

    # Signed decimal with optional exponent. Parsing numbers explicitly (rather
    # than splitting on '-') keeps negative bounds and exponents such as
    # '1e-3' intact.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    text = val.strip()

    range_match = re.fullmatch(rf'({number})\s*-\s*({number})', text)
    if range_match:
        low, high = map(float, range_match.groups())
        return (low + high) / 2  # midpoint of the range

    bound_match = re.fullmatch(rf'(?:<=|>=|<|>|≤|≥)\s*({number})', text)
    if bound_match:
        return float(bound_match.group(1))  # the stated boundary of the open-ended bin

    return np.nan  # anything unexpected

In [354]:
# Convert every remaining column cell by cell with process_range; the columns
# parsed with pd.to_numeric and the two mapped label columns are skipped.
already_converted = set(to_numeric_1) | {'class', 'stage'}
for name in df.columns:
    if name in already_converted:
        continue
    df[name] = df[name].apply(process_range)

In [355]:
# check the resulting dtypes after the conversions
df.dtypes

bp_diastolic      int64
bp_limit          int64
sg              float64
al              float64
class             int64
rbc               int64
su              float64
pc                int64
pcc               int64
ba                int64
bgr             float64
bu              float64
sod             float64
sc              float64
pot             float64
hemo            float64
pcv             float64
rbcc            float64
wbcc            float64
htn               int64
dm                int64
cad               int64
appet           float64
pe              float64
ane             float64
grf             float64
stage             int64
affected        float64
age             float64
dtype: object

In [356]:
# check whether the conversion produced any NaN values (process_range returns NaN for unexpected input)
df.isnull().sum()

bp_diastolic    0
bp_limit        0
sg              0
al              0
class           0
rbc             0
su              0
pc              0
pcc             0
ba              0
bgr             0
bu              0
sod             0
sc              0
pot             0
hemo            0
pcv             0
rbcc            0
wbcc            0
htn             0
dm              0
cad             0
appet           0
pe              0
ane             0
grf             1
stage           0
affected        0
age             0
dtype: int64

In [357]:
# list the distinct grf values, including the NaN left by the conversion
df['grf'].unique()

array([227.944  , 139.8635 , 114.698  , 190.195  ,  39.20035,  64.3661 ,
        89.532  , 165.029  , 215.361  ,  26.6175 ,       nan])

In [358]:
# Tried applying the CKD-EPI formula to estimate GFR, but the estimated values were way off
# (from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2866096/).
# Instead, fill each missing grf with the median grf of the rows that share its class.
# Using the median because we transformed ranges to single values.

# find the rows with a NaN grf value and the class each of them belongs to
missing_grf = df['grf'].isnull()
class_value = df.loc[missing_grf, 'class']
print(f"class for missing grf: {class_value}")

# median grf per class (NaN entries are skipped by median)
grf_median_by_class = df.groupby('class')['grf'].median()
for cls in class_value.unique():
    print(f"Median grf for class {cls}: {grf_median_by_class.loc[cls]}")

# kept so that any later cell referencing `median_grf` still gets the class-1 median
median_grf = grf_median_by_class.get(1, np.nan)

# fill every missing grf from the median of its own row's class, instead of
# assuming the missing rows always belong to class 1
df.loc[missing_grf, 'grf'] = class_value.map(grf_median_by_class)



class for missing grf: 179    1
Name: class, dtype: int64
Median grf for class 1: 26.6175


In [359]:
# confirm that no missing values remain after filling grf
df.isnull().sum()

bp_diastolic    0
bp_limit        0
sg              0
al              0
class           0
rbc             0
su              0
pc              0
pcc             0
ba              0
bgr             0
bu              0
sod             0
sc              0
pot             0
hemo            0
pcv             0
rbcc            0
wbcc            0
htn             0
dm              0
cad             0
appet           0
pe              0
ane             0
grf             0
stage           0
affected        0
age             0
dtype: int64

In [360]:
# preview the first five rows of the converted, numeric frame
df.head()

Unnamed: 0,bp_diastolic,bp_limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,1.02,1.0,1,0,0.0,0,0,0,...,0,0,0,0.0,0.0,0.0,227.944,1,1.0,12.0
1,0,0,1.01,0.0,1,0,0.0,0,0,0,...,0,0,0,0.0,0.0,0.0,227.944,1,1.0,12.0
2,0,0,1.01,4.0,1,1,0.0,1,0,1,...,0,0,0,1.0,0.0,0.0,139.8635,1,1.0,12.0
3,1,1,1.01,3.0,1,0,0.0,0,0,0,...,0,0,0,0.0,0.0,0.0,139.8635,1,1.0,12.0
4,0,0,1.016,0.0,1,0,0.0,0,0,0,...,0,1,0,1.0,1.0,0.0,139.8635,1,1.0,16.0
