In [1]:
import numpy as np
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator, ExpectationMaximization

In [2]:
# Load the chronic-kidney-disease ARFF file into a DataFrame.
# `dfDict` maps each column name to the list of parsed values; the key order
# is the order in which the fields appear in every data row.
dfDict = {
    'age': [], 'bp': [], 'sg': [], 'al': [], 'su': [],
    'rbc': [], 'pc': [], 'pcc': [], 'ba': [], 'bgr': [],
    'bu': [], 'sc': [], 'sod': [], 'pot': [], 'hemo': [],
    'pcv': [], 'wbcc': [], 'rbcc': [], 'htn': [], 'dm': [],
    'cad': [], 'appet': [], 'pe': [], 'ane': [], 'class': []
}
# Field position -> column name (derived from dfDict instead of a hard-coded 25).
mapIndexToKey = dict(enumerate(dfDict))
# Categorical columns are encoded as 1 when the token equals the label below
# and 0 otherwise; every column not listed here is parsed as a float.
positiveLabelByColumn = {
    'rbc': 'normal', 'pc': 'normal',
    'pcc': 'present', 'ba': 'present',
    'htn': 'yes', 'dm': 'yes', 'cad': 'yes', 'pe': 'yes', 'ane': 'yes',
    'appet': 'good',
    'class': 'ckd',
}
with open("dataset/chronic_kidney_disease.arff", "r") as f:
    for lineNumber, line in enumerate(f, start=1):
        line = line.strip()
        # Skip blank lines, ARFF header directives ('@...') and ARFF comments ('%...').
        if line == '' or line[0] in '@%':
            continue
        # Tabs are removed and empty fields (e.g. from doubled or trailing
        # commas) are dropped without consuming a column position.
        tokens = [tok for tok in line.replace('\t', '').split(',') if tok != '']
        # Validate the whole row before appending anything, so a malformed row
        # is reported with its line number instead of leaving the columns with
        # unequal lengths.
        if len(tokens) != len(dfDict):
            raise ValueError(
                f"line {lineNumber}: expected {len(dfDict)} fields, found {len(tokens)}"
            )
        for index, token in enumerate(tokens):
            key = mapIndexToKey[index]
            # Trim surrounding spaces so a padded label such as ' yes' still
            # matches its expected value instead of being encoded as 0.
            token = token.strip()
            if token in ('', '?'):
                value = np.nan  # '?' marks a missing value in the file
            elif key in positiveLabelByColumn:
                value = int(token == positiveLabelByColumn[key])
            else:
                value = float(token)
            dfDict[key].append(value)
df = pd.DataFrame(dfDict)
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.020,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.010,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.010,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,140.0,...,47.0,6700.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0
396,42.0,70.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,75.0,...,54.0,7800.0,6.2,0.0,0.0,0.0,1.0,0.0,0.0,0
397,12.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,100.0,...,49.0,6600.0,5.4,0.0,0.0,0.0,1.0,0.0,0.0,0
398,17.0,60.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,114.0,...,51.0,7200.0,5.9,0.0,0.0,0.0,1.0,0.0,0.0,0


In [3]:
# Keep only the variables used for the network, and expose the target column
# 'class' under the name 'ckd' so it matches the edge definitions.
cleanedDf = (
    df[[
        'age', 'bp', 'su', 'rbc', 'bgr', 'sod',
        'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
        'class',
    ]]
    .rename(columns={'class': 'ckd'})
)
cleanedDf

Unnamed: 0,age,bp,su,rbc,bgr,sod,htn,dm,cad,appet,pe,ane,ckd
0,48.0,80.0,0.0,,121.0,,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,0.0,,,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,3.0,1.0,423.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,0.0,1.0,117.0,111.0,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,0.0,1.0,106.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,0.0,1.0,140.0,150.0,0.0,0.0,0.0,1.0,0.0,0.0,0
396,42.0,70.0,0.0,1.0,75.0,141.0,0.0,0.0,0.0,1.0,0.0,0.0,0
397,12.0,80.0,0.0,1.0,100.0,137.0,0.0,0.0,0.0,1.0,0.0,0.0,0
398,17.0,60.0,0.0,1.0,114.0,135.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [4]:
# Hand-specified directed edges as (parent, child) tuples, written here as a
# child -> parents mapping and flattened in insertion order.
# NOTE(review): as written the edges contain directed cycles
# (e.g. ckd -> rbc -> ane -> ckd), so this list does not describe a DAG.
edges = [
    (parent, child)
    for child, parents in {
        'sod': ['ckd'],
        'bp': ['ckd'],
        'rbc': ['ckd'],
        'ckd': ['appet', 'age', 'dm', 'htn', 'ane'],
        'ane': ['rbc'],
        'htn': ['sod', 'bp', 'appet', 'age'],
        'pe': ['htn'],
        'cad': ['htn', 'dm', 'appet', 'age'],
        'dm': ['su', 'bgr', 'appet', 'age'],
    }.items()
    for parent in parents
]
edges

[('ckd', 'sod'),
 ('ckd', 'bp'),
 ('ckd', 'rbc'),
 ('appet', 'ckd'),
 ('age', 'ckd'),
 ('dm', 'ckd'),
 ('htn', 'ckd'),
 ('ane', 'ckd'),
 ('rbc', 'ane'),
 ('sod', 'htn'),
 ('bp', 'htn'),
 ('appet', 'htn'),
 ('age', 'htn'),
 ('htn', 'pe'),
 ('htn', 'cad'),
 ('dm', 'cad'),
 ('appet', 'cad'),
 ('age', 'cad'),
 ('su', 'dm'),
 ('bgr', 'dm'),
 ('appet', 'dm'),
 ('age', 'dm')]

In [5]:
def score_function(method='bic-d'):
    """Run a hill-climb structure search over the global ``cleanedDf``.

    Parameters
    ----------
    method : str, default 'bic-d'
        Forwarded to ``HillClimbSearch.estimate`` as ``scoring_method``.

    Returns
    -------
    The object returned by ``HillClimbSearch.estimate``; later cells read its
    ``edges`` and ``nodes`` attributes.
    """
    searcher = HillClimbSearch(cleanedDf)
    learnedStructure = searcher.estimate(scoring_method=method)
    return learnedStructure

In [6]:
# NOTE(review): `hc` is not referenced by any other cell visible here;
# score_function() constructs its own HillClimbSearch instance.
hc = HillClimbSearch(cleanedDf)
# Learn the network structure and keep its edges and nodes for later cells.
hillClimb = score_function(method='bic-d')
hillClimbedEdges, hillClimbNodes = hillClimb.edges, hillClimb.nodes

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [7]:
# Display the edges of the structure returned by the hill-climb search.
hillClimbedEdges

OutEdgeView([('rbc', 'age'), ('rbc', 'bgr'), ('rbc', 'bp'), ('rbc', 'sod'), ('rbc', 'ckd'), ('rbc', 'dm'), ('rbc', 'htn'), ('rbc', 'appet'), ('rbc', 'su'), ('rbc', 'pe'), ('rbc', 'ane'), ('rbc', 'cad'), ('htn', 'ckd'), ('dm', 'su'), ('cad', 'htn'), ('ckd', 'dm'), ('ckd', 'appet'), ('ckd', 'pe'), ('ckd', 'ane')])

In [8]:
# Build a discrete Bayesian network whose structure is the learned edge set.
model = DiscreteBayesianNetwork(hillClimbedEdges)

In [9]:
# Wrap the network and the data in an ExpectationMaximization estimator, then
# fit the wrapped model on the same data.
# NOTE(review): the fit is invoked through the model's own `fit` method rather
# than through a method of the estimator `em`; whether EM is actually the
# algorithm that estimates the CPDs here should be confirmed against the pgmpy
# documentation.
em = ExpectationMaximization(model, cleanedDf)
em.model.fit(cleanedDf)

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


<pgmpy.models.DiscreteBayesianNetwork.DiscreteBayesianNetwork at 0x22f696b6ad0>

In [11]:
def get_EM(X_train, y_train):
    """Fit a network with the hill-climbed structure on a training split.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature columns of the training split. It is not modified.
    y_train : array-like or pandas.Series
        Target values for the same rows; stored as column ``'ckd'`` of the
        frame the model is fitted on.

    Returns
    -------
    The ``ExpectationMaximization`` instance; the fitted network is available
    as its ``model`` attribute.

    Notes
    -----
    The structure is read from the global ``hillClimb`` (its ``edges``).
    """
    # Build the training frame as a new object instead of writing the target
    # into the caller's X_train: that frame is a slice produced by
    # train_test_split, and adding 'ckd' to it in place would leave the label
    # inside the feature matrix after the call.
    trainDf = X_train.assign(ckd=y_train)
    model = DiscreteBayesianNetwork(hillClimb.edges)
    em = ExpectationMaximization(model, trainDf)
    em.model.fit(trainDf)
    return em

In [12]:
from sklearn.model_selection import train_test_split

# Features are every selected column except the target 'ckd'.
X = cleanedDf[['age', 'bp', 'su', 'rbc', 'bgr', 'sod', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']]
y = cleanedDf['ckd']
# Hold out 30% of the rows for testing. The seed is fixed so that re-running
# the notebook from a fresh kernel produces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [18]:
# Fit on the training split only and keep a handle on the fitted network.
new_em = get_EM(X_train, y_train)
new_em_model = new_em.model

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


In [None]:
# Print the conditional probability table the fitted network holds for 'ckd'.
print(new_em_model.get_cpds('ckd'))

+--------+----------+---------------------+----------+----------+
| htn    | htn(0.0) | htn(0.0)            | htn(1.0) | htn(1.0) |
+--------+----------+---------------------+----------+----------+
| rbc    | rbc(0.0) | rbc(1.0)            | rbc(0.0) | rbc(1.0) |
+--------+----------+---------------------+----------+----------+
| ckd(0) | 0.0      | 0.8899082568807339  | 0.0      | 0.0      |
+--------+----------+---------------------+----------+----------+
| ckd(1) | 1.0      | 0.11009174311926606 | 1.0      | 1.0      |
+--------+----------+---------------------+----------+----------+


In [None]:
# Predict the variables missing from X_test with the fitted network.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'list' object has no attribute 'columns'"; the cause is not
# visible in this notebook — TODO investigate before relying on y_pred.
y_pred = new_em_model.predict(X_test)

AttributeError: 'list' object has no attribute 'columns'