In [1]:
import numpy as np
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, ExpectationMaximization
from sklearn.model_selection import train_test_split
from pgmpy.inference import VariableElimination
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dfDict = {
    'age': [], 'bp': [], 'sg': [], 'al': [], 'su': [],
    'rbc': [], 'pc': [], 'pcc': [], 'ba': [], 'bgr': [],
    'bu': [], 'sc': [], 'sod': [], 'pot': [], 'hemo': [],
    'pcv': [], 'wbcc': [], 'rbcc': [], 'htn': [], 'dm': [],
    'cad': [], 'appet': [], 'pe': [], 'ane': [], 'ckd': []
}
mapIndexToKey = dict(zip(np.arange(25), dfDict.keys()))
with open("dataset/chronic_kidney_disease.arff", "r") as f:
    for line in f:
        if line[0] == '@':
            continue
        line = line.strip()
        if line == '':
            continue
        line = line.replace('\t', '').split(',')
        index = 0
        for item in line:
            if item == '':
                continue
            if item == '?':
                dfDict[mapIndexToKey[index]].append(np.nan)
            elif index == 21:
                dfDict[mapIndexToKey[index]].append(int(item == 'good'))
            elif index == 24:
                dfDict[mapIndexToKey[index]].append(int(item == 'ckd'))
            elif index in [5, 6]:
                dfDict[mapIndexToKey[index]].append(int(item == 'normal'))
            elif index in [7, 8]:
                dfDict[mapIndexToKey[index]].append(int(item == 'present'))
            elif index in [18, 19, 20, 22, 23]:
                dfDict[mapIndexToKey[index]].append(int(item == 'yes'))
            else:
                dfDict[mapIndexToKey[index]].append(float(item))
            index += 1
df = pd.DataFrame(dfDict)
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,ckd
0,48.0,80.0,1.020,1.0,0.0,,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.020,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.010,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.010,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,140.0,...,47.0,6700.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0
396,42.0,70.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,75.0,...,54.0,7800.0,6.2,0.0,0.0,0.0,1.0,0.0,0.0,0
397,12.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,100.0,...,49.0,6600.0,5.4,0.0,0.0,0.0,1.0,0.0,0.0,0
398,17.0,60.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,114.0,...,51.0,7200.0,5.9,0.0,0.0,0.0,1.0,0.0,0.0,0


In [3]:
cleanedDf = df[['age', 'bp', 'su', 'rbc', 'bgr', 'sod', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'ckd']]
for col in cleanedDf.columns:
    print(f'{col}: {df[col].unique()}')

age: [48.  7. 62. 51. 60. 68. 24. 52. 53. 50. 63. 40. 47. 61. 21. 42. 75. 69.
 nan 73. 70. 65. 76. 72. 82. 46. 45. 35. 54. 11. 59. 67. 15. 55. 44. 26.
 64. 56.  5. 74. 38. 58. 71. 34. 17. 12. 43. 41. 57.  8. 39. 66. 81. 14.
 27. 83. 30.  4.  3.  6. 32. 80. 49. 90. 78. 19.  2. 33. 36. 37. 23. 25.
 20. 29. 28. 22. 79.]
bp: [ 80.  50.  70.  90.  nan 100.  60. 110. 140. 180. 120.]
su: [ 0.  3.  4.  1. nan  2.  5.]
rbc: [nan  1.  0.]
bgr: [121.  nan 423. 117. 106.  74. 100. 410. 138.  70. 490. 380. 208.  98.
 157.  76.  99. 114. 263. 173.  95. 108. 156. 264. 123.  93. 107. 159.
 140. 171. 270.  92. 137. 204.  79. 207. 124. 144.  91. 162. 246. 253.
 141. 182.  86. 150. 146. 425. 112. 250. 360. 163. 129. 133. 102. 158.
 165. 132. 104. 127. 415. 169. 251. 109. 280. 210. 219. 295.  94. 172.
 101. 298. 153.  88. 226. 143. 115.  89. 297. 233. 294. 323. 125.  90.
 308. 118. 224. 128. 122. 214. 213. 268. 256.  84. 105. 288. 139.  78.
 273. 242. 424. 303. 148. 160. 192. 307. 220. 447. 309.  22. 111.

In [4]:
cleanedDf['age'] = cleanedDf['age'].apply(lambda age: 'Young' if age < 40 else 'Middle-Age' if age < 60 else 'Senior')
cleanedDf['bp'] = cleanedDf['bp'].apply(lambda bp: 'Normal' if bp < 80 else 'Stage1' if bp < 90 else 'Stage2')
cleanedDf['bgr'] = cleanedDf['bgr'].apply(lambda bgr: 'Normal' if bgr < 100 else 'Prediabetic' if bgr < 126 else 'Diabetic')
cleanedDf['sod'] = cleanedDf['sod'].apply(lambda sod: 'Low' if sod < 135 else 'Normal' if sod < 146 else 'High')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanedDf['age'] = cleanedDf['age'].apply(lambda age: 'Young' if age < 40 else 'Middle-Age' if age < 60 else 'Senior')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanedDf['bp'] = cleanedDf['bp'].apply(lambda bp: 'Normal' if bp < 80 else 'Stage1' if bp < 90 else 'Stage2')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

In [5]:
X = cleanedDf.drop(['ckd'], axis = 1)
y = cleanedDf['ckd']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train['ckd'] = y_train
X_test['ckd'] = [np.nan] * len(X_test)

In [6]:
edges = [
    ('ckd', 'sod'),
    ('ckd', 'bp'),
    ('ckd', 'rbc'),
    ('appet', 'ckd'),
    ('age', 'ckd'),
    ('dm', 'ckd'),
    ('htn', 'ckd'),
    ('ane', 'ckd'),
    ('rbc', 'ane'),
    ('sod', 'htn'),
    ('bp', 'htn'),
    ('appet', 'htn'),
    ('age', 'htn'),
    ('htn', 'pe'),
    ('htn', 'cad'),
    ('dm', 'cad'),
    ('appet', 'cad'),
    ('age', 'cad'),
    ('su', 'dm'),
    ('bgr', 'dm'),
    ('appet', 'dm'),
    ('age', 'dm')
]
edges

[('ckd', 'sod'),
 ('ckd', 'bp'),
 ('ckd', 'rbc'),
 ('appet', 'ckd'),
 ('age', 'ckd'),
 ('dm', 'ckd'),
 ('htn', 'ckd'),
 ('ane', 'ckd'),
 ('rbc', 'ane'),
 ('sod', 'htn'),
 ('bp', 'htn'),
 ('appet', 'htn'),
 ('age', 'htn'),
 ('htn', 'pe'),
 ('htn', 'cad'),
 ('dm', 'cad'),
 ('appet', 'cad'),
 ('age', 'cad'),
 ('su', 'dm'),
 ('bgr', 'dm'),
 ('appet', 'dm'),
 ('age', 'dm')]

In [7]:
def get_EM(trainData):
    hc = HillClimbSearch(trainData)
    dag = hc.estimate(scoring_method="bic-d")

    model = DiscreteBayesianNetwork(dag.edges())

    em = ExpectationMaximization(model, trainData)
    em.model.fit(trainData)
    return em

em = get_EM(X_train)
for cpd in em.model.get_cpds():
    print(cpd)

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
  0%|          | 22/1000000 [00:01<19:47:53, 14.03it/s]
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 

+----------+----------+
| rbc(0.0) | 0.168539 |
+----------+----------+
| rbc(1.0) | 0.831461 |
+----------+----------+
+------------+--------------------+-----+---------------------+
| ckd        | ckd(0)             | ... | ckd(1)              |
+------------+--------------------+-----+---------------------+
| rbc        | rbc(0.0)           | ... | rbc(1.0)            |
+------------+--------------------+-----+---------------------+
| bp(Normal) | 0.3333333333333333 | ... | 0.5                 |
+------------+--------------------+-----+---------------------+
| bp(Stage1) | 0.3333333333333333 | ... | 0.16666666666666666 |
+------------+--------------------+-----+---------------------+
| bp(Stage2) | 0.3333333333333333 | ... | 0.3333333333333333  |
+------------+--------------------+-----+---------------------+
+------------------+-----+----------+----------------------+
| dm               | ... | dm(1.0)  | dm(1.0)              |
+------------------+-----+----------+-----------------

In [8]:
infer = VariableElimination(em.model)
for idx, row in X_test.iterrows():
    evidence = {col: row[col] for col in row.index if pd.notna(row[col])}
    missing_vars = row[row.isna()].index.tolist()
    if missing_vars:
        imputed = infer.map_query(variables=missing_vars, evidence=evidence)
        for var in missing_vars:
            X_test.at[idx, var] = imputed[var]
X_test

Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it

Unnamed: 0,age,bp,su,rbc,bgr,sod,htn,dm,cad,appet,pe,ane,ckd
154,Middle-Age,Stage2,3.0,0.0,Diabetic,Normal,1.0,1.0,0.0,0.0,1.0,1.0,1.0
73,Senior,Stage2,0.0,0.0,Diabetic,Low,1.0,0.0,0.0,1.0,1.0,1.0,1.0
371,Young,Normal,0.0,1.0,Normal,Normal,0.0,0.0,0.0,1.0,0.0,0.0,0.0
33,Senior,Stage2,0.0,0.0,Diabetic,High,1.0,0.0,0.0,0.0,0.0,0.0,1.0
244,Senior,Stage2,2.0,1.0,Diabetic,Normal,1.0,1.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,Middle-Age,Normal,0.0,1.0,Diabetic,Normal,1.0,1.0,0.0,1.0,0.0,0.0,1.0
273,Middle-Age,Stage1,0.0,1.0,Normal,Normal,0.0,0.0,0.0,1.0,0.0,0.0,0.0
113,Senior,Stage2,2.0,0.0,Diabetic,High,0.0,1.0,0.0,0.0,0.0,1.0,1.0
190,Young,Normal,0.0,0.0,Normal,Normal,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
y_true = y_test
y_pred = X_test['ckd']

f1_per_class = f1_score(y_true, y_pred, average=None)
f1_micro = f1_score(y_true, y_pred, average='micro')
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_weighted = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred)

print("F1 score per class:", f1_per_class)
print("Micro-average F1 score:", f1_micro)
print("Macro-average F1 score:", f1_macro)
print("Weighted-average F1 score:", f1_weighted)
print("Precision:", precision)

F1 score per class: [0.86046512 0.92207792]
Micro-average F1 score: 0.9
Macro-average F1 score: 0.8912715191784959
Weighted-average F1 score: 0.9030806402899426
Precision: 1.0


In [10]:
evidence = {
    'age': 'Senior',
    'dm': 1,
    'bp': 'Stage1',
    'appet': 0,
    'htn': 1
}

q = infer.query(variables=['ckd'], evidence=evidence)
print(q)

+--------+------------+
| ckd    |   phi(ckd) |
| ckd(0) |     0.0000 |
+--------+------------+
| ckd(1) |     1.0000 |
+--------+------------+


In [16]:
evidence = {
    'age': 'Senior',
    'dm': 0,
    'bp': 'Normal',
    'appet': 1,
    'htn': 0
}

q = infer.query(variables=['ckd'], evidence=evidence)
print(q)

+--------+------------+
| ckd    |   phi(ckd) |
| ckd(0) |     0.9409 |
+--------+------------+
| ckd(1) |     0.0591 |
+--------+------------+
