In [1]:
import numpy as np
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator, ExpectationMaximization

In [2]:
dfDict = {
    'age': [], 'bp': [], 'sg': [], 'al': [], 'su': [],
    'rbc': [], 'pc': [], 'pcc': [], 'ba': [], 'bgr': [],
    'bu': [], 'sc': [], 'sod': [], 'pot': [], 'hemo': [],
    'pcv': [], 'wbcc': [], 'rbcc': [], 'htn': [], 'dm': [],
    'cad': [], 'appet': [], 'pe': [], 'ane': [], 'class': []
}
mapIndexToKey = dict(zip(np.arange(25), dfDict.keys()))
with open("dataset/chronic_kidney_disease.arff", "r") as f:
    for line in f:
        if line[0] == '@':
            continue
        line = line.strip()
        if line == '':
            continue
        line = line.replace('\t', '').split(',')
        index = 0
        for item in line:
            if item == '':
                continue
            if item == '?':
                dfDict[mapIndexToKey[index]].append(np.nan)
            elif index == 21:
                dfDict[mapIndexToKey[index]].append(int(item == 'good'))
            elif index == 24:
                dfDict[mapIndexToKey[index]].append(int(item == 'ckd'))
            elif index in [5, 6]:
                dfDict[mapIndexToKey[index]].append(int(item == 'normal'))
            elif index in [7, 8]:
                dfDict[mapIndexToKey[index]].append(int(item == 'present'))
            elif index in [18, 19, 20, 22, 23]:
                dfDict[mapIndexToKey[index]].append(int(item == 'yes'))
            else:
                dfDict[mapIndexToKey[index]].append(float(item))
            index += 1
df = pd.DataFrame(dfDict)
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,,1.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.020,4.0,0.0,,1.0,0.0,0.0,,...,38.0,6000.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.010,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31.0,7500.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.010,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,140.0,...,47.0,6700.0,4.9,0.0,0.0,0.0,1.0,0.0,0.0,0
396,42.0,70.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,75.0,...,54.0,7800.0,6.2,0.0,0.0,0.0,1.0,0.0,0.0,0
397,12.0,80.0,1.020,0.0,0.0,1.0,1.0,0.0,0.0,100.0,...,49.0,6600.0,5.4,0.0,0.0,0.0,1.0,0.0,0.0,0
398,17.0,60.0,1.025,0.0,0.0,1.0,1.0,0.0,0.0,114.0,...,51.0,7200.0,5.9,0.0,0.0,0.0,1.0,0.0,0.0,0


In [3]:
cleanedDf = df[['age', 'bp', 'su', 'rbc', 'bgr', 'sod', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']]
# Rename 'class' column to 'ckd' to match the edge definitions
cleanedDf = cleanedDf.rename(columns={'class': 'ckd'})
cleanedDf

Unnamed: 0,age,bp,su,rbc,bgr,sod,htn,dm,cad,appet,pe,ane,ckd
0,48.0,80.0,0.0,,121.0,,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,0.0,,,,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,3.0,1.0,423.0,,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,0.0,1.0,117.0,111.0,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,0.0,1.0,106.0,,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,0.0,1.0,140.0,150.0,0.0,0.0,0.0,1.0,0.0,0.0,0
396,42.0,70.0,0.0,1.0,75.0,141.0,0.0,0.0,0.0,1.0,0.0,0.0,0
397,12.0,80.0,0.0,1.0,100.0,137.0,0.0,0.0,0.0,1.0,0.0,0.0,0
398,17.0,60.0,0.0,1.0,114.0,135.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [4]:
edges = [
    ('ckd', 'sod'),
    ('ckd', 'bp'),
    ('ckd', 'rbc'),
    ('appet', 'ckd'),
    ('age', 'ckd'),
    ('dm', 'ckd'),
    ('htn', 'ckd'),
    ('ane', 'ckd'),
    ('rbc', 'ane'),
    ('sod', 'htn'),
    ('bp', 'htn'),
    ('appet', 'htn'),
    ('age', 'htn'),
    ('htn', 'pe'),
    ('htn', 'cad'),
    ('dm', 'cad'),
    ('appet', 'cad'),
    ('age', 'cad'),
    ('su', 'dm'),
    ('bgr', 'dm'),
    ('appet', 'dm'),
    ('age', 'dm')
]
edges

[('ckd', 'sod'),
 ('ckd', 'bp'),
 ('ckd', 'rbc'),
 ('appet', 'ckd'),
 ('age', 'ckd'),
 ('dm', 'ckd'),
 ('htn', 'ckd'),
 ('ane', 'ckd'),
 ('rbc', 'ane'),
 ('sod', 'htn'),
 ('bp', 'htn'),
 ('appet', 'htn'),
 ('age', 'htn'),
 ('htn', 'pe'),
 ('htn', 'cad'),
 ('dm', 'cad'),
 ('appet', 'cad'),
 ('age', 'cad'),
 ('su', 'dm'),
 ('bgr', 'dm'),
 ('appet', 'dm'),
 ('age', 'dm')]

In [5]:
def score_function(method='bic-d'):
    hillClimb = HillClimbSearch(
        cleanedDf
    )
    return hillClimb.estimate(scoring_method = method)

In [6]:
hc = HillClimbSearch(
    cleanedDf
)
hillClimb = score_function(method='bic-d')
hillClimbedEdges = hillClimb.edges
hillClimbNodes = hillClimb.nodes

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [7]:
hillClimbedEdges

OutEdgeView([('rbc', 'age'), ('rbc', 'bgr'), ('rbc', 'bp'), ('rbc', 'sod'), ('rbc', 'ckd'), ('rbc', 'dm'), ('rbc', 'htn'), ('rbc', 'appet'), ('rbc', 'su'), ('rbc', 'pe'), ('rbc', 'ane'), ('rbc', 'cad'), ('htn', 'ckd'), ('dm', 'su'), ('cad', 'htn'), ('ckd', 'dm'), ('ckd', 'appet'), ('ckd', 'pe'), ('ckd', 'ane')])

In [8]:
model = DiscreteBayesianNetwork(hillClimbedEdges)

In [9]:
em = ExpectationMaximization(model, cleanedDf)
em.model.fit(cleanedDf)

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


<pgmpy.models.DiscreteBayesianNetwork.DiscreteBayesianNetwork at 0x1427a266d50>

In [10]:
for cpd in em.model.get_cpds(): # type: ignore
    print(cpd)

+----------+----------+
| rbc(0.0) | 0.189516 |
+----------+----------+
| rbc(1.0) | 0.810484 |
+----------+----------+
+-----------+----------------------+----------+
| rbc       | rbc(0.0)             | rbc(1.0) |
+-----------+----------------------+----------+
| age(2.0)  | 0.0                  | 0.005    |
+-----------+----------------------+----------+
| age(3.0)  | 0.0                  | 0.005    |
+-----------+----------------------+----------+
| age(4.0)  | 0.0                  | 0.0      |
+-----------+----------------------+----------+
| age(5.0)  | 0.0                  | 0.0      |
+-----------+----------------------+----------+
| age(6.0)  | 0.022727272727272728 | 0.0      |
+-----------+----------------------+----------+
| age(7.0)  | 0.0                  | 0.0      |
+-----------+----------------------+----------+
| age(8.0)  | 0.0                  | 0.01     |
+-----------+----------------------+----------+
| age(11.0) | 0.0                  | 0.0      |
+-----------+---

In [11]:
def get_EM(X_train, y_train):
    hillClimbedEdges = hillClimb.edges
    X_train['ckd'] = y_train
    model = DiscreteBayesianNetwork(hillClimbedEdges)
    em = ExpectationMaximization(model, X_train)
    em.model.fit(X_train)
    return em

In [12]:
from sklearn.model_selection import train_test_split
X = cleanedDf[['age', 'bp', 'su', 'rbc', 'bgr', 'sod', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']]
y = cleanedDf['ckd']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [13]:
new_em = get_EM(X_train, y_train)
new_em_model = new_em.model

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'bp': 'N', 'su': 'N', 'rbc': 'N', 'bgr': 'N', 'sod': 'N', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


In [14]:
for cpd in new_em_model.get_cpds(): # type: ignore
    print(cpd)

+----------+---------+
| rbc(0.0) | 0.19186 |
+----------+---------+
| rbc(1.0) | 0.80814 |
+----------+---------+
+-----------+---------------------+----------------------+
| rbc       | rbc(0.0)            | rbc(1.0)             |
+-----------+---------------------+----------------------+
| age(2.0)  | 0.0                 | 0.007246376811594203 |
+-----------+---------------------+----------------------+
| age(4.0)  | 0.0                 | 0.0                  |
+-----------+---------------------+----------------------+
| age(5.0)  | 0.0                 | 0.0                  |
+-----------+---------------------+----------------------+
| age(6.0)  | 0.03225806451612903 | 0.0                  |
+-----------+---------------------+----------------------+
| age(7.0)  | 0.0                 | 0.0                  |
+-----------+---------------------+----------------------+
| age(8.0)  | 0.0                 | 0.014492753623188406 |
+-----------+---------------------+----------------------+


In [15]:
def prob_impute(sample):
 sample = sample.copy()

 # Step 1: Find the number of missing child heights for that gender.
 num_null = sample.isna().sum()

 # Step 2: Sample num_null observed child heights for that gender.
 fill_values = np.random.choice(sample.dropna(), num_null)

 # Step 3: Fill in missing values and return ser.
 sample[sample.isna()] = fill_values
 return sample

In [16]:
for c in X_test.columns:
    print(c)
    X_test[c] = prob_impute(X_test[c])

age
bp
su
rbc
bgr
sod
htn
dm
cad
appet
pe
ane


In [17]:
y_pred = new_em_model.predict(X_test)

  0%|          | 0/120 [00:00<?, ?it/s]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
from sklearn.metrics import f1_score

y_true = [0, 1, 2, 2, 2, 2, 1, 0, 2, 1, 0]
y_pred = [0, 0, 2, 2, 1, 2, 1, 0, 1, 2, 1]

f1_per_class = f1_score(y_true, y_pred, average=None)
f1_micro = f1_score(y_true, y_pred, average='micro')
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_weighted = f1_score(y_true, y_pred, average='weighted')

print("F1 score per class:", f1_per_class)
print("Micro-average F1 score:", f1_micro)
print("Macro-average F1 score:", f1_macro)
print("Weighted-average F1 score:", f1_weighted)

In [None]:
from sklearn.metrics import precision_score

y_true = []
y_pred = []

precision = precision_score(y_true, y_pred)
print("Precision:", precision)

In [19]:
# Discretization utilities
import numpy as np
import pandas as pd

# Columns that are continuous (adjust if needed)
CONTINUOUS_COLS = ['age', 'bp', 'bgr', 'sod']  # su/rbc are already low-cardinality; keep as-is
TARGET_COL = 'ckd'

def compute_bin_edges(series: pd.Series, n_bins: int = 4):
    """Compute quantile-based bin edges for a training series, expanding to +/- inf."""
    quantiles = np.linspace(0, 1, n_bins + 1)
    vals = series.dropna().quantile(quantiles).values
    vals[0] = -np.inf
    vals[-1] = np.inf
    # Ensure strictly increasing
    edges = np.unique(vals)
    if len(edges) < 3:  # Fallback to unique sorted values extended
        unique_vals = np.sort(series.dropna().unique())
        edges = np.concatenate(([-np.inf], unique_vals, [np.inf]))
    return edges

def apply_edges(series: pd.Series, edges):
    return pd.cut(series, bins=edges, include_lowest=True).astype(str)

def discretize_train_test(train_df: pd.DataFrame, test_df: pd.DataFrame, continuous_cols=None, n_bins: int = 4):
    if continuous_cols is None:
        continuous_cols = CONTINUOUS_COLS
    bin_edges = {}
    train_disc = train_df.copy()
    test_disc = test_df.copy()
    for col in continuous_cols:
        edges = compute_bin_edges(train_df[col], n_bins=n_bins)
        bin_edges[col] = edges
        train_disc[col] = apply_edges(train_df[col], edges)
        test_disc[col] = apply_edges(test_df[col], edges)
    # Ensure remaining non-target cols are strings (pgmpy discrete states) if not numeric small-cardinality
    for df_tmp in [train_disc, test_disc]:
        for col in df_tmp.columns:
            if col == TARGET_COL:
                continue
            if df_tmp[col].dtype.kind in ['i','f'] and df_tmp[col].nunique() > 10:
                df_tmp[col] = df_tmp[col].astype(str)
    return train_disc, test_disc, bin_edges

print("Discretization utilities ready.")

Discretization utilities ready.


In [24]:
# Helper functions for structure learning, EM fitting, prediction, and probability extraction
from pgmpy.estimators import HillClimbSearch, ExpectationMaximization
from pgmpy.inference import VariableElimination
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

def learn_structure(train_df):
    hc = HillClimbSearch(train_df)
    dag = hc.estimate(scoring_method='bic-d')
    return dag

def fit_em(dag, train_df):
    model = DiscreteBayesianNetwork(dag.edges())
    em = ExpectationMaximization(model, train_df)
    em.model.fit(train_df)
    return em.model

def predict_ckd(model, test_evidence_df):
    """Predict target by providing evidence WITHOUT the target column."""
    predicted = model.predict(test_evidence_df)
    return predicted[TARGET_COL].astype(int).values

def predict_prob(model, row):
    infer = VariableElimination(model)
    # Exclude target and any NaN evidence variables
    evidence = {col: row[col] for col in row.index if col != TARGET_COL and pd.notna(row[col])}
    q = infer.query([TARGET_COL], evidence=evidence)
    state_names = list(q.state_names[TARGET_COL])
    # Try integer states first
    if 1 in state_names:
        idx = state_names.index(1)
    elif '1' in state_names:
        idx = state_names.index('1')
    else:
        # Heuristic: choose state with max value assuming higher means CKD
        try:
            # Map numeric-like states
            numeric_states = [float(s) if not isinstance(s, (int, float)) else float(s) for s in state_names]
            idx = int(np.argmax(numeric_states))
        except Exception:
            idx = -1
    return float(q.values[idx])

print("Helper functions updated (NaN evidence skipped).")

Helper functions updated (NaN evidence skipped).


In [25]:
# Stratified K-Fold Cross Validation for Bayesian Network
from collections import defaultdict

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

X_all = cleanedDf.drop(columns=[TARGET_COL])
y_all = cleanedDf[TARGET_COL]

fold_metrics = []
probabilities = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_all, y_all), 1):
    train_df = cleanedDf.iloc[train_idx].copy()
    test_df = cleanedDf.iloc[test_idx].copy()

    # Discretize (fit on train, apply to test)
    train_disc, test_disc, bin_edges = discretize_train_test(train_df, test_df, continuous_cols=CONTINUOUS_COLS, n_bins=4)

    # Learn structure on discretized training data
    dag = learn_structure(train_disc)

    # Fit EM on discretized training data
    model = fit_em(dag, train_disc)

    # Predict labels
    y_pred = predict_ckd(model, test_disc.drop(columns=[TARGET_COL]))
    y_true = test_disc[TARGET_COL].astype(int).values

    # Probability for each sample
    for _, row in test_disc.iterrows():
        p = predict_prob(model, row)
        probabilities.append({'fold': fold, 'prob_ckd': p, 'true': int(row[TARGET_COL])})

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='binary')
    fold_metrics.append({'fold': fold, 'accuracy': acc, 'f1': f1})
    print(f"Fold {fold}: accuracy={acc:.3f} f1={f1:.3f}")

# Aggregate metrics
acc_mean = np.mean([m['accuracy'] for m in fold_metrics])
f1_mean = np.mean([m['f1'] for m in fold_metrics])
print(f"\nMean accuracy: {acc_mean:.3f}")
print(f"Mean F1: {f1_mean:.3f}")

# Simple probability calibration insight
probs_df = pd.DataFrame(probabilities)
print(probs_df.head())
print("Probability stats:")
print(probs_df.groupby('true')['prob_ckd'].describe())

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 

  0%|          | 0/1000000 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


  0%|          | 0/75 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 

Fold 1: accuracy=0.950 f1=0.958


  0%|          | 0/1000000 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


  0%|          | 0/77 [00:00<?, ?it/s]

  phi.values = phi.values / (phi.values.sum())
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C

Fold 2: accuracy=0.887 f1=0.901


  0%|          | 0/1000000 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


  0%|          | 0/77 [00:00<?, ?it/s]

  phi.values = phi.values / (phi.values.sum())
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C

Fold 3: accuracy=0.887 f1=0.901


  0%|          | 0/1000000 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


  0%|          | 0/76 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 

Fold 4: accuracy=0.912 f1=0.925


  0%|          | 0/1000000 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}


  0%|          | 0/76 [00:00<?, ?it/s]

Fold 5: accuracy=0.938 f1=0.947

Mean accuracy: 0.915
Mean F1: 0.927
   fold  prob_ckd  true
0     1  0.397685     1
1     1  1.000000     1
2     1  1.000000     1
3     1  1.000000     1
4     1  1.000000     1
Probability stats:
      count      mean       std       min       25%       50%       75%  \
true                                                                      
0     150.0  0.019230  0.038864  0.000814  0.002888  0.010409  0.024848   
1     244.0  0.895178  0.287563  0.001933  1.000000  1.000000  1.000000   

           max  
true            
0     0.351565  
1     1.000000  


  phi.values = phi.values / (phi.values.sum())


In [31]:
cpd = model.get_cpds('ckd')
print(cpd)

+--------+----------+---------------------+----------+----------+
| htn    | htn(0.0) | htn(0.0)            | htn(1.0) | htn(1.0) |
+--------+----------+---------------------+----------+----------+
| rbc    | rbc(0.0) | rbc(1.0)            | rbc(0.0) | rbc(1.0) |
+--------+----------+---------------------+----------+----------+
| ckd(0) | 0.0      | 0.8740157480314961  | 0.0      | 0.0      |
+--------+----------+---------------------+----------+----------+
| ckd(1) | 1.0      | 0.12598425196850394 | 1.0      | 1.0      |
+--------+----------+---------------------+----------+----------+


In [28]:
# Compute P(ckd | htn, dm, ane, appet, age) via inference over parents
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination
import itertools

CKD_PARENTS = ['htn','dm','ane','appet','age']

full_df = cleanedDf.copy()
full_disc_age_edges = compute_bin_edges(full_df['age'], n_bins=4)
full_df['age'] = apply_edges(full_df['age'], full_disc_age_edges)

for col in CKD_PARENTS:
    if full_df[col].dtype.kind in ['f']:
        full_df[col] = full_df[col].astype(str)

ckd_edges = [(p, 'ckd') for p in CKD_PARENTS]
ckd_model = DiscreteBayesianNetwork(ckd_edges)
ckd_model.fit(full_df[CKD_PARENTS + ['ckd']], estimator=BayesianEstimator, prior_type='BDeu', equivalent_sample_size=1)

infer_ckd = VariableElimination(ckd_model)
ckd_cpd = ckd_model.get_cpds('ckd')
print("CKD CPD summary:")
print(ckd_cpd)

# Parent states (filter out any nan-like state strings)
parent_states_clean = {}
for p in CKD_PARENTS:
    states = [s for s in ckd_cpd.state_names.get(p, []) if str(s) != 'nan']
    parent_states_clean[p] = states

rows = []
for combo in itertools.product(*[parent_states_clean[p] for p in CKD_PARENTS]):
    evidence = {p: combo[i] for i, p in enumerate(CKD_PARENTS)}
    q = infer_ckd.query(['ckd'], evidence=evidence)
    state_names = list(q.state_names['ckd'])
    if 1 in state_names:
        idx1 = state_names.index(1)
    elif '1' in state_names:
        idx1 = state_names.index('1')
    else:
        # choose the state with higher numeric value as positive
        numeric_states = [float(s) if not isinstance(s,(int,float)) else float(s) for s in state_names]
        idx1 = int(max(range(len(numeric_states)), key=lambda k: numeric_states[k]))
    prob_ckd1 = float(q.values[idx1])
    row = dict(evidence)
    row['P(ckd=1|parents)'] = prob_ckd1
    rows.append(row)

ckd_cond_df = pd.DataFrame(rows)
print("\nFirst 10 conditional probabilities:")
print(ckd_cond_df)

# Example: probability when htn=1, dm=1, ane=1, appet=1, age highest bin
highest_age_bin = sorted(parent_states_clean['age'])[-1]
example = ckd_cond_df[(ckd_cond_df['htn']==1) & (ckd_cond_df['dm']==1) & (ckd_cond_df['ane']==1) & (ckd_cond_df['appet']==1) & (ckd_cond_df['age']==highest_age_bin)]
print(f"\nExample P(ckd=1 | htn=1, dm=1, ane=1, appet=1, age={highest_age_bin}) =", example['P(ckd=1|parents)'].values)


INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'htn': 'C', 'dm': 'C', 'ane': 'C', 'appet': 'C', 'age': 'C', 'ckd': 'N'}


CKD CPD summary:
+--------+-----------------------+-----+------------+------------+
| age    | age((-inf, 42.0])     | ... | age(nan)   | age(nan)   |
+--------+-----------------------+-----+------------+------------+
| ane    | ane(0.0)              | ... | ane(nan)   | ane(nan)   |
+--------+-----------------------+-----+------------+------------+
| appet  | appet(0.0)            | ... | appet(nan) | appet(nan) |
+--------+-----------------------+-----+------------+------------+
| dm     | dm(0.0)               | ... | dm(nan)    | dm(nan)    |
+--------+-----------------------+-----+------------+------------+
| htn    | htn(0.0)              | ... | htn(1.0)   | htn(nan)   |
+--------+-----------------------+-----+------------+------------+
| ckd(0) | 0.0003084515731030228 | ... | 0.5        | 0.5        |
+--------+-----------------------+-----+------------+------------+
| ckd(1) | 0.999691548426897     | ... | 0.5        | 0.5        |
+--------+-----------------------+-----+-----

In [None]:
# Compare CKD conditional probabilities: Minimal Bayesian CPD vs Full EM Model (with dtype harmonization)
from pgmpy.estimators import ExpectationMaximization
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.inference import VariableElimination
import pandas as pd
import numpy as np
import itertools

print("\n=== Building full EM model on discretized data ===")
full_train_disc, full_test_disc, full_edges_map = discretize_train_test(cleanedDf, cleanedDf, continuous_cols=CONTINUOUS_COLS, n_bins=4)
full_disc = full_train_disc
for col in full_disc.columns:
    if col != TARGET_COL and full_disc[col].dtype.kind in ['f','i'] and full_disc[col].nunique() > 10:
        full_disc[col] = full_disc[col].astype(str)

try:
    full_edges = list(hillClimbedEdges)
except Exception:
    hc_tmp = HillClimbSearch(full_disc)
    full_dag = hc_tmp.estimate(scoring_method='bic-d')
    full_edges = list(full_dag.edges())

full_model = DiscreteBayesianNetwork(full_edges)
em_full = ExpectationMaximization(full_model, full_disc)
em_full.model.fit(full_disc)
print("Full EM model fitted.")

ckd_parents_full = em_full.model.get_parents(TARGET_COL)
print("CKD parents in full EM model:", ckd_parents_full)

desired_parents = CKD_PARENTS
missing = set(desired_parents) - set(ckd_parents_full)
extra = set(ckd_parents_full) - set(desired_parents)
if missing:
    print("WARNING: Desired parents missing in full model:", missing)
if extra:
    print("NOTE: Full model has extra parents; will marginalize them:", extra)

infer_full = VariableElimination(em_full.model)
parent_states_clean_full = {}
for p in desired_parents:
    try:
        cpds = em_full.model.get_cpds(p)
        states = [s for s in cpds.state_names[p] if str(s) != 'nan']
    except Exception:
        states = [s for s in full_disc[p].unique() if str(s) != 'nan']
    parent_states_clean_full[p] = states

rows_full = []
for combo in itertools.product(*[parent_states_clean_full[p] for p in desired_parents]):
    evidence = {p: combo[i] for i, p in enumerate(desired_parents)}
    q = infer_full.query([TARGET_COL], evidence=evidence)
    state_names = list(q.state_names[TARGET_COL])
    if 1 in state_names:
        idx1 = state_names.index(1)
    elif '1' in state_names:
        idx1 = state_names.index('1')
    else:
        numeric_states = [float(s) if not isinstance(s,(int,float)) else float(s) for s in state_names]
        idx1 = int(np.argmax(numeric_states))
    prob_ckd1 = float(q.values[idx1])
    row = dict(evidence)
    row['P_full_EM(ckd=1|parents)'] = prob_ckd1
    rows_full.append(row)

full_cond_df = pd.DataFrame(rows_full)
print("Full EM conditional probabilities (head):")
print(full_cond_df.head())

# Harmonize dtypes before merge
numeric_parents = ['htn','dm','ane','appet']
for col in numeric_parents:
    if col in full_cond_df:
        full_cond_df[col] = full_cond_df[col].astype(float)
if 'ckd_cond_df' in globals():
    for col in numeric_parents:
        if col in ckd_cond_df:
            ckd_cond_df[col] = ckd_cond_df[col].astype(float)

if 'ckd_cond_df' in globals():
    merged = pd.merge(ckd_cond_df, full_cond_df, on=desired_parents, how='inner')
    merged['abs_diff'] = (merged['P(ckd=1|parents)'] - merged['P_full_EM(ckd=1|parents)']).abs()
    print("\nMerged comparison (head):")
    print(merged.head())
    print("\nMean absolute difference:", merged['abs_diff'].mean())
else:
    print("ckd_cond_df not found; cannot compare. Run the minimal CPD cell first.")

age_high = sorted(parent_states_clean_full['age'])[-1]
example = full_cond_df[(full_cond_df['htn']==1.0) & (full_cond_df['dm']==1.0) & (full_cond_df['ane']==1.0) & (full_cond_df['appet']==1.0) & (full_cond_df['age']==age_high)]
print(f"\nFull model P(ckd=1 | htn=1, dm=1, ane=1, appet=1, age={age_high}) =", example['P_full_EM(ckd=1|parents)'].values)


INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'C', 'bp': 'C', 'su': 'N', 'rbc': 'N', 'bgr': 'C', 'sod': 'C', 'htn': 'N', 'dm': 'N', 'cad': 'N', 'appet': 'N', 'pe': 'N', 'ane': 'N', 'ckd': 'N'}



=== Building full EM model on discretized data ===
Full EM model fitted.
CKD parents in full EM model: ['rbc', 'htn']
NOTE: Full model has extra parents; will marginalize them: {'rbc'}
Full EM conditional probabilities (head):
   htn   dm  ane  appet           age  P_full_EM(ckd=1|parents)
0  0.0  0.0  0.0    0.0  (-inf, 42.0]                  1.000000
1  0.0  0.0  0.0    0.0  (42.0, 55.0]                  1.000000
2  0.0  0.0  0.0    0.0  (55.0, 64.5]                  1.000000
3  0.0  0.0  0.0    0.0   (64.5, inf]                  1.000000
4  0.0  0.0  0.0    1.0  (-inf, 42.0]                  0.045961


ValueError: You are trying to merge on object and float64 columns for key 'htn'. If you wish to proceed you should use pd.concat