## 1. Initial Setup and Data Cleaning
Imports, cleaning up the dataset, filtering for features, discretizing the data, and test-train splitting.

In [None]:
import numpy as np
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, ExpectationMaximization
from sklearn.model_selection import train_test_split
from pgmpy.inference import VariableElimination
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [None]:
# One list per column, keyed in the order the 25 comma-separated fields
# appear in each data row of the ARFF file.
dfDict = {
    'age': [], 'bp': [], 'sg': [], 'al': [], 'su': [],
    'rbc': [], 'pc': [], 'pcc': [], 'ba': [], 'bgr': [],
    'bu': [], 'sc': [], 'sod': [], 'pot': [], 'hemo': [],
    'pcv': [], 'wbcc': [], 'rbcc': [], 'htn': [], 'dm': [],
    'cad': [], 'appet': [], 'pe': [], 'ane': [], 'ckd': []
}
# Field position -> column name.
mapIndexToKey = dict(zip(np.arange(25), dfDict.keys()))
with open("dataset/chronic_kidney_disease.arff", "r") as f:
    for line in f:
        # Lines starting with '@' carry no data values and are skipped.
        if line[0] == '@':
            continue
        line = line.strip()
        if line == '':
            continue
        line = line.replace('\t', '').split(',')
        index = 0
        for item in line:
            # Strip padding so values such as ' yes' still match the
            # categorical comparisons below instead of being coded as 0.
            item = item.strip()
            # Empty fields (e.g. from a stray comma) are dropped without
            # advancing the column position.
            if item == '':
                continue
            if item == '?':
                # '?' marks a missing value.
                dfDict[mapIndexToKey[index]].append(np.nan)
            elif index == 21:
                # appet: 1 for 'good', 0 otherwise.
                dfDict[mapIndexToKey[index]].append(int(item == 'good'))
            elif index == 24:
                # ckd (target): 1 for 'ckd', 0 otherwise.
                dfDict[mapIndexToKey[index]].append(int(item == 'ckd'))
            elif index in [5, 6]:
                # rbc, pc: 1 for 'normal', 0 otherwise.
                dfDict[mapIndexToKey[index]].append(int(item == 'normal'))
            elif index in [7, 8]:
                # pcc, ba: 1 for 'present', 0 otherwise.
                dfDict[mapIndexToKey[index]].append(int(item == 'present'))
            elif index in [18, 19, 20, 22, 23]:
                # htn, dm, cad, pe, ane: 1 for 'yes', 0 otherwise.
                dfDict[mapIndexToKey[index]].append(int(item == 'yes'))
            else:
                # Every remaining column is numeric.
                dfDict[mapIndexToKey[index]].append(float(item))
            index += 1
df = pd.DataFrame(dfDict)
df

In [None]:
# Keep only the columns used by the model. `.copy()` makes cleanedDf an
# independent frame, so later column assignments on it do not operate on a
# slice of `df` (which triggers pandas' chained-assignment warning).
cleanedDf = df[['age', 'bp', 'su', 'rbc', 'bgr', 'sod', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'ckd']].copy()
# Show the distinct values present in each retained column.
for col in cleanedDf.columns:
    print(f'{col}: {cleanedDf[col].unique()}')

In [None]:
# Discretizing the continuous variables
def _discretize(value, cutoffs, labels):
    """Map a numeric reading to a categorical label.

    Returns the label paired with the first cutoff that ``value`` is strictly
    below, or the last label when it is below none of them. Missing values
    are returned as NaN: NaN fails every ``<`` comparison, so without this
    guard a missing reading would be reported as the last category.
    """
    if pd.isna(value):
        return np.nan
    for cutoff, label in zip(cutoffs, labels):
        if value < cutoff:
            return label
    return labels[-1]


cleanedDf['age'] = cleanedDf['age'].apply(_discretize, args=((40, 60), ('Young', 'Middle-Age', 'Senior')))
cleanedDf['bp'] = cleanedDf['bp'].apply(_discretize, args=((80, 90), ('Normal', 'Stage1', 'Stage2')))
cleanedDf['bgr'] = cleanedDf['bgr'].apply(_discretize, args=((100, 126), ('Normal', 'Prediabetic', 'Diabetic')))
cleanedDf['sod'] = cleanedDf['sod'].apply(_discretize, args=((135, 146), ('Low', 'Normal', 'High')))

In [None]:
# Separate the target column from the predictors.
y = cleanedDf['ckd']
X = cleanedDf.drop(columns=['ckd'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# The training frame carries the known label alongside the features.
X_train = X_train.assign(ckd=y_train)
# The test frame gets an all-missing 'ckd' column to be filled in by inference.
X_test = X_test.assign(ckd=np.nan)

## 2. Training the model
Use hill-climb search to find the best DAG under the BIC-D scoring method, then train the model as a discrete Bayesian network using expectation maximization. Finally, use variable elimination and probabilistic imputation to fill in the missing values in the test split and create our y_pred.

In [None]:
def get_EM(trainData):
    """
    Learn a Discrete Bayesian Network from ``trainData``.

    The structure is found with Hill Climb Search using the "bic-d" score.
    The resulting model is wrapped in an ``ExpectationMaximization`` estimator
    and its parameters are fitted by calling ``fit`` on the model.

    NOTE(review): ``fit`` is called without an ``estimator`` argument, so the
    model's default estimator is used; the EM object's own parameter learning
    is not invoked here. Confirm whether EM fitting was intended.

    Parameters:
        trainData: pandas DataFrame of discrete training samples.

    Returns:
        The ``ExpectationMaximization`` object; the fitted network is
        available as its ``model`` attribute.
    """
    hc = HillClimbSearch(trainData)
    dag = hc.estimate(scoring_method="bic-d")

    model = DiscreteBayesianNetwork(dag.edges())
    # An edge list alone omits any variable the search left without an edge.
    # Add every node of the learned DAG so such variables still get a CPD and
    # can be used as evidence or query variables later.
    model.add_nodes_from(dag.nodes())

    em = ExpectationMaximization(model, trainData)
    em.model.fit(trainData)
    return em

In [None]:
# Learn the network from the training split, then show the CPD of the target node.
em = get_EM(X_train)
ckd_cpd = em.model.get_cpds('ckd')
print(ckd_cpd)

In [None]:
# Fill every missing cell of X_test with the MAP assignment returned by
# Variable Elimination, conditioning on the cells that are present in that row.
infer = VariableElimination(em.model)
for idx, row in X_test.iterrows():
    observed_mask = row.notna()
    # Observed cells become the evidence; the remaining ones are queried.
    evidence = row[observed_mask].to_dict()
    missing_vars = row.index[~observed_mask].tolist()
    if not missing_vars:
        continue
    imputed = infer.map_query(variables=missing_vars, evidence=evidence)
    for var in missing_vars:
        X_test.at[idx, var] = imputed[var]
X_test

## 3. Measure accuracy and print out outputs and findings.
Print out the F1 and precision scores, and answer the milestone 1 query.

In [None]:
# Compare the imputed 'ckd' column of the test frame against the held-out labels.
y_true, y_pred = y_test, X_test['ckd']

f1_per_class = f1_score(y_true, y_pred, average=None)
f1_micro, f1_macro, f1_weighted = (
    f1_score(y_true, y_pred, average=mode)
    for mode in ('micro', 'macro', 'weighted')
)
precision = precision_score(y_true, y_pred)

# Report each metric on its own line.
for caption, value in (
    ("F1 score per class:", f1_per_class),
    ("Micro-average F1 score:", f1_micro),
    ("Macro-average F1 score:", f1_macro),
    ("Weighted-average F1 score:", f1_weighted),
    ("Precision:", precision),
):
    print(caption, value)

In [None]:
# Milestone 1 query: distribution over 'ckd' given the observed states below.
evidence = dict(age='Senior', dm=1, bp='Stage1', appet=0, htn=1)

q = infer.query(variables=['ckd'], evidence=evidence)
print(q)