In [1]:
import pandas as pd

# Load your dataset (assuming it's a CSV file)
# Replace 'your_dataset.csv' with the actual path to your dataset
df = pd.read_csv('diabetes_dataset.csv')

# Finding min and max values of each column
min_max_values = df.agg(['min', 'max'])

# Print the result
print(min_max_values)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
min            0        0              0              0        0   0.0   
max           17      199            122             99      846  67.1   

     DiabetesPedigreeFunction  Age  Outcome  
min                     0.078   21        0  
max                     2.420   81        1  


### Binarize the data  <br>
**1. Pregnancies:**<br>
Threshold: >= 2 (Low/High)<br>
Reason: Women with more than one pregnancy are more likely to develop gestational diabetes or type 2 diabetes. A threshold of 2 can differentiate between those with few pregnancies and those with a history of multiple pregnancies.<br>
Binarization:<br>
0 for pregnancies < 2<br>
1 for pregnancies >= 2<br>
**2. Glucose:**<br>
Threshold: >= 140 mg/dL (Normal/High)<br>
Reason: Based on medical guidelines, a glucose level of 140 mg/dL or higher is generally considered a threshold for elevated blood glucose, often indicating a risk for diabetes.<br>
Binarization:<br>
0 for glucose < 140 mg/dL<br>
1 for glucose >= 140 mg/dL<br>
**3. Blood Pressure:**<br>
Threshold: >= 80 mmHg (Normal/High)<br>
Reason: The threshold of 80 mmHg diastolic pressure is based on hypertension guidelines. Blood pressure over 80 mmHg is considered high for diabetes patients.
Binarization:<br>
0 for blood pressure < 80 mmHg<br>
1 for blood pressure >= 80 mmHg<br>
**4. Skin Thickness:**<br>
Threshold: >= 20 mm (Low/High)<br>
Reason: There isn't a strict medical guideline, but generally, a thicker skin fold measurement (higher than 20mm) is associated with higher body fat percentage, which can be related to diabetes risk.<br>
Binarization:<br>
0 for skin thickness < 20 mm<br>
1 for skin thickness >= 20 mm<br>
**5. Insulin:**<br>
Threshold: >= 100 µU/mL (Low/High)<br>
Reason: Normal fasting insulin levels are below 100 µU/mL. Levels higher than this often indicate insulin resistance, a precursor to diabetes.
Binarization:<br>
0 for insulin < 100 µU/mL<br>
1 for insulin >= 100 µU/mL<br>
**6. BMI (Body Mass Index):**<br>
Threshold: >= 30 kg/m² (Normal/Obese)<br>
Reason: A BMI of 30 kg/m² or higher is generally classified as obese, which is a strong risk factor for diabetes.<br>
Binarization:<br>
0 for BMI < 30<br>
1 for BMI >= 30<br>
**7. Diabetes Pedigree Function:**<br>
Threshold: >= 0.5 (Low/High)<br>
Reason: The diabetes pedigree function indicates family history of diabetes. A value greater than 0.5 indicates a significant risk factor based on hereditary information.
Binarization:<br>
0 for pedigree function < 0.5<br>
1 for pedigree function >= 0.5<br>
**8. Age:**<br>
Threshold: >= 40 years (Young/Old)<br>
Reason: The risk of developing type 2 diabetes increases significantly after age 40, so this threshold makes sense for separating the population by risk.<br>
Binarization:<br>
0 for age < 40<br>
1 for age >= 40<br>

In [2]:
# Binarization thresholds
df['Pregnancies'] = (df['Pregnancies'] >= 2).astype(int)
df['Glucose'] = (df['Glucose'] >= 140).astype(int)
df['BloodPressure'] = (df['BloodPressure'] >= 80).astype(int)
df['SkinThickness'] = (df['SkinThickness'] >= 20).astype(int)
df['Insulin'] = (df['Insulin'] >= 100).astype(int)
df['BMI'] = (df['BMI'] >= 30).astype(int)
df['DiabetesPedigreeFunction'] = (df['DiabetesPedigreeFunction'] >= 0.5).astype(int)
df['Age'] = (df['Age'] >= 40).astype(int)

# Display binarized dataset
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI  \
0            1        1              0              1        0    1   
1            0        0              0              1        0    0   
2            1        1              0              0        0    0   
3            0        0              0              1        0    0   
4            0        0              0              1        1    1   

   DiabetesPedigreeFunction  Age  Outcome  
0                         1    1        1  
1                         0    0        0  
2                         1    0        1  
3                         0    0        0  
4                         1    0        1  


In [3]:
from sklearn.model_selection import train_test_split
# Split the data into training (80%) and testing (20%)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

### Connection between Nodes ###

Pregnancies → Glucose → Insulin → Diabete
Age       → Glucose → Blood Pressure → Diabete
BMI       → Blood Pressure → Diabete 
BMI       → Skin Thickness → Diabetes Pedigree Function
Glucose   → Diabete

In [4]:
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

model = BayesianNetwork([
    ('Age', 'Glucose'),
    ('Pregnancies', 'Glucose'),
    ('Glucose', 'Insulin'),
    ('Glucose', 'BloodPressure'),
    ('BMI', 'BloodPressure'),
    ('Glucose', 'Outcome'),
    ('BloodPressure', 'Outcome'),
    ('Insulin', 'Outcome'),
    ('BMI', 'Outcome'),
    ('BMI', 'SkinThickness'),
    ('SkinThickness', 'DiabetesPedigreeFunction')
])

ImportError: BayesianNetwork has been deprecated. Please use DiscreteBayesianNetwork instead.

Calculate the Conditional Probabilities: Use the formula for conditional probability:<br>
$$P(X | Y) = \frac{count(X \cap Y)}{count(Y)}$$

example:
$$P(Glucose=1|Age=1,Pregnancies=1) = \frac{count(Glucose=1,Age=1,Pregnancies=1)}{count(Age=1,Pregnancies=1)}$$

​


In [None]:
# Calculate CPD manually
def calculate_cpd(child, parents, df):
    # Get unique parent combinations
    parent_combinations = df[parents].drop_duplicates()

    cpd = {}
    
    for _, parent_values in parent_combinations.iterrows():
        # Filter data for the specific parent combination
        parent_condition = (df[parents] == parent_values.values).all(axis=1)
        
        # Calculate probabilities for each value of the child variable
        child_counts = df.loc[parent_condition, child].value_counts(normalize=True)
        
        # Store the result
        cpd[tuple(parent_values)] = child_counts.to_dict()

    return cpd

# Compute the CPD for Glucose given Age and Pregnancies
cpd_glucose = calculate_cpd('Glucose', ['Age', 'Pregnancies'], train_data)

# Print the resulting CPD
for parent_vals, prob_dict in cpd_glucose.items():
    print(f"P(Glucose | Age={parent_vals[0]}, Pregnancies={parent_vals[1]}): {prob_dict}")

P(Glucose | Age=0, Pregnancies=1): {0: 0.797752808988764, 1: 0.20224719101123595}
P(Glucose | Age=1, Pregnancies=1): {0: 0.5928571428571429, 1: 0.40714285714285714}
P(Glucose | Age=0, Pregnancies=0): {0: 0.827027027027027, 1: 0.17297297297297298}
P(Glucose | Age=1, Pregnancies=0): {0: 0.6363636363636364, 1: 0.36363636363636365}


In [None]:
# Calculate CPD automatically by using Estimator
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.estimators import HillClimbSearch, BicScore

# Use Maximum Likelihood Estimation to learn CPDs
model.fit(train_data, estimator=MaximumLikelihoodEstimator)

# Print the learned CPDs
for cpd in model.get_cpds():
    print(cpd)

+--------+----------+
| Age(0) | 0.736156 |
+--------+----------+
| Age(1) | 0.263844 |
+--------+----------+
+-------------+---------------------+-----+---------------------+
| Age         | Age(0)              | ... | Age(1)              |
+-------------+---------------------+-----+---------------------+
| Pregnancies | Pregnancies(0)      | ... | Pregnancies(1)      |
+-------------+---------------------+-----+---------------------+
| Glucose(0)  | 0.827027027027027   | ... | 0.5928571428571429  |
+-------------+---------------------+-----+---------------------+
| Glucose(1)  | 0.17297297297297298 | ... | 0.40714285714285714 |
+-------------+---------------------+-----+---------------------+
+----------------+----------+
| Pregnancies(0) | 0.337134 |
+----------------+----------+
| Pregnancies(1) | 0.662866 |
+----------------+----------+
+------------+--------------------+--------------------+
| Glucose    | Glucose(0)         | Glucose(1)         |
+------------+------------------

In [None]:
# Probability inference
from pgmpy.inference import VariableElimination
infer = VariableElimination(model)
# Example query: Probability of diabetes given Glucose level and Blood Pressure
result = infer.query(variables=['Outcome'], evidence={'Age':1,'Glucose': 1})
print(result)

+------------+----------------+
| Outcome    |   phi(Outcome) |
| Outcome(0) |         0.3797 |
+------------+----------------+
| Outcome(1) |         0.6203 |
+------------+----------------+


In [None]:
from sklearn.metrics import accuracy_score
# Predict the 'Outcome' (Diabetes) for each row in the test set
predictions = []
for index, row in test_data.iterrows():
    # Use evidence from the other columns to predict the Outcome
    evidences = {
        'Glucose': row['Glucose'],
        'BloodPressure': row['BloodPressure'],
        'Insulin': row['Insulin'],
        'BMI': row['BMI']
    }
    
    # Query the model to get the predicted 'Outcome'
    result = infer.map_query(variables=['Outcome'], evidence=evidences)
    predictions.append(result['Outcome'])

# Calculate accuracy by comparing the predicted and actual labels
accuracy = accuracy_score(test_data['Outcome'], predictions)

print(f"Accuracy on test data: {accuracy * 100:.2f}%")

Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it

Accuracy on test data: 77.27%



