# Complex Bayesian Network

In [None]:
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown
# from this point on in the session.
warnings.filterwarnings('ignore')

In [None]:
# `display` and `HTML` are taken from the public IPython.display module.
from IPython.display import display, HTML

# Inject a CSS rule that sets the `.container` element to 80% width.
# The rule is wrapped in a properly closed <style>...</style> element.
display(HTML("<style>.container { width:80% ! important; }</style>"))

#### Import dependencies

In [None]:
# # Import dependencies
# %pip install pgmpy
# %pip install tabulate
# %pip install pandas
# %pip install networkx
# %pip install matplotlib

#### Load libraries

In [None]:
# Load libraries
import pgmpy as pg
import tabulate as tb
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sklearn

## Data import and cleaning

#### Import BigPerf data

In [None]:
# Read symptoms.csv (comma-separated, column names taken from the first row)
# into the raw DataFrame used by the rest of the notebook.
df=pd.read_csv('symptoms.csv', sep=',', header=0)

# df.head()

#### Define function to discretise data into labelled categories

In [None]:
# Category labels passed to pd.cut by make_discrete, one list per column.
# Each label has the form '<rank>. <value>'; the rank prefix presumably keeps
# the categories ordered when sorted as strings -- TODO confirm intent.
_TWO_LEVEL = ('1. 0', '2. 1')

# Every two-level column gets its own list object, so editing one column's
# labels cannot affect another column.
cough_labels = list(_TWO_LEVEL)
fever_labels = list(_TWO_LEVEL)
sob_labels = list(_TWO_LEVEL)
diarrhea_labels = list(_TWO_LEVEL)
fatigue_labels = list(_TWO_LEVEL)
headache_labels = list(_TWO_LEVEL)
loss_of_smell_labels = list(_TWO_LEVEL)
loss_of_taste_labels = list(_TWO_LEVEL)
runny_nose_labels = list(_TWO_LEVEL)
muscle_sore_labels = list(_TWO_LEVEL)
sore_throat_labels = list(_TWO_LEVEL)

# Six levels: '1. 1' through '6. 6'.
covid_severity_labels = [f'{level}. {level}' for level in range(1, 7)]

In [None]:
def make_discrete(df):
    """Bin the symptom and severity columns of ``df`` into labelled categories.

    Every column listed in the plan below is passed through ``pd.cut`` with
    equal-width bins (two for each symptom column, six for
    ``covid_severity``) and tagged with the module-level ``*_labels`` list
    of the same name.

    NOTE(review): with an integer ``bins`` argument, pd.cut derives the bin
    edges from the min/max present in the column, so the label a value gets
    depends on the data passed in. Presumably symptoms are coded 0/1 and
    severity 1-6 -- confirm against symptoms.csv.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the twelve columns named in the plan.

    Returns
    -------
    pandas.DataFrame
        A new frame with those twelve columns, in plan order, cast to
        ``object`` dtype.
    """
    # (column name, number of bins, labels), in output column order.
    binning_plan = [
        ('cough', 2, cough_labels),
        ('fever', 2, fever_labels),
        ('sob', 2, sob_labels),
        ('diarrhea', 2, diarrhea_labels),
        ('fatigue', 2, fatigue_labels),
        ('headache', 2, headache_labels),
        ('loss_of_smell', 2, loss_of_smell_labels),
        ('loss_of_taste', 2, loss_of_taste_labels),
        ('runny_nose', 2, runny_nose_labels),
        ('muscle_sore', 2, muscle_sore_labels),
        ('sore_throat', 2, sore_throat_labels),
        ('covid_severity', 6, covid_severity_labels),
    ]

    discrete_df = pd.DataFrame()
    for name, n_bins, labels in binning_plan:
        discrete_df[name] = pd.cut(df[name], bins=n_bins, labels=labels, precision=2)

    # Convert the categorical columns produced by pd.cut to plain objects.
    return discrete_df.astype('object')

#### Discretise data into categories

In [None]:
# Apply make_discrete to the raw frame loaded from symptoms.csv; cat_df holds
# the labelled categories used for the rest of the notebook.
cat_df = make_discrete(df)

In [None]:
# Print, for each column, a list of {category, 'Count'} records giving the
# number of non-null rows per category.
for column in cat_df:
    per_category = cat_df.groupby(column)[column].count()
    records = per_category.reset_index(name='Count').to_dict(orient='records')
    print(records)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the discretised rows for testing; random_state is fixed so
# the same split is produced on every run.
training_data, testing_data = train_test_split(cat_df, test_size=0.2, random_state=20)

## Building the model

#### Import dependencies

In [None]:
import networkx as nx
from pgmpy.models import BayesianNetwork, BayesianModel
from pgmpy.estimators import HillClimbSearch

#### Use structure learning to build a DAG

In [None]:
# Hill-climb structure search over the training split only; candidate
# structures are scored with the 'k2score' method.
hc = HillClimbSearch(data=training_data)
estimate = hc.estimate(scoring_method='k2score')

#### Generate a Bayesian Network object using the learned DAG

In [None]:
# Build the network from the structure returned by the hill-climb search.
# No CPDs are attached at this point; they are estimated in the fit step.
model = BayesianNetwork(estimate)

#### Fit the created BN model to the BigPerf data, in order to create probability distributions

In [None]:
from pgmpy.estimators import BayesianEstimator, ExpectationMaximization, MaximumLikelihoodEstimator
# `display` and `HTML` are taken from the public IPython.display module.
from IPython.display import display, HTML

# disable text wrapping in output cell
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

# Drop any CPDs already attached to the model so re-running this cell starts
# from a clean state.
model.cpds = []

# Estimate the conditional probability tables from the training split using
# BayesianEstimator with a BDeu prior.
model.fit(data=training_data,
    estimator=BayesianEstimator,
    prior_type='BDeu',
    complete_samples_only=True)

In [None]:
import matplotlib.pyplot as plt

#### Plot the model as a graph

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 8))

# Copy the learned structure into a plain networkx DiGraph for drawing.
# Edges are added first and nodes second, so the node order (and with it the
# circular layout) stays stable; add_nodes_from picks up any variable that
# has no edges.
G = nx.DiGraph()
G.add_edges_from(model.edges)
G.add_nodes_from(model.nodes)

pos = nx.circular_layout(G)  # Use circular layout

# Draw the full network: green nodes, bold labels, large arrow heads.
nx.draw_networkx(
    G,
    pos=pos,
    with_labels=True,
    node_size=5000,
    arrowsize=30,
    alpha=0.7,
    font_weight="bold",
    node_color='#7BCF83',
    width=2.0,
    ax=ax
)

# Overlay a larger red marker on the covid_severity node, reusing the same
# positions so it sits exactly on top of the green one.
tt_g = G.subgraph(nodes=['covid_severity'])
nx.draw(
    tt_g,
    pos=pos,
    with_labels=False,
    arrowsize=0,
    node_size=7100,
    alpha=0.7,
    font_weight="bold",
    node_color='#EE3B3B',
    ax=ax
)

# Save before show(), while the figure is still the active one.
plt.savefig('complexBayesian.pdf', facecolor='w', bbox_inches='tight')
plt.show()


## Scoring the model

#### Collect prediction accuracy scores for all variables

In [None]:
from pgmpy.metrics.metrics import correlation_score, log_likelihood_score, structure_score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# For each column of the test split in turn: remove that column, let the
# network predict it from the remaining ones, and score the prediction
# against the held-out values. Results are collected per column name.
accuracy_dict = {}

for column in testing_data:
    # drop() returns a new frame, so testing_data itself is left untouched.
    predict_data = testing_data.drop(columns=[column])
    y_pred = model.predict(predict_data)

    accuracy = accuracy_score(testing_data[column], y_pred)
    accuracy_dict[column] = accuracy

    print(f'{column} Accuracy score: {accuracy}')

In [None]:
# Mean of the per-variable accuracies, stored under the 'Average' key.
# The builtin sum() is called directly rather than accumulating into a
# variable named `sum`, which would shadow the builtin for every later cell.
# An 'Average' entry left over from a previous run of this cell is skipped,
# so re-running the cell gives the same result.
per_variable_scores = [
    score for name, score in accuracy_dict.items() if name != 'Average'
]
accuracy_dict['Average'] = sum(per_variable_scores) / len(per_variable_scores)

accuracy_dict

#### Get structure correlation scores

In [None]:
# Arguments shared by all four correlation_score calls: the fitted model, the
# test split, a chi-square test at the 5% significance level, and a single
# number returned instead of a summary.
corr_kwargs = dict(
    model=model,
    data=testing_data,
    test='chi_square',
    significance_level=0.05,
    return_summary=False,
)

# Same correlation check, summarised with four different sklearn metrics.
f1 = correlation_score(score=f1_score, **corr_kwargs)
acc = correlation_score(score=accuracy_score, **corr_kwargs)
pr = correlation_score(score=precision_score, **corr_kwargs)
recall = correlation_score(score=recall_score, **corr_kwargs)

# Log-likelihood and BDeu structure score of the model on the test split.
ls = log_likelihood_score(model=model, data=testing_data)
ss = structure_score(model=model, data=testing_data, scoring_method='bdeu')

report = [
    f'F1 score: {f1}',
    f'Accuracy score: {acc}',
    f'Precision score: {pr}',
    f'Recall score: {recall}',
    f'Log-likilihood score: {ls}',
    f'Structure score: {ss}',
]
for line in report:
    print(line)

print(model.name)

## Conditional Probability Table (CPT)

In [None]:
# Report the result of the model's own consistency check, then print every
# CPT with its values multiplied by 100.
model_ok = model.check_model()
print(f'Check model: {model_ok}\n')

for cpd in model.get_cpds():
    print(f'CPT of {cpd.variable}:')
    scaled_cpd = cpd * 100
    print(scaled_cpd, '\n')