In [3]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib import colors

from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BDeuScore

import seaborn as sns
import pandas as pd

import os
import pickle
import math

import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
import networkx as nx

from tqdm import tqdm

import itertools as it

%matplotlib inline

In [6]:
# Directed edges as (source, target) pairs over the variable names used in this notebook.
# NOTE(review): presumably the reference ("ground truth") structure for the LUCAS
# dataset loaded from INPUT_FILE -- confirm against the dataset documentation.
BASE_EDGES = [('Smoking', 'Lung_Cancer'),
              ('Genetics', 'Lung_Cancer'),
              ('Genetics', 'Attention_Disorder'),
              ('Smoking', 'Yellow_Fingers'),
              ('Anxiety', 'Smoking'),
              ('Peer_Pressure', 'Smoking'),
              ('Allergy', 'Coughing'),
              ('Lung_Cancer', 'Coughing'),
              ('Coughing', 'Fatigue'),
              ('Lung_Cancer', 'Fatigue'),
              ('Fatigue', 'Car_Accident'),
              ('Attention_Disorder', 'Car_Accident')]

# CSV file read with pd.read_csv in the cells below (path relative to the notebook).
INPUT_FILE = './lucas0.csv'

# Directory scanned for "mcmc_<node1>-<node2>.pickle" result files.
# The trailing slash matters wherever the path is built by plain string concatenation.
MCMC_DIRECTORY = "./mcmc/"

# Scenario label; not referenced by the cells visible here.
SCENARIO_NAME = "SAIDA_GA_JOAO"

In [7]:
def plot_pair(pair, ax=None):
    """Draw a violin plot of the three sampled fractions stored for one node pair.

    The samples are read from the module-level ``plot_data[pair]``; its three
    columns are labelled "Right", "No" and "Left" on the x axis.

    Parameters
    ----------
    pair : str
        Key into ``plot_data``; also used as the plot title.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted (or None) a new 3.5 x 4 inch figure is
        created for the plot and closed before returning, so it is not
        rendered a second time by the inline backend.

    Returns
    -------
    matplotlib Figure
        The figure that actually contains the violin plot: the newly created
        one when ``ax`` was None, otherwise the figure owning ``ax``.
    """
    # Previously a fresh figure was always created, even when the caller
    # supplied `ax`; the plot then landed on the caller's axes while an
    # unrelated empty figure was returned. Only create one when needed.
    owns_figure = ax is None
    if owns_figure:
        fig = plt.figure(figsize=(3.5,4))
        ax = fig.add_subplot(111)
    else:
        fig = ax.figure

    ax = sns.violinplot(data=plot_data[pair],
                        cut=0, scale="width", palette=["blue", "red", "green"], ax=ax)
    ax.set(title=pair, ylim=(0,1), xticklabels=["Right", "No", "Left"])

    # Never close a figure that belongs to the caller.
    if owns_figure:
        plt.close(fig)

    return fig

def calculate_avg_not_exists(item):
    """Sort key for ``(edge_name, samples)`` pairs.

    Returns the mean of column 1 of the 2-D ``samples`` array, i.e. the
    column that is labelled "No" when the samples are plotted.
    """
    samples = item[1]
    no_edge_column = samples[:, 1]
    return np.mean(no_edge_column)

def get_map(edge):
    """Return the index (0, 1 or 2) of the column with the largest mean.

    The samples are looked up in the module-level ``plot_data[edge]``. When
    several columns share the largest mean, the lowest index is returned.
    """
    samples = plot_data[edge]
    column_means = np.array([
        np.mean(samples[:, 0]),
        np.mean(samples[:, 1]),
        np.average(samples[:, 2]),
    ])
    # Positions whose mean equals the maximum; the first one wins on ties.
    peak_positions = np.flatnonzero(column_means == column_means.max())

    return peak_positions[0]

In [8]:
# Baseline: learn a structure with hill climbing and print its BDeu score.
# `official_edges` is an alias of BASE_EDGES (same list object, not a copy).
official_edges = BASE_EDGES

data = pd.read_csv(INPUT_FILE)
hc = HillClimbSearch(data)
# NOTE(review): the search score is built without an explicit
# equivalent_sample_size, while the evaluation score below uses 5 --
# confirm that this difference is intentional.
hc_model = hc.estimate(scoring_method=BDeuScore(data))

# Reuse the already-loaded frame instead of reading the CSV a second time.
bdeu = BDeuScore(data, equivalent_sample_size=5)
print(bdeu.score(hc_model))

  0%|          | 0/1000000 [00:00<?, ?it/s]

-6185.291906837807


In [9]:
# Load the per-pair MCMC results, pick the dominant direction for every pair,
# build a Bayesian network from the chosen edges and print its BDeu score.
# Relies on `data` (the DataFrame loaded from INPUT_FILE in an earlier cell).
DIRECTORY = MCMC_DIRECTORY

# Result files are named "mcmc_<node1>-<node2>.pickle".
possible_edges = [file_name for file_name in os.listdir(DIRECTORY)
                  if ".pickle" in file_name and "mcmc_" in file_name]

plot_data = dict()

for possible_edge in tqdm(possible_edges):
    edge_name = possible_edge.split("mcmc_")[1].split(".pickle")[0]
    # Fail fast on file names that do not encode exactly two node names.
    node1, node2 = edge_name.split("-")

    trace_path = os.path.join(DIRECTORY, possible_edge)
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # MCMC_DIRECTORY at files produced by a trusted run.
    with open(trace_path, 'rb') as handle:
        try:
            model = pickle.load(handle)
        except EOFError as exc:
            # An empty or truncated pickle otherwise surfaces as a bare
            # "Ran out of input" with no hint of which file is affected.
            raise EOFError(
                f"Could not unpickle {trace_path!r}: the file is empty or truncated"
            ) from exc
    plot_data[edge_name] = model["trace"]["frac"]

# Order the pairs by the mean of their "no edge" column (ascending).
sorted_x = dict(sorted(plot_data.items(), key=calculate_avg_not_exists))

edges = list()
for key in sorted_x.keys():
    node_left, node_right = key.split("-")
    maxima = get_map(key)

    # Column 0 dominant -> left-to-right edge; column 2 dominant -> reversed
    # edge; column 1 dominant -> no edge is added for this pair.
    if maxima == 0:
        edges.append((node_left, node_right))
    elif maxima == 2:
        edges.append((node_right, node_left))

node_names = list(data.columns)

# NOTE(review): only nodes that appear in at least one selected edge end up in
# the network; `node_names` is not added to it. Confirm whether variables with
# no selected edge should be added so the score covers every column.
model_struct = BayesianNetwork(ebunch=edges)
model_struct.fit(data=data, estimator=MaximumLikelihoodEstimator)

# Reuse the already-loaded frame instead of re-reading the CSV.
bdeu = BDeuScore(data, equivalent_sample_size=5)
print(bdeu.score(model_struct))

  0%|                                                     | 0/1 [00:00<?, ?it/s]


EOFError: Ran out of input