### Theory

Initial State Probability:

$P(s_{0}) = P(Pain, Fatigue, Gender, Age) = P(P,F,G,A)$   

$P(s_{0}) = P(P,F /G,A)$ 

$P(s_{0}) = P(P/G,A) \times P(F/G,A) \times P(G) \times P(A)$ 


Considerations:

1. We can either define Current_Daytime_Interval node if we are training with the entire dataset, or we can remove the node and train with wake states only.

2. For initial states, Lact_Activity_Bout == None

3. The location of the valid states (96) in the total state array (3072) will remain the same, but their probabilities will be different each time we sample.

### Libraries

In [1]:
import pickle
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
import networkx as nx
import pylab as plt
from pgmpy.inference import VariableElimination
import pdb
import seaborn as sns
import matplotlib.pyplot as plt   
from sklearn.metrics import confusion_matrix
from networkx.drawing.nx_agraph import graphviz_layout
import numpy as np
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score, accuracy_score
import os

### Functions

In [13]:
def wake_train(data_full):
    wake_df = data_full[['Gender', 'Age','Pain_Score','Fatigue_Score','Daytime_Interval']]
    #Drop Daytime_Intervals != Wake
    indexNames = wake_df[ wake_df['Daytime_Interval'] != 'Wake' ].index
    wake_df.drop(indexNames , inplace=True)
    wake_df = wake_df[['Gender', 'Age','Pain_Score','Fatigue_Score']]
    return wake_df

def age_probability(seed):
    np.random.seed(seed)
    age_cpds = model.get_cpds("Age")
    age_prob = age_cpds.values
    age_dict = age_cpds.name_to_no
    p_a = np.zeros(3)
    for key,val in  age_dict['Age'].items():
        prob = age_prob[val]
        #Draw samples from uniform distribution defined by probabilities:
        p_a[val] = np.random.uniform(low=0.0, high=prob)
    #Normalize the array:
    p_a = p_a/np.sum(p_a)
    #Create dictionary
    age_prob_dict = {}
    for key,val in age_dict['Age'].items():
        age_prob_dict[key] = p_a[val]
    return age_prob_dict

def gender_probability(seed):
    np.random.seed(seed)
    gender_cpds = model.get_cpds("Gender")
    gender_prob = gender_cpds.values
    gender_dict = gender_cpds.name_to_no
    p_g = np.zeros(2)
    for key,val in  gender_dict['Gender'].items():
        prob = gender_prob[val]
        #Draw samples from uniform distribution defined by probabilities:
        p_g[val] = np.random.uniform(low=0.0, high=prob)
    #Normalize the array:
    p_g = p_g/np.sum(p_g)
    #Create dictionary
    gender_prob_dict = {}
    for key,val in gender_dict['Gender'].items():
        gender_prob_dict[key] = p_g[val]
    return gender_prob_dict

def pain_probability(seed,patient):
    np.random.seed(seed)
    test = infer.query(['Pain_Score'], evidence={'Age': patient['Age'], 'Gender': patient['Gender']},show_progress=False)
    pain_prob = test.values
    pain_dict = model.get_cpds("Pain_Score").name_to_no['Pain_Score']
    p_p = np.zeros(4)
    for key,val in pain_dict.items():
        prob = pain_prob[val]
        #Draw samples from uniform distribution defined by probabilities:
        p_p[val] = np.random.uniform(low=0.0, high=prob)
    #Normalize:
    p_p = p_p/np.sum(p_p)
    #Create dictionary
    pain_prob_dict = {}
    for key,val in pain_dict.items():
        pain_prob_dict[key] = p_p[val]
    return pain_prob_dict

def fatigue_probability(seed,patient):
    np.random.seed(seed)
    test = infer.query(['Fatigue_Score'], evidence={'Age': patient['Age'], 'Gender': patient['Gender']},show_progress=False)
    fatigue_prob = test.values
    fatigue_dict = model.get_cpds("Fatigue_Score").name_to_no['Fatigue_Score']
    p_f = np.zeros(4)
    for key,val in fatigue_dict.items():
        prob = fatigue_prob[val]
        #Draw samples from uniform distribution defined by probabilities:
        p_f[val] = np.random.uniform(low=0.0, high=prob)
    #Normalize:
    p_f = p_f/np.sum(p_f)
    #Create dictionary
    fatigue_prob_dict = {}
    for key,val in fatigue_dict.items():
        fatigue_prob_dict[key] = p_f[val]
    return fatigue_prob_dict


def get_prob_list(seed,valid_states):
    #Initialize:
    total_valid_states = 96
    prob_list = np.zeros((total_valid_states))
    #Compute Independent Probabilities:
    age_prob_dict = age_probability(seed)
    gender_prob_dict = gender_probability(seed)

    valid_state_dict = valid_states.to_dict(orient='records')
    for idx in range (total_valid_states):
        patient = valid_state_dict[idx]
        
        #Compute conditional probabilities:
        pain_prob_dict = pain_probability(seed,patient)
        fatigue_prob_dict = fatigue_probability(seed,patient)

        #Get values:
        p_a = age_prob_dict[patient['Age']]
        p_g = gender_prob_dict[patient['Gender']]
        cp_f = fatigue_prob_dict[patient['Fatigue_Score']]
        cp_p = fatigue_prob_dict[patient['Pain_Score']]

        prob_list[idx] = p_a * p_g * cp_p * cp_f
    return prob_list

def sample_ISP(all_states, prob_list, index_list):
    for i in range(96):
        all_states.loc[index_list[i], 'Probability'] = prob_list[i] 
    return np.array(all_states['Probability'])
     

### Create Bayesian Network

In [12]:
import pickle
with open('../MS_normalized_df.pkl', 'rb') as f:
    data_full = pickle.load(f)

train_df = wake_train(data_full)
list_of_tup = [('Gender', 'Pain_Score'),('Gender', 'Fatigue_Score'),
              ('Age', 'Pain_Score'), ('Age', 'Fatigue_Score'),
              ]
               
model = BayesianModel(list_of_tup)
%matplotlib notebook
#Draw
# nx.draw(model, with_labels=True)
# plt.show()
pos = graphviz_layout(model, prog = 'dot')
nx.draw(model,with_labels=True, pos = pos)
model.fit(train_df, estimator=BayesianEstimator)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


<IPython.core.display.Javascript object>

In [4]:
infer = VariableElimination(model)

### Total 3072 States

In [5]:
all_states = pd.read_csv('../data/all_states_df.csv')
all_states

Unnamed: 0,Gender:Male,Gender:Female,Age:Younger Than 30,Age:Between 30 To 60,Age:60 And Older,Current Daytime Interval:Wake,Current Daytime Interval:Morning,Current Daytime Interval:Afternoon,Current Daytime Interval:Evening,Current Daytime Interval:Bed,...,Current Interval Fatigue Score:Not Recorded,Last Interval Activity Bouts:Lower,Last Interval Activity Bouts:Normal,Last Interval Activity Bouts:Higher,Last Interval Activity Bouts:Not Recorded,Eod Positive Affect And Well-Being:Lower,Eod Positive Affect And Well-Being:Normal,Eod Positive Affect And Well-Being:Higher,Eod Positive Affect And Well-Being:Not Recorded,Probability
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3067,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3068,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3069,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3070,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [26]:
gender_cpds = model.get_cpds("Age")
print(gender_cpds)

+-----------------------+----------+
| Age(60 and Older)     | 0.104332 |
+-----------------------+----------+
| Age(Between 30 to 60) | 0.772767 |
+-----------------------+----------+
| Age(Younger than 30)  | 0.1229   |
+-----------------------+----------+


In [34]:
gender_cpds = model.get_cpds("Pain_Score").name_to_no['Pain_Score']
gender_cpds

{'Higher': 0, 'Lower': 1, 'Normal': 2, 'Not Recorded': 3}

In [37]:
test = infer.query(['Pain_Score'], evidence={'Age': 'Younger than 30', 'Gender': 'Male'},show_progress=False)
test.values

array([0.10114504, 0.10114504, 0.42175573, 0.3759542 ])

In [38]:
test = infer.query(['Pain_Score'], evidence={'Age': '60 and Older', 'Gender': 'Male'},show_progress=False)
test.values

array([0.08953488, 0.39651163, 0.50813953, 0.00581395])

### Valid 96 States

In [6]:
valid_states = pd.read_csv('../data/categorical_joint_dist_df.csv')
valid_states['Phi'] = 0.0
valid_states

Unnamed: 0,Pain_Score,Age,Gender,Fatigue_Score,Phi
0,Higher,60 and Older,Female,Higher,0.0
1,Higher,60 and Older,Female,Lower,0.0
2,Higher,60 and Older,Female,Normal,0.0
3,Higher,60 and Older,Female,Not Recorded,0.0
4,Higher,60 and Older,Male,Higher,0.0
...,...,...,...,...,...
91,Not Recorded,Younger than 30,Female,Not Recorded,0.0
92,Not Recorded,Younger than 30,Male,Higher,0.0
93,Not Recorded,Younger than 30,Male,Lower,0.0
94,Not Recorded,Younger than 30,Male,Normal,0.0


### Generate multiple samples

In [9]:
index_list = np.load("../data/index_list.npy")

In [10]:
n_samples = 3
for i in range(n_samples):
    seed = i
    prob_list = get_prob_list(seed,valid_states)
#     array_3072 = sample_ISP(all_states, prob_list, index_list)
    np.save("../ISP_samples/sample_{}.npy".format(i),prob_list)
    
#Store in 2D Matrix

### Test

In [11]:
for i in range(3):
    sample = np.load("../ISP_samples/sample_{}.npy".format(i))
    print(np.sum(sample))

0.9999999999999999
0.9999999999999998
1.0


In [15]:
sample = np.load("../ISP_samples/sample_{}.npy".format(1))
sample[0]

4.5460801342569186e-10

In [18]:
sample[95]

0.019831171321754004