### Libraries

In [1]:
import pickle
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
import networkx as nx
import pylab as plt
from pgmpy.inference import VariableElimination
import pdb
import seaborn as sns
import matplotlib.pyplot as plt   
from sklearn.metrics import confusion_matrix
from networkx.drawing.nx_agraph import graphviz_layout
import numpy as np
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score, accuracy_score

### Functions

In [72]:
def joint_probability_df(model,query):
    '''
    Accepts query as input and computes the joint probability distribution
    P(Age,Gender,Pain,Fatigue)
    Returns dataframe of 96 valid states and their corresponding probabilities
    '''
    
    #Nested dictionary:
    nested_dict = model.get_cpds("Fatigue_Score").name_to_no

    age_dict = dict((v,k) for k,v in nested_dict["Age"].items())
    gender_dict = dict((v,k) for k,v in nested_dict["Gender"].items())
    fatigue_dict = dict((v,k) for k,v in nested_dict["Fatigue_Score"].items())
    
    nested2 = model.get_cpds("Pain_Score").name_to_no
    pain_dict = dict((v,k) for k,v in nested2["Pain_Score"].items())

    col = query.variables
    age,pain,gender,fatigue = query.values.shape
    joint_proba_df = pd.DataFrame(columns = col)
    index = 0
    predict_dict = {}
    
    for a in range(age):
        for p in range(pain):
            for g in range(gender):
                for f in range(fatigue):
                    joint_proba_df.loc[index, col[0]] = a
                    joint_proba_df.loc[index, col[1]] = p
                    joint_proba_df.loc[index, col[2]] = g
                    joint_proba_df.loc[index, col[3]] = f
                    joint_proba_df.loc[index, "Phi"] = query.values[a, p, g, f]
                    index+=1

    joint_proba_df["Age"].replace(age_dict, inplace=True)
    joint_proba_df["Pain_Score"].replace(pain_dict, inplace=True)
    joint_proba_df["Gender"].replace(gender_dict, inplace=True)
    joint_proba_df["Fatigue_Score"].replace(fatigue_dict, inplace=True)

    return joint_proba_df


def all_states(state_feature,state_feature_name):
    '''
    Accepts numpy arrays of column names and one-hot-encoding matrix of all 3072 states
    Returns an assembled dataframe
    '''
    col = np.append(state_feature_name, 'Probability')
    prob = np.zeros((state_feature.shape[0],1))
    data = np.hstack((state_feature,prob))

    state_distribution_df = pd.DataFrame(data=data, columns = col)
    return state_distribution_df

def OHE_jointprob(joint_proba_df,new_columns,original_columns):
    '''
    Accepts joint probability dataframe.
    Converts categorical variables into one-hot-encoded columns of dataframe. 
    Sets proper column names. 
    Returns OHE converted dataframe of states. Returns joint probabilities as list
    '''
    one_hot_df = pd.get_dummies(joint_proba_df, prefix_sep=':',dtype=float)
    rename = dict(zip(new_columns,original_columns))
    reorder_df = one_hot_df.reindex(columns=new_columns)
    prob_list = reorder_df['Phi'].tolist()
    rename_df = reorder_df.rename(columns=rename)
    ohe_df = rename_df.drop(labels='Probability', axis=1)
    
    return ohe_df, prob_list
    
def get_ISP(state_distribution_df , ohe_joint, prob_list):
    '''
    The state_distribution dataframe and the OHE joint probability dataframe both have
    unique rows. However, the order of the rows is not the same.
    This code accepts the two dataframes and matches row-wise to get correct sequence of rows.
    Code returns the 3072 array of initial state probabilites, arranged by correct sequence.  
    
    '''
    index_list = np.zeros((96))
    
    wake_df = state_distribution_df.loc[state_distribution_df['Current Daytime Interval:Wake'] == 1.0]
    wake_df =  wake_df.loc[wake_df['Last Interval Activity Bouts:Not Recorded'] == 1.0]     
    wake_reduced_df = wake_df[original_columns[:-1]] 
    
#     df2 = ohe_joint.copy()
    df1 = wake_reduced_df.copy()
    
    state_copy = state_distribution_df.copy()
    for idx in range(1,97): #96 valid states
        df2 =  ohe_joint.copy()[idx-1:idx]
        #Returns the indices of df1:
        df = pd.concat([df1, df2])

        df = df.reset_index(drop=True)

        df_gpby = df.groupby(list(df.columns))

        idx = [x[0] for x in df_gpby.groups.values() if len(x) != 1]
        match_df = df.reindex(idx)
        state_copy.loc[match_df.index[0], 'Probability'] = prob_list[df2.index[0]] 
        index_list[idx] = df2.index[0]
        
    computed_ISP = np.array(state_copy['Probability'])
    
    return computed_ISP,index_list
    

### Preparing Training Data

In [27]:
# #Load the processed and normalized  dataset 
# import pickle
# with open('../data/training_data_bayessian_model.pkl', 'rb') as f:
#     data_full = pickle.load(f)
# data_full

In [24]:
#Load the processed and normalized  dataset 
import pickle
with open('../data/training_population_data_IRL.pkl', 'rb') as f:
    data_full = pickle.load(f)
data_full

In [26]:
initial_state_df = data_full.loc[data_full['Patient_Interval_Number'] == 1.0]
initial_state_df

Unnamed: 0,Patient_ID,Gender,Age,Daytime_Interval,Patient_Interval_Number,Pain_Score,Action_Pain,Fatigue_Score,Action_Fatigue,Last_Activitybout_State,Current_Activitybout_Action,Action_PAW,EOD_PAW
0,1001,Female,Between 30 to 60,Wake,1.0,Low,Recorded,Low,Recorded,Not Recorded,Higher,,
30,1002,Female,Between 30 to 60,Wake,1.0,Low,Recorded,Low,Recorded,Not Recorded,Higher,,
60,1003,Male,Between 30 to 60,Wake,1.0,Low,Recorded,Low,Recorded,Not Recorded,Higher,,
90,1004,Female,Between 30 to 60,Wake,1.0,,Recorded,Medium,Recorded,Not Recorded,Normal,,
120,1005,Female,Between 30 to 60,Wake,1.0,Medium,Recorded,Medium,Recorded,Not Recorded,Normal,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,1106,Female,Between 30 to 60,Wake,1.0,Low,Recorded,Low,Recorded,Not Recorded,Normal,,
3090,1107,Female,Between 30 to 60,Wake,1.0,Low,Recorded,Low,Recorded,Not Recorded,Normal,,
3120,1108,Male,Between 30 to 60,Wake,1.0,Medium,Recorded,,Recorded,Not Recorded,Lower,,
3150,1109,Female,Between 30 to 60,Wake,1.0,Medium,Recorded,Medium,Recorded,Not Recorded,Normal,,


In [29]:
initial_state_df.columns

Index(['Patient_ID', 'Gender', 'Age', 'Daytime_Interval',
       'Patient_Interval_Number', 'Pain_Score', 'Action_Pain', 'Fatigue_Score',
       'Action_Fatigue', 'Last_Activitybout_State',
       'Current_Activitybout_Action', 'Action_PAW', 'EOD_PAW'],
      dtype='object')

In [31]:
trim_df = initial_state_df[['Gender', 'Age', 'Daytime_Interval',
                            'Pain_Score','Fatigue_Score','Last_Activitybout_State',
                            'EOD_PAW']]

trim_df.columns

Index(['Gender', 'Age', 'Daytime_Interval', 'Pain_Score', 'Fatigue_Score',
       'Last_Activitybout_State', 'EOD_PAW'],
      dtype='object')

In [32]:
rename_df = trim_df.rename(columns={'Daytime_Interval': 'Current Daytime Interval', 
                                    'Pain_Score':'Current Pain',
                                    'Fatigue_Score':'Current Fatigue',
                                    'Last_Activitybout_State': 'Last Acbouts',
                                    'EOD_PAW':'Current EOD PAW'})

In [33]:
rename_df

Unnamed: 0,Gender,Age,Current Daytime Interval,Current Pain,Current Fatigue,Last Acbouts,Current EOD PAW
0,Female,Between 30 to 60,Wake,Low,Low,Not Recorded,
30,Female,Between 30 to 60,Wake,Low,Low,Not Recorded,
60,Male,Between 30 to 60,Wake,Low,Low,Not Recorded,
90,Female,Between 30 to 60,Wake,,Medium,Not Recorded,
120,Female,Between 30 to 60,Wake,Medium,Medium,Not Recorded,
...,...,...,...,...,...,...,...
3060,Female,Between 30 to 60,Wake,Low,Low,Not Recorded,
3090,Female,Between 30 to 60,Wake,Low,Low,Not Recorded,
3120,Male,Between 30 to 60,Wake,Medium,,Not Recorded,
3150,Female,Between 30 to 60,Wake,Medium,Medium,Not Recorded,


In [55]:
#State features
GENDER = ["Male", "Female"] #Dem_02
AGE = ["Younger than 30", "Between 30 to 60", "60 and Older"] #Dem_01
DAYTIME_INTERVAL = ["Wake", "Morning", "Afternoon", "Evening", "Bed"] #Morning: 11am, Afternoon: 3pm, Evening: 7pm
## Lower: below 33th percentile; Normal: 33rd to 66th percentile; Higher: Above 66th percentile
PAIN = ["None", "Low", "Medium", "High", "Not Recorded"] #"None" means No_Pain, 1<= Low <=3, 4<= Medium <= 7, 8<= High <= 10
FATIGUE = ["None", "Low", "Medium", "High", "Not Recorded"] #"None" means No_Fatigue,  
LAST_ACTIVITYBOUTS = ["Lower", "Normal", "Higher", "Not Recorded"] #used percentile method
### 10<= Moderate <=19, 20<= Mild <= 25, Normal >= 26 #Not Applicable
EOD_PAW = ["None", "Moderate", "Mild", "Normal", "Not Recorded"] 


states = {}
state_name_index = 1
import itertools
for x in itertools.product(GENDER, AGE, DAYTIME_INTERVAL, PAIN, FATIGUE, LAST_ACTIVITYBOUTS, EOD_PAW):
    if(not(x[2] == "Bed") and not(x[6] == "None")):
        continue
    if(x[2] == "Bed" and x[6] == "None"):
        continue
    #print(x)
    state = {}
    state["Gender"] = x[0]
    state["Age"] = x[1]
    state["Current Daytime Interval"] = x[2]
    state["Current Pain"] = x[3]
    state["Current Fatigue"] = x[4]
    state["Last Acbouts"] = x[5]
    #state["Last Night EOD PAW"] = x[6]
    state["Current EOD PAW"] = x[6]
    state_code = 'S' + str(state_name_index)
    states[state_code] = state
    state_name_index += 1
    

In [56]:
states

{'S1': {'Gender': 'Male',
  'Age': 'Younger than 30',
  'Current Daytime Interval': 'Wake',
  'Current Pain': 'None',
  'Current Fatigue': 'None',
  'Last Acbouts': 'Lower',
  'Current EOD PAW': 'None'},
 'S2': {'Gender': 'Male',
  'Age': 'Younger than 30',
  'Current Daytime Interval': 'Wake',
  'Current Pain': 'None',
  'Current Fatigue': 'None',
  'Last Acbouts': 'Normal',
  'Current EOD PAW': 'None'},
 'S3': {'Gender': 'Male',
  'Age': 'Younger than 30',
  'Current Daytime Interval': 'Wake',
  'Current Pain': 'None',
  'Current Fatigue': 'None',
  'Last Acbouts': 'Higher',
  'Current EOD PAW': 'None'},
 'S4': {'Gender': 'Male',
  'Age': 'Younger than 30',
  'Current Daytime Interval': 'Wake',
  'Current Pain': 'None',
  'Current Fatigue': 'None',
  'Last Acbouts': 'Not Recorded',
  'Current EOD PAW': 'None'},
 'S5': {'Gender': 'Male',
  'Age': 'Younger than 30',
  'Current Daytime Interval': 'Wake',
  'Current Pain': 'None',
  'Current Fatigue': 'Low',
  'Last Acbouts': 'Lower',
  

In [57]:
def state_to_state_code(state):
    #input: state
    #output: state_code
    return list(states.keys())[list(states.values()).index(state)]

In [63]:
s_codes = ['0']*len(rename_df.index)
for i in range(1,len(rename_df.index)+1):
    df = rename_df[i-1:i]
    state_dict = df.to_dict('records')[0]
    s_codes[i-1] = state_to_state_code(state_dict)
print(s_codes)

['S3228', 'S3228', 'S828', 'S3212', 'S3252', 'S3224', 'S3232', 'S832', 'S3228', 'S3224', 'S3208', 'S3224', 'S4', 'S804', 'S848', 'S3236', 'S3232', 'S804', 'S3232', 'S3252', 'S3204', 'S3224', 'S4004', 'S3228', 'S2448', 'S2468', 'S804', 'S3232', 'S3252', 'S1628', 'S3228', 'S3208', 'S3204', 'S4008', 'S832', 'S3224', 'S804', 'S832', 'S4028', 'S3208', 'S3224', 'S3224', 'S3204', 'S3232', 'S2412', 'S4004', 'S3204', 'S3204', 'S3252', 'S3300', 'S2452', 'S2408', 'S3224', 'S804', 'S852', 'S3300', 'S3208', 'S100', 'S2456', 'S3208', 'S3300', 'S812', 'S3236', 'S3300', 'S852', 'S3228', 'S2500', 'S1632', 'S900', 'S3228', 'S3232', 'S3256', 'S3228', 'S1604', 'S836', 'S832', 'S856', 'S1624', 'S3252', 'S2424', 'S32', 'S3204', 'S824', 'S3208', 'S3204', 'S852', 'S2428', 'S828', 'S3252', 'S4008', 'S3208', 'S3232', 'S3204', 'S804', 'S804', 'S1608', 'S3228', 'S2500', 'S828', 'S824', 'S4004', 'S3228', 'S3228', 'S3228', 'S844', 'S3252', 'S3300']


#### 1. All Daytime Intervals

In [12]:
# training_df = data_full[['Gender', 'Age','Pain_Score','Fatigue_Score']]
# training_df

#### 2. Only wake state

In [13]:
wake_df = data_full[['Gender', 'Age','Pain_Score','Fatigue_Score','Daytime_Interval']]

#Drop Daytime_Intervals != Wake
indexNames = wake_df[ wake_df['Daytime_Interval'] != 'Wake' ].index
wake_df.drop(indexNames , inplace=True)


# wake_df = wake_df[['Gender', 'Age','Pain_Score','Fatigue_Score']]
wake_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Gender,Age,Pain_Score,Fatigue_Score,Daytime_Interval
0,Female,Between 30 to 60,Low,Low,Wake
5,Female,Between 30 to 60,Low,Low,Wake
10,Female,Between 30 to 60,Low,Low,Wake
15,Female,Between 30 to 60,Low,Low,Wake
20,Female,Between 30 to 60,Low,Low,Wake
...,...,...,...,...,...
3185,Female,Between 30 to 60,Low,,Wake
3190,Female,Between 30 to 60,Not Recorded,Not Recorded,Wake
3195,Female,Between 30 to 60,Medium,Medium,Wake
3200,Female,Between 30 to 60,Medium,Medium,Wake


### Creating Bayesian Model

In [None]:
# list_of_tup = [("Gender","Pain_Score"),  ("Gender","Fatigue_Score"),
#                ("Age", "Pain_Score"),    ("Age","Fatigue_Score"),
#                ("Pain_Score","Fatigue_Score")]


# list_of_tup = [('Gender', 'Current_Pain'),('Gender', 'Current_Fatigue'),
#               ('Age', 'Current_Pain'), ('Age', 'Current_Fatigue'),
#               ('Current_Daytime_Interval', 'Current_Pain'),('Current_Daytime_Interval', 'Current_Fatigue'),
#               ]


list_of_tup = [('Gender', 'Pain_Score'),('Gender', 'Fatigue_Score'),
              ('Age', 'Pain_Score'), ('Age', 'Fatigue_Score'),
              ]
               
model = BayesianModel(list_of_tup)
%matplotlib notebook
#Draw
# nx.draw(model, with_labels=True)
# plt.show()
pos = graphviz_layout(model, prog = 'dot')
nx.draw(model,with_labels=True, pos = pos)

### Training Model

#### 1. On all daytime intervals

In [None]:
# model.fit(training_df, estimator=BayesianEstimator)
# cpd_pain = model.get_cpds('Pain_Score')
# cpd_fatigue = model.get_cpds('Fatigue_Score')

#### 2. Only on 'wake' daytime interval

In [None]:
model.fit(wake_df, estimator=BayesianEstimator)

### Validation using joint probability

In [None]:
infer = VariableElimination(model)

In [None]:
test = infer.query(['Gender','Age','Pain_Score', "Fatigue_Score"], joint = True, show_progress=False)

### CPDS for all variables

In [None]:
print(model.get_cpds("Age"))

In [None]:
print(model.get_cpds("Gender"))

### Dataframe of Joint Probabilities

In [None]:
joint_proba_df = joint_probability_df(model,test)
joint_proba_df
# joint_proba_df.to_csv('../data/categorical_joint_dist_df.csv',index=False)

### All States Dataframe

In [None]:
initial_state_dist = np.load('../data/initial_state_dist.npy')
state_feature = np.load('../data/state_feature.npy')
state_feature_name = np.load('../data/state_feature_name.npy')

In [None]:
state_distribution_df = all_states(state_feature,state_feature_name)
state_distribution_df
state_distribution_df.to_csv('../data/all_states_df.csv',index=False)
state_distribution_df

### One Hot Encoding of Joint Probability Dataframe

In [None]:
original_columns = ['Gender:Male', 'Gender:Female',
 
 'Age:Younger Than 30','Age:Between 30 To 60', 'Age:60 And Older',
 
 'Current Interval Pain Score:Lower', 'Current Interval Pain Score:Normal',
 'Current Interval Pain Score:Higher','Current Interval Pain Score:Not Recorded',
 
 'Current Interval Fatigue Score:Lower','Current Interval Fatigue Score:Normal',
 'Current Interval Fatigue Score:Higher','Current Interval Fatigue Score:Not Recorded',
                    
                    'Probability'
]

new_columns = ['Gender:Male', 'Gender:Female',
 
 'Age:Younger than 30','Age:Between 30 to 60', 'Age:60 and Older',
 
 'Pain_Score:Lower', 'Pain_Score:Normal',
 'Pain_Score:Higher','Pain_Score:Not Recorded',
 
 'Fatigue_Score:Lower','Fatigue_Score:Normal',
 'Fatigue_Score:Higher','Fatigue_Score:Not Recorded',
               'Phi'
]

In [None]:
ohe_joint, prob_list = OHE_jointprob(joint_proba_df,new_columns,original_columns)

In [None]:
ohe_joint.to_csv('../data/joint_distribution_OHE.csv',index=False)

In [None]:
ohe_joint

### Get array of initial state probabilities

In [None]:
computed_ISP, index_list = get_ISP(state_distribution_df , ohe_joint, prob_list)

In [None]:
index_list

In [None]:
np.sum(computed_ISP)

In [None]:
np.save('../data/index_list.npy', index_list)

### Create Dataframe

In [67]:
PATIENT_ID = []   #total 20 patients , missing patient_id: 1017, 1021; last patient id:1008
for i in range(1001, 1111, 1): #original 1111
    if(i== 1017 or i == 1021 or i == 1051):
        pass
    else:
        PATIENT_ID.append(str(i))

In [68]:
n = len(PATIENT_ID)

In [69]:
def create_df(patient_list,state_codes,n):
    probability_list = [0]*n
    data = list(zip(patient_list, state_codes, probability_list))
    df = pd.DataFrame(data=data,columns =['PATIENT_ID','STATE_CODE','PROBABILITY'])
    return df

In [70]:
df = create_df(PATIENT_ID,s_codes,n)

In [71]:
df

Unnamed: 0,PATIENT_ID,STATE_CODE,PROBABILITY
0,1001,S3228,0
1,1002,S3228,0
2,1003,S828,0
3,1004,S3212,0
4,1005,S3252,0
...,...,...,...
102,1106,S3228,0
103,1107,S3228,0
104,1108,S844,0
105,1109,S3252,0
