### Bayesian Networks ###
For now, only the K2 score is implemented as the scoring function, and only for categorical datasets.
More information about the pgmpy package can be found at: http://pgmpy.org/index.html

In [53]:
import pandas as pd
import numpy as np
from math import lgamma, log
from collections import Counter
from pgmpy.models import BayesianModel
from sklearn.preprocessing import LabelBinarizer
from pgmpy.estimators import MaximumLikelihoodEstimator, K2Score

### Structure Learning ###
To learn the model structure (a DAG) from a data set, there are two broad techniques:

* score-based structure learning
* constraint-based structure learning
The combination of both techniques allows further improvement:

* hybrid structure learning

#### Score-based Structure Learning ####
This approach construes model selection as an optimization task. It has two building blocks:

* A scoring function $s_D\colon M \to \mathbb R$ that maps models to a numerical score, based on how well they fit to a given data set $D$.
* A search strategy to traverse the search space of possible models $M$ and select a model with optimal score.

##### Scoring functions #####
Commonly used scores to measure the fit between model and data are Bayesian Dirichlet scores such as BDeu or K2 and the Bayesian Information Criterion (BIC, also called MDL). See [1], Section 18.3 for a detailed introduction on scores. As before, BDeu is dependent on an equivalent sample size.

For now, this code only implements the K2 score as the scoring function.

##### Only data site A has the target class #####
In this case, I would like to use a Trusted Third Party (TTP) to do the final calculation. It could also be done at one of the data parties, but the state-counts table leaks some information about the data (at the level of a statistical summary). If the calculation is done at the TTP instead, the TTP learns nothing about the data itself, not even the column names; it only has to perform a single calculation.

In [20]:
# k2 = K2Score(data)
# model1 = BayesianModel([('race', 'num_procedures'), ('age', 'num_procedures')]) # race -> num_procedures <- age 
# print(k2.score(model1))

In [21]:
def give_state_names(df, feature):
    """Map each requested column to the sorted list of its distinct values.

    ``feature`` is either a single column label or a ``list`` of column
    labels; anything that is not a ``list`` is treated as one label.
    Returns a dict keyed by column label, in the order the labels were given.
    """
    columns = feature if isinstance(feature, list) else [feature]
    # Iterating a Counter yields its distinct keys, so sorting it directly
    # gives the sorted unique values of the column.
    return {column: sorted(Counter(df[column])) for column in columns}

In [22]:
# convert target class to binary
def toBinary(unique_y, TargetClass):
    """Encode ``TargetClass`` as a 0/1 indicator matrix, one column per state.

    Parameters
    ----------
    unique_y : the states the encoder is fitted on.
    TargetClass : the values to encode.

    Returns
    -------
    The indicator array produced by ``LabelBinarizer.transform``, with one
    row per value in ``TargetClass``.  For a two-state feature the result is
    widened to two columns so that it still has one column per state.
    """
    lb = LabelBinarizer()
    lb.fit(unique_y)
    conv_TargetClass = lb.transform(TargetClass)
    # LabelBinarizer collapses a two-class problem into a single column that
    # marks the second class.  The code that consumes this matrix expects one
    # column per state, so rebuild the missing first-class column.
    if len(lb.classes_) == 2 and conv_TargetClass.shape[1] == 1:
        conv_TargetClass = np.hstack((1 - conv_TargetClass, conv_TargetClass))
    return conv_TargetClass

##### At Data Site A with Target Class #####

In [33]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True is its documented equivalent.
df_A = pd.read_csv('preprocessed_dataFile_A.csv', index_col=0, parse_dates=True).drop('num_lab_procedures', axis=1)
# Column held at Data Site A whose states are scored against B's features.
A_feature = 'num_procedures'
state_names_A = give_state_names(df_A, A_feature)
state_names_A

{'num_procedures': [0, 1, 2, 3, 4, 5, 6]}

##### At Data Site B without Target Class #####

In [28]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True is its documented equivalent.
df_B = pd.read_csv('preprocessed_dataFile_B.csv', index_col=0, parse_dates=True).drop(['diag_1','diag_2','diag_3'], axis=1)
# Columns held at Data Site B that act as the parent features.
B_feature = ['max_glu_serum', 'A1Cresult']
state_names_B = give_state_names(df_B, B_feature)
state_names_B

{'A1Cresult': [0, 1, 2, 3], 'max_glu_serum': [0, 1, 2, 3]}

In [37]:
# Encode every feature of site B against its own sorted state list,
# keeping the order of B_feature.
conv_features_B = [
    toBinary(state_names_B[name], df_B[name])
    for name in B_feature
]

In [41]:
# Assume B has two features
# For each record, the outer product of its two encoded rows is flattened
# into a single vector with one entry per (feature 0 state, feature 1 state)
# pair, in row-major order.
status_local_matrix = []
if len(conv_features_B) > 1:
    status_local_matrix = [
        list(np.outer(conv_features_B[0][record], conv_features_B[1][record]).ravel())
        for record in range(len(conv_features_B[0]))
    ]

### Secure scalar product ### 

The status_local_matrix, with noise added, has to be sent to Data Site A.

In [42]:
##### At Data Site B (add noises to the matrix) #####
X_a = pd.DataFrame.from_records(status_local_matrix) # .transpose()
len_A = len(X_a.columns)

# One random integer vector (values 0-4) per column of X_a, with one entry
# per row.  These are drawn before C_matrix, so the draw order is unchanged.
A_randoms = [
    np.random.randint(0, 5, len(X_a.iloc[:, col]))
    for col in range(len_A)
]

# One square random integer matrix (values 0-4) per column of X_a, sized
# rows x rows.  Per the original note, this is shared between A and B.
C_matrix = [
    np.random.randint(0, 5, (len(X_a.iloc[:, col]), len(X_a.iloc[:, col])))
    for col in range(len_A)
]

# Masked columns, X_a[:, col] + C_matrix[col] . A_randoms[col]; this is the
# payload that gets sent to the other site.
Sum_noises_A = [
    np.add(X_a.iloc[:, col], np.dot(C_matrix[col], A_randoms[col]))
    for col in range(len_A)
]

##### Data Site A receives noised B_Matrix #####

In [43]:
X_b = pd.DataFrame.from_records(toBinary(state_names_A[A_feature], df_A[A_feature]))
len_B = len(X_b.columns)
# Each random value drawn below is reused for this many consecutive rows.
B_divide_set = 10

# Sum_coef_B[i][j] = C_matrix[j]^T . X_b[:, i]
Sum_coef_B = [
    [np.dot(C_matrix[j].transpose(), X_b.iloc[:, i]) for j in range(len_A)]
    for i in range(len_B)
]

# One short random vector (values 0-4) per column of X_a, holding one value
# per group of B_divide_set rows.
B_random_set = [
    np.random.randint(0, 5, int(len(X_b.iloc[:, 0]) / B_divide_set))
    for _ in range(len_A)
]

# Sum_noises_B[n][i] = Sum_coef_B[n][i] + B_random_set[i] with every value
# repeated B_divide_set times; this is what gets sent to the other site.
# NOTE(review): the addition only lines up when the row count is a multiple
# of B_divide_set; confirm the input data guarantees that.
Sum_noises_B = [
    [
        Sum_coef_B[n][i]
        + [value for value in B_random_set[i] for _ in range(B_divide_set)]
        for i in range(len_A)
    ]
    for n in range(len_B)
]

# Add noises dataset A to the dataset B
# Sum_noises_AB[i][j] = Sum_noises_A[j] . X_b[:, i]
Sum_noises_AB = [
    [np.dot(Sum_noises_A[j], X_b.iloc[:, i]) for j in range(len_A)]
    for i in range(len_B)
]

##### Back to Data Site B #####

In [44]:
# A_randoms_Sumset[i][j] is the sum of A_randoms[i] over the j-th group of
# B_divide_set consecutive rows (rows past the last full group are ignored).
A_randoms_Sumset = [
    [
        sum(A_randoms[i][B_divide_set * j + k] for k in range(B_divide_set))
        for j in range(int(len(X_a) / B_divide_set))
    ]
    for i in range(len_A)
]

# Sum_noises_B_Arand[n][i] = A_randoms[i] . Sum_noises_B[n][i]
Sum_noises_B_Arand = [
    [np.dot(A_randoms[i], Sum_noises_B[n][i]) for i in range(len_A)]
    for n in range(len_B)
]

##### At Data Site A to calculate final result #####

In [45]:
# rand_sums[i] = sum over groups j of A_randoms_Sumset[i][j] * B_random_set[i][j]
rand_sums = [
    sum(
        A_randoms_Sumset[i][j] * B_random_set[i][j]
        for j in range(len(B_random_set[0]))
    )
    for i in range(len_A)
]

# outcomes[n][i] combines the masked product with the two correction terms:
# Sum_noises_AB - Sum_noises_B_Arand + rand_sums.
outcomes = [
    [
        Sum_noises_AB[n][i] - Sum_noises_B_Arand[n][i] + rand_sums[i]
        for i in range(len_A)
    ]
    for n in range(len_B)
]

In [46]:
# Column header: every combination of the B features' states, in the same
# order as B_feature.
header_list = [state_names_B[name] for name in B_feature]
header = pd.MultiIndex.from_product(header_list, names=B_feature)

# One row per entry of `outcomes`, one column per combination of B states.
outcomes_state_counts_df = pd.DataFrame.from_records(outcomes, columns=header)
outcomes_state_counts_df.index.name = A_feature

##### Final scoring #####
* The following part should preferably be done at a Trusted Third Party (TTP), because the state-counts table is revealed to whoever performs it.
* If no TTP exists, it can be done at Data Site A, which holds the target class (this is safer than letting Data Site B do the final calculation).

In [52]:
var_states = state_names_A[A_feature]
var_cardinality = len(var_states)
state_counts = outcomes_state_counts_df

# Accumulate the score column by column; each column of the counts table is
# one configuration of the parent states.
score = 0
for parents_state, counts in state_counts.items():  # only 1 column if no parents
    conditional_sample_size = sum(counts)
    score += lgamma(var_cardinality) - lgamma(conditional_sample_size + var_cardinality)

    for state in var_states:
        observed = counts[state]
        # Only states that were actually observed add a term.
        if observed > 0:
            score += lgamma(observed + 1)
print(score)

-5041.432981647914


In [51]:
# # Check with centralized K2 algorithm: 
# df_ctr = pd.concat([df_A, df_B], axis=1)
# k2 = K2Score(df_ctr)
# print(k2.local_score(A_feature, parents=B_feature))

-5041.432981647914
