In [1]:
import os 
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable as V
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import logistic_mf as lmf
import random

In [2]:
####load the matrices with ccs and 
data_path = r'/local/home/papageoa/data/mimic/matrix_factorization'


df = pd.read_csv(os.path.join(data_path, 'mf_val.csv.gz'), compression = 'gzip', index_col = 0)
df_train = pd.DataFrame(0, index = np.arange(df.shape[0]), columns = np.arange(df.shape[1]))
df_test = pd.DataFrame(0, index = np.arange(df.shape[0]), columns = np.arange(df.shape[1]))


#### create a mask to split in test and train
mask_test = np.zeros(df.shape, dtype = bool)

for i in range(mask_test.shape[0]):
    z_scores = df.iloc[i].to_numpy()
    candidates = np.where(z_scores != 0)[0]
    experiments = int(np.ceil(0.15 * candidates.shape[0]))
    test_idx = np.random.choice(candidates, experiments, replace = False)
    test_idx
    mask_test[i,test_idx] = 1

df_mask = pd.DataFrame(mask_test)


for i in range(mask_test.shape[0]):
    for j in range(mask_test.shape[1]):
        if mask_test[i,j] == 0:
            df_train.iloc[i,j] = df.iloc[i,j]
        elif mask_test[i,j] == 1:
            df_test.iloc[i,j] = df.iloc[i,j]
        else:
            raise Exception("mask has value {}, but it should have only 0,1".format(mask_test[i,j]))


x_train = df_train.stack().reset_index().rename(columns={'level_0':'ccs','level_1':'item', 0:'rating'}).to_numpy()
x_test = df_test.stack().reset_index().rename(columns={'level_0':'ccs','level_1':'item', 0:'rating'}).to_numpy()

In [3]:
keep_first = 150
keep_last = 150
correlation_matrix = df_train.corr()
single_rules = []
### loop over the ccs codes
for i in range(df_train.shape[0]):
    #### find all_non_zeros elements per ccs code 
    idx = df_train.iloc[i][df_train.iloc[i] != 0]
    #### create the pair-rules
    for j in range(idx.shape[0]):
        for k in range(j + 1, idx.shape[0]):
            tmp_corr = correlation_matrix.iloc[j,k]
            if not pd.isnull(tmp_corr):
                single_rules.append(lmf.rule(i, idx.index[j] * idx.values[j], idx.index[k] * idx.values[k], tmp_corr))
                

single_rules.sort(key=lambda x: x.corr, reverse=True)
#single_rules = single_rules[:keep_first] + single_rules[-keep_last:]

In [14]:
for rule in single_rules:
    rule.print_rule()
print(len(single_rules))

existing rule : ccs 0 m-405 => m-433
existing rule : ccs 4 m333 => m-343
existing rule : ccs 6 m387 => m-441
existing rule : ccs 9 m-350 => m373
existing rule : ccs 10 m-401 => m428
existing rule : ccs 13 m-245 => m-247
existing rule : ccs 17 m201 => m272
existing rule : ccs 18 m273 => m274
existing rule : ccs 19 m402 => m403
existing rule : ccs 20 m403 => m-405
existing rule : ccs 22 m325 => m-326
existing rule : ccs 23 m281 => m-282
existing rule : ccs 24 m368 => m438
existing rule : ccs 26 m394 => m398
existing rule : ccs 31 m-373 => m-388
existing rule : ccs 34 m-348 => m349
existing rule : ccs 35 m402 => m406
existing rule : ccs 36 m-348 => m-350
existing rule : ccs 37 m340 => m354
existing rule : ccs 38 m219 => m-221
existing rule : ccs 41 m368 => m375
existing rule : ccs 44 m215 => m218
existing rule : ccs 47 m417 => m-432
existing rule : ccs 49 m371 => m-407
existing rule : ccs 50 m-291 => m293
existing rule : ccs 51 m-291 => m294
existing rule : ccs 55 m152 => m180
existing ru

In [4]:
used_model = 'logistic_mf'
num_ccs = 281
num_items = 450
num_factors = 40
lr = 0.01
alpha = 0.005

### trian the model for one epoch
matrix_fact = lmf.logic_rules_injection(x_train, x_test, used_model, num_ccs, num_items, num_factors, lr = lr, alpha = alpha)
matrix_fact.train_model(epochs = 10)

#### print train accuracy and AUC
train_idx = np.where(x_train[:,2] != 0)[0]
x_train_tensor = torch.from_numpy(x_train[train_idx]).long().cuda()
matrix_fact.accuracy_statistics(x_train_tensor)
matrix_fact.print_AUC(x_train_tensor)

#### print test accuracy and AUC
test_idx = np.where(x_test[:,2] != 0)[0]
x_test_tensor = torch.from_numpy(x_test[test_idx]).long().cuda()
counts, accuracies = matrix_fact.accuracy_statistics(x_test_tensor)
matrix_fact.print_AUC(x_test_tensor)

0.8337801608579088
[[ 875  272]
 [ 224 1613]]


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


0.9200603311092012
0.7381294964028777
[[172  95]
 [ 87 341]]
0.8162343799222934


In [5]:
used_model = 'logistic_mf'
num_ccs = 281
num_items = 450
num_factors = 40
lr = 0.01
alpha = 0.005
n = 20


### trian the model for one epoch
matrix_fact = lmf.logic_rules_injection(x_train, x_test, used_model, num_ccs, num_items, num_factors, lr = lr, alpha = alpha)
matrix_fact.train_model(epochs = 2)

for epoch in range(8):
    matrix_fact.train_model(epochs = 1)
    matrix_fact.inject_single_rules(single_rules, n)

#### print train accuracy and AUC
train_idx = np.where(x_train[:,2] != 0)[0]
x_train_tensor = torch.from_numpy(x_train[train_idx]).long().cuda()
matrix_fact.accuracy_statistics(x_train_tensor)
matrix_fact.print_AUC(x_train_tensor)

#### print test accuracy and AUC
test_idx = np.where(x_test[:,2] != 0)[0]
x_test_tensor = torch.from_numpy(x_test[test_idx]).long().cuda()
counts_rules, accuracies_rules = matrix_fact.accuracy_statistics(x_test_tensor)
matrix_fact.print_AUC(x_test_tensor)



0.7919847328244275
[[ 874    0  273]
 [  60    0  100]
 [ 221    0 1616]]
0.9199488001883211
0.7381294964028777
[[172  95]
 [ 87 341]]
0.8164268962861844


In [18]:
injections = matrix_fact.x_train[matrix_fact.x_train[:,2] != 0]
injections = injections[injections[:,2] != 1]
injections = injections[injections[:,2] != -1] 
np.unique(injections[:,1])

array([ 97., 120., 124., 189., 280., 289., 293., 295., 316., 317., 321.,
       333., 348., 387.])

In [10]:
injected_rules = matrix_fact.injected_rules
injected_rules.sort(key=lambda x: x.corr, reverse=True)
pi = 1
i = 0
while pi > 0.98:
    injected_rules[i].print_rule()
    i +=1
    pi = injected_rules[i].corr


existing rule : ccs 73 m294 => m97
existing rule : ccs 222 m371 => m295
existing rule : ccs 18 m-283 => m97
existing rule : ccs 18 m-360 => m289
existing rule : ccs 64 m-381 => m289
existing rule : ccs 227 m201 => m289
existing rule : ccs 137 m-321 => m289
existing rule : ccs 185 m-290 => m97
existing rule : ccs 222 m371 => m289
existing rule : ccs 227 m201 => m97


In [11]:
accuracies - accuracies_rules

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  