In [1]:
import os,sys
import time,datetime
from math import sqrt

import GPy
import GPyOpt
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [2]:
# Execute the project helper scripts inside this notebook's namespace.
# The names used below that are not defined in this notebook (splitDataset523,
# PCG1GM) are presumably provided by these scripts — TODO confirm.
%run ../tools/TrainTest.py
%run ../tools/PCG1GM.py

In [3]:
# path information
trial_num = 5  # number of shuffle permutations generated in the preprocessing cell
tune_trial_num = 100  # not referenced in the visible cells; presumably a tuning iteration budget — TODO confirm
snapshot_path = '../result/snapshot/complement'  # directory that saveGPyOpt writes into
# snapshot file-name template: {0} = fold number, {1} = timestamp from timeStamp()
snapshot_form = 'PCG1-hyper6-EI5000w500-split{0}-{1}'
peer_review_csv = '../dataset/open_peer_review/peer_review/peer_review_forPG3_suffled.csv'  # input CSV loaded into rDF

# functions
def timeStamp():
    """Return the current local date and time formatted as 'YYYYmmddHHMMSS'."""
    # datetime.__format__ delegates to strftime, so this yields the same string.
    return "{:%Y%m%d%H%M%S}".format(datetime.datetime.today())

def saveGPyOpt(myBopt,fold_num):
    """Snapshot an optimizer's evaluated points to a compressed .npz file.

    Parameters
    ----------
    myBopt : object exposing ``X`` and ``Y`` attributes
        Presumably a GPyOpt optimizer whose ``X``/``Y`` hold the evaluated
        inputs and objective values — TODO confirm.
    fold_num : value substituted into the ``{0}`` slot of ``snapshot_form``.

    The file is written under ``snapshot_path``; its name combines
    ``snapshot_form``, ``fold_num`` and the current ``timeStamp()``.
    """
    # The write fails if the target directory is missing, so create it first.
    if not os.path.isdir(snapshot_path):
        os.makedirs(snapshot_path)

    filename = snapshot_form.format(fold_num, timeStamp())
    path = os.path.join(snapshot_path, filename)

    # Arrays are passed positionally, so they are stored under NumPy's default
    # keys 'arr_0' (X) and 'arr_1' (Y); kept that way for existing loaders.
    np.savez_compressed(path, myBopt.X, myBopt.Y)

    # NumPy appends '.npz' when the name lacks it; report the file actually written.
    saved_path = path if path.endswith('.npz') else path + '.npz'
    print("Save: {}".format(saved_path))

In [4]:
# preprocess train&test dataset
rDF = pd.read_csv(peer_review_csv)
np.random.seed(12345678)  # fixed seed so the shuffle patterns are reproducible
# `range` rather than the Python-2-only `xrange`: identical result here and
# the cell no longer raises NameError on Python 3.
shuffled_pattern = [np.random.permutation(len(rDF)) for i in range(trial_num)]
# Only the first of the trial_num permutations is used in this notebook.
df = rDF.take(shuffled_pattern[0]).reset_index(drop=True)
# splitDataset523 is defined outside this notebook; it returns the three subsets.
train, val, test = splitDataset523(df)

In [5]:
# Build the model from the train / validation / test splits.
pcg1gm = PCG1GM(train, val, test)
# Every later cell refers to the model as `pc1gm`; bind that name as well so the
# notebook survives Restart & Run All instead of depending on a stale kernel variable.
pc1gm = pcg1gm
# Validation RMSE for one fixed list of six hyperparameter values.
pcg1gm.rmseForVal([2.54785885, 4.62864443, 67.68716059, 22.95702442, 6.96275866, 56.26711153])

0.65955512481291989

In [6]:
# Test-set RMSE for the same six hyperparameter values used in the validation call above.
pc1gm.rmseForTest([2.54785885, 4.62864443, 67.68716059, 22.95702442, 6.96275866, 56.26711153])

0.6495483660333945

### ↓ Prototyping `aucForVal(self, hyper_list)`: computing the validation-set AUC step by step

In [7]:
# Fit the model with the same six hyperparameter values; the returned object
# displays as a Stan inference summary (see the next cell's output).
stan_fit = pc1gm.fit([2.54785885, 4.62864443, 67.68716059, 22.95702442, 6.96275866, 56.26711153])

In [8]:
# Display the fit summary (per-parameter mean, quantiles, n_eff, Rhat).
stan_fit

Inference for Stan model: anon_model_e5a6677c6d3fee842d9c276c0be51e16.
4 chains, each with iter=5000; warmup=500; thin=1; 
post-warmup draws per chain=4500, total post-warmup draws=18000.

               mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
ability[0]     0.24  2.6e-3   0.34  -0.43 2.8e-3   0.24   0.47    0.9  18000    1.0
ability[1]     0.38  2.7e-3   0.36  -0.33   0.13   0.38   0.62   1.08  18000    1.0
ability[2]     0.46  2.6e-3   0.35  -0.23   0.21   0.46    0.7   1.14  18000    1.0
ability[3]     0.52  2.2e-3   0.29  -0.05   0.33   0.52   0.72   1.09  18000    1.0
ability[4]     0.46  2.6e-3   0.35  -0.24   0.22   0.46   0.69   1.15  18000    1.0
ability[5]     0.47  2.5e-3   0.34  -0.19   0.25   0.47    0.7   1.13  18000    1.0
ability[6]     0.42  2.6e-3   0.35  -0.27   0.19   0.42   0.66   1.11  18000    1.0
ability[7]     0.37  2.7e-3   0.36  -0.33   0.13   0.37   0.61   1.09  18000    1.0
ability[8]     0.88  1.8e-3   0.25    0.4   0.72   0.89

In [9]:
# Column 0 of the summary table is the 'mean' column in the printed summary,
# i.e. the posterior mean of every parameter, used here as its point estimate.
eap_value = stan_fit.summary()['summary'][:,0]

In [10]:
# Slice the flat vector of posterior means into per-user and scalar parts:
# the first `split` entries, the next `split` entries, then one scalar.
# Assumes the summary rows are ordered ability[...], bias[...], noise — TODO confirm
# against the Stan model.
split = pc1gm.userNum
ability, bias = eap_value[:split], eap_value[split:2 * split]
noise = eap_value[2 * split]

In [11]:
# Sender / receiver id arrays exposed by the model for the validation set.
sender, receiver = pc1gm.senderVal, pc1gm.receiverVal

In [12]:
# Spot check: the first (sender, receiver) pair should match row 0 of `val`.
sender[0], receiver[0]

(96, 12)

In [13]:
# Preview the validation frame; only the first rows are shown rather than
# dumping the whole table into the notebook output.
val.head(10)

Unnamed: 0,sender_id,receiver_id,corrected,value
0,96,12,1,3
1,247,151,0,3
2,30,192,0,3
3,21,67,0,4
4,3,227,1,3
5,57,34,0,4
6,165,200,1,1
7,170,140,1,3
8,247,376,1,3
9,34,269,1,1


In [14]:
# Inspect the sender id array (NumPy abbreviates long arrays in the repr).
sender

array([ 96, 247,  30, ..., 123,  34, 101])

In [15]:
# Per-review score in (0, 1): logistic sigmoid of
# ability[receiver] + bias[sender] + noise, with the per-user estimates looked up
# by the receiver / sender ids of each validation row.
# Presumably the predicted probability of `corrected` == 1 — TODO confirm against the model.
correctedEst = expit(ability[receiver]+bias[sender]+noise)

In [16]:
# ROC curve and AUC of the predicted scores against the validation labels,
# treating label 1 as the positive class.
# `metrics` comes from the import cell at the top of the notebook, so all
# dependencies are declared in one place.
fpr, tpr, thresholds = metrics.roc_curve(pc1gm.correctedVal, correctedEst, pos_label=1)
auc = metrics.auc(fpr, tpr)

In [17]:
# Display the validation AUC computed in the previous cell.
auc

0.65974381295858731

In [18]:
# Sanity check: the label array and the score array must have the same length.
len(pc1gm.correctedVal), len(correctedEst)

(1050, 1050)