In [1]:
import os,sys
import time,datetime
from math import sqrt

import GPy
import GPyOpt
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [2]:
%run ../tools/TrainTest.py
%run ../tools/PCG3GM.py

In [3]:
# --- experiment configuration ---
# Input: path of the peer-review CSV loaded by the preprocessing cell.
peer_review_csv = '../dataset/open_peer_review/peer_review/peer_review_forPG3_suffled.csv'

# Output: directory and file-name template used by saveGPyOpt; the two
# placeholders are filled with (fold number, timestamp).
snapshot_path = '../result/snapshot/complement'
snapshot_form = 'PCG3-hyper6-EI5000w500-split{0}-{1}'

# Trial counts. trial_num sets how many shuffle patterns are generated;
# tune_trial_num is not referenced in the cells visible here.
tune_trial_num = 100
trial_num = 5

# functions
def timeStamp():
    """Return the current local date-time formatted as ``YYYYmmddHHMMSS``."""
    # A datetime's format spec is handed to strftime, so this yields the
    # same text as calling .strftime("%Y%m%d%H%M%S") on the value.
    return "{:%Y%m%d%H%M%S}".format(datetime.datetime.today())

def saveGPyOpt(myBopt,fold_num):
    """Save an optimiser's evaluated points and objective values as a compressed ``.npz``.

    The archive is written under ``snapshot_path`` with a file name built
    from ``snapshot_form`` (fold number plus the current timestamp).
    ``myBopt.X`` and ``myBopt.Y`` are passed positionally, so they are
    stored under numpy's default keys ``arr_0`` and ``arr_1``.

    Parameters
    ----------
    myBopt : object exposing ``X`` and ``Y`` attributes
        Presumably a GPyOpt optimiser -- confirm against the caller.
    fold_num : value substituted into the ``{0}`` slot of ``snapshot_form``.

    Returns
    -------
    None. The saved path is printed.
    """
    # Create the snapshot directory on first use, so the save does not fail
    # on a missing folder. (isdir check keeps this valid on Python 2 too.)
    if not os.path.isdir(snapshot_path):
        os.makedirs(snapshot_path)
    filename = snapshot_form.format(fold_num,timeStamp())
    # np.savez_compressed appends ".npz" when the name lacks it; add the
    # suffix explicitly so the reported path is the file that really exists.
    path = os.path.join(snapshot_path, filename + '.npz')
    np.savez_compressed(path, myBopt.X, myBopt.Y)
    print("Save: {}".format(path))

In [4]:
# preprocess train&test dataset
rDF = pd.read_csv(peer_review_csv)
# Fixed seed so the sequence of row permutations is the same on every run.
np.random.seed(12345678)
# One row permutation per trial. `range` is used instead of the
# Python-2-only `xrange` so the cell also runs on Python 3; the number and
# order of the random draws are unchanged.
shuffled_pattern = [np.random.permutation(len(rDF)) for i in range(trial_num)]
# Only the first permutation is applied in this notebook.
df = rDF.take(shuffled_pattern[0]).reset_index(drop=True)
# splitDataset523 is not defined in this notebook -- presumably provided by
# the %run'd tool scripts; confirm there how the three parts are sized.
train, val, test = splitDataset523(df)

In [5]:
# Build the model wrapper from the train / validation / test splits.
# PCG3GM is not defined in this notebook -- presumably it comes from the
# %run'd ../tools/PCG3GM.py; confirm there.
pcg3gm = PCG3GM(train, val, test)
# Kept for reference: one-shot validation RMSE for a fixed list of six hyper-parameter values.
#pcg3gm.rmseForVal([2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816])

In [7]:
#pcg3gm.rmseForTest([2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816])

0.7764230031564122

In [6]:
#pcg3gm.aucForVal([2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816])

0.69818902263828919

### ↓ Building `rmseForVal(self, hyper_list)` and `aucForVal(self, hyper_list)` step by step

In [6]:
# Fit the model once with a fixed list of six hyper-parameter values.
stan_fit = pcg3gm.fit([
    2.66850218,
    1.26965507,
    43.3753167,
    83.0844218,
    16.08320701,
    69.77258816,
])

In [7]:
stan_fit

Inference for Stan model: anon_model_a8999b7f418e7a1eff81e67dfbf27177.
4 chains, each with iter=5000; warmup=500; thin=1; 
post-warmup draws per chain=4500, total post-warmup draws=18000.

                   mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
ability[0]       1.8e-5  1.3e-7 1.8e-5 4.7e-7 5.1e-6 1.2e-5 2.4e-5 6.6e-5  18000    nan
ability[1]         2.58  5.3e-3   0.71   1.07    2.1   2.61   3.11   3.82  18000    1.0
ability[2]          2.0  1.7e-4   0.02   1.96   1.98    2.0   2.02   2.05  18000    1.0
ability[3]       9.1e-5  6.8e-7 9.2e-5 2.1e-6 2.6e-5 6.3e-5 1.3e-4 3.4e-4  18000    nan
ability[4]       2.6e-5  1.9e-7 2.6e-5 7.0e-7 7.2e-6 1.8e-5 3.5e-5 9.5e-5  18000    nan
ability[5]       4.9e-6  3.6e-8 4.8e-6 1.3e-7 1.5e-6 3.5e-6 6.7e-6 1.8e-5  18000    nan
ability[6]       1.1e-4  8.3e-7 1.1e-4 2.9e-6 3.3e-5 7.9e-5 1.5e-4 4.1e-4  18000    nan
ability[7]         2.59  5.3e-3   0.71   1.11    2.1   2.61   3.12   3.84  18000    1.0
ability[8]         

In [8]:
# Take column 0 of the fit's summary matrix for every parameter row.
# In the printed fit the first column is `mean`, so this is presumably the
# posterior mean of each parameter -- confirm the column order.
eap_value = stan_fit.summary()['summary'][:,0]

In [11]:
# Slice the flat vector of point estimates into per-parameter pieces.
# Layout implied by the slicing: userNum ability entries, then userNum bias
# entries, then a single noise entry, followed by further entries.
split = pcg3gm.userNum
ability = eap_value[0:split]
bias = eap_value[split:split*2]
# Scalar term; later cells add it as a constant offset inside expit().
noise = eap_value[split*2]
# The entries after the scalar are stored under their own name so they do
# not overwrite `noise` with an array.
# NOTE(review): what these entries represent is not visible in this
# notebook -- confirm against the Stan model's parameter order.
trailing_params = eap_value[split*2+1:split*3+1]

In [12]:
noise

-1.9988032554678461

In [13]:
# Sender and receiver arrays of the validation split; used below to index
# the per-user estimates.
sender, receiver = pcg3gm.senderVal, pcg3gm.receiverVal

In [14]:
sender[0], receiver[0]

(96, 12)

In [15]:
val

Unnamed: 0,sender_id,receiver_id,corrected,value
0,96,12,1,3
1,247,151,0,3
2,30,192,0,3
3,21,67,0,4
4,3,227,1,3
5,57,34,0,4
6,165,200,1,1
7,170,140,1,3
8,247,376,1,3
9,34,269,1,1


In [16]:
sender

array([ 96, 247,  30, ..., 123,  34, 101])

In [17]:
# Predicted value for each validation row: the receiver's ability estimate
# plus the sender's bias estimate (element-wise over the index arrays).
valueEst = ability[receiver] + bias[sender]

In [18]:
# Root-mean-squared error between pcg3gm.valueVal and the predicted values.
# mean_squared_error and sqrt are already imported in the notebook's first
# cell, so they are not re-imported here.
rmse = sqrt(mean_squared_error(pcg3gm.valueVal, valueEst))

In [19]:
rmse

0.7725239611753643

In [20]:
# Estimated probability per validation row: logistic sigmoid (scipy's expit)
# of receiver ability + sender bias + the noise term.
correctedEst = expit(ability[receiver]+bias[sender]+noise)

In [22]:
# ROC curve of the estimated probabilities against pcg3gm.correctedVal
# (label 1 treated as the positive class), then the area under that curve.
# `metrics` is imported with the other dependencies in the first cell.
fpr, tpr, thresholds = metrics.roc_curve(pcg3gm.correctedVal, correctedEst, pos_label=1)
auc = metrics.auc(fpr, tpr)

In [23]:
auc

0.69801748796040974

In [25]:
len(pcg3gm.correctedVal), len(correctedEst)

(1050, 1050)