In [1]:
import os,sys
import time,datetime
import GPy
import GPyOpt
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.special import expit
import pandas as pd

In [2]:
# Execute the project helper scripts inside this kernel's namespace.
# NOTE(review): PCG5GM and splitDataset523 are used below but not defined in this
# notebook — presumably they are provided by these two scripts; confirm.
%run ../tools/TrainTest.py
%run ../tools/PCG5GM.py

In [3]:
# Experiment configuration: trial counts and file locations.
# Number of row permutations generated in the preprocessing cell.
trial_num = 5
# Not referenced in the visible cells — presumably the number of tuning
# iterations used elsewhere; TODO confirm.
tune_trial_num = 100
# Directory that saveGPyOpt writes its snapshot archives into.
snapshot_path = '../result/snapshot/complement'
# Snapshot file-name template: {0} is the fold number, {1} the timestamp
# (both are filled in by saveGPyOpt).
snapshot_form = 'PCG5-hyper6-EI5000w500-split{0}-{1}'
# CSV file loaded with pd.read_csv in the preprocessing cell.
peer_review_csv = '../dataset/open_peer_review/peer_review/peer_review_forPG3_suffled.csv'

# functions
def timeStamp():
    """Return the current local date and time as a ``YYYYmmddHHMMSS`` string.

    The result is 14 digits long (e.g. ``"20170131235959"``) and is used to
    make snapshot file names unique.
    """
    return datetime.datetime.today().strftime("%Y%m%d%H%M%S")

def saveGPyOpt(myBopt,fold_num):
    """Snapshot the evaluated points of an optimisation run to disk.

    ``myBopt.X`` and ``myBopt.Y`` are written, in that order, to a compressed
    ``.npz`` archive inside ``snapshot_path``.  They are passed positionally,
    so ``np.load`` exposes them as ``arr_0`` and ``arr_1``.

    Parameters
    ----------
    myBopt : object
        Anything exposing ``X`` and ``Y`` attributes (presumably a GPyOpt
        optimiser — TODO confirm).
    fold_num : object
        Value substituted into the ``{0}`` slot of ``snapshot_form``; the
        ``{1}`` slot receives the current ``timeStamp()``.
    """
    # np.savez_compressed does not create missing directories, so make sure
    # the target directory exists before writing.
    if not os.path.isdir(snapshot_path):
        os.makedirs(snapshot_path)

    filename = snapshot_form.format(fold_num, timeStamp())
    path = os.path.join(snapshot_path, filename)
    np.savez_compressed(path, myBopt.X, myBopt.Y)

    # NumPy appends ".npz" when the name does not already end with it; report
    # the name of the file that was actually written.
    saved_path = path if path.endswith(".npz") else path + ".npz"
    print("Save: {}".format(saved_path))

In [4]:
# Load the peer-review records and derive the train / validation / test split.
rDF = pd.read_csv(peer_review_csv)
# Fixed seed so the permutations below are identical on every run.
np.random.seed(12345678)
# One row permutation per trial. `range` replaces the Python-2-only `xrange`
# so the cell runs unchanged on Python 2 and Python 3.
shuffled_pattern = [np.random.permutation(len(rDF)) for i in range(trial_num)]
# Only the first permutation is used in this notebook.
df = rDF.take(shuffled_pattern[0]).reset_index(drop=True)
# splitDataset523 is not defined in this notebook — presumably it comes from
# the %run'd tools scripts; it returns the three splits unpacked below.
train, val, test = splitDataset523(df)

In [5]:
# Build the model wrapper from the three dataset splits.
pcg5gm = PCG5GM(train, val, test)
# Disabled call, kept for reference; it uses the same six hyper-parameter
# values as the rmseForTest / aucForVal / fit cells below.
#pcg5gm.rmseForVal([[2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816]])

In [6]:
# Evaluate one hand-picked hyper-parameter vector (six values, wrapped in an
# outer list). Judging by the method name this is the RMSE on the test split —
# confirm against PCG5GM.
pcg5gm.rmseForTest([[2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816]])

1.175514631691905

In [7]:
# Same hyper-parameter vector as above, again wrapped in an outer list.
# Judging by the method name this is the AUC on the validation split —
# confirm against PCG5GM.
pcg5gm.aucForVal([[2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816]])

0.50630604359553844

### ↓ Step-by-step prototype of `rmseForVal(self, hyper_list)` and `aucForVal(self, hyper_list)`

In [6]:
# Fit the model with the same six hyper-parameter values. Note that `fit`
# receives a flat list here, whereas rmseForTest / aucForVal above were given
# the vector nested inside an outer list.
stan_fit = pcg5gm.fit([2.66850218, 1.26965507, 43.3753167, 83.0844218 , 16.08320701, 69.77258816])

In [7]:
# Display the fit object; its repr is the per-parameter posterior summary
# table (mean, quantiles, n_eff, Rhat).
stan_fit

Inference for Stan model: anon_model_d0dbe7b103bf4af91b8922aac85103df.
4 chains, each with iter=5000; warmup=500; thin=1; 
post-warmup draws per chain=4500, total post-warmup draws=18000.

                   mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
ability[0]         2.88  3.7e-3   0.49   1.92   2.53   2.88   3.24    3.8  18000    1.0
ability[1]         2.78  4.9e-3   0.66   1.41   2.34   2.82   3.28   3.89  18000    1.0
ability[2]         2.77  4.8e-3   0.65    1.4   2.33    2.8   3.25   3.86  18000    1.0
ability[3]         3.32  2.5e-3   0.33   2.66    3.1   3.33   3.57   3.92  18000    1.0
ability[4]         3.24  3.6e-3   0.48   2.18   2.92   3.31   3.63   3.95  18000    1.0
ability[5]         3.91  6.7e-4   0.09   3.67   3.87   3.93   3.97    4.0  18000    1.0
ability[6]         2.81  4.4e-3   0.59   1.61   2.41   2.83   3.25   3.87  18000    1.0
ability[7]          2.6  5.2e-3    0.7   1.16   2.12   2.63   3.12   3.83  18000    1.0
ability[8]         

In [8]:
# Take column 0 of the summary matrix for every row; in the table printed by
# `stan_fit` the first column is `mean`, i.e. the posterior mean (EAP) of
# each parameter.
eap_value = stan_fit.summary()['summary'][:,0]

In [9]:
# Slice the flat vector of posterior means into per-parameter arrays.
# This assumes the summary rows are laid out as three consecutive blocks of
# `userNum` entries (ability, reliability, bias) — TODO confirm against the
# parameter order of the Stan model.
split = pcg5gm.userNum
ability = eap_value[0:split]
reliability = eap_value[split:split*2]
bias = eap_value[split*2:split*3]
# NOTE(review): the index is split*3+1, so the entry at split*3 is skipped.
# Verify that another scalar parameter sits at split*3; otherwise this is an
# off-by-one and `noise` picks up the wrong row.
noise = eap_value[split*3+1]

In [10]:
# Inspect the reliability slice.
# NOTE(review): the output recorded below starts with 2.881, 2.784, 2.767,
# 3.329, ... which matches the `ability[0..]` means in the Stan summary above,
# so the recorded output looks stale — re-run on a fresh kernel to confirm.
reliability

array([ 2.88144124,  2.78426255,  2.76684981,  3.32883609,  3.24318326,
        3.91411395,  2.81399914,  2.59978625,  2.15992938,  2.90482345,
        2.6676082 ,  2.24839749,  0.93129262,  2.29070064,  3.77495922,
        2.91994339,  0.86738233,  2.59665952,  2.29521959,  1.48247135,
        2.20900588,  2.65186783,  2.29961561,  2.70945561,  2.51206777,
        2.39542656,  2.78807052,  2.2897294 ,  2.29391829,  2.77681113,
        2.04108936,  0.97274966,  3.20144405,  2.76707591,  4.02148165,
        2.29056094,  2.90295322,  3.04147202,  2.04864812,  3.75335235,
        2.75784333,  2.65105944,  2.49190761,  1.82256157,  0.99455821,
        3.53744792,  2.03712203,  3.65821197,  2.69258131,  2.59587617,
        3.114731  ,  3.14057018,  3.85408837,  2.12553943,  2.91616648,
        3.17152174,  2.59069633,  3.95757625,  0.90388576,  2.73179558,
        3.85399945,  2.35899919,  3.88151025,  2.76846301,  1.83248193,
        2.28763547,  3.2356561 ,  2.73213925,  2.92196583,  2.04

In [11]:
# Sender / receiver index arrays held by the model for the validation split;
# they are used below to index into `bias` and `ability` respectively.
sender = pcg5gm.senderVal
receiver = pcg5gm.receiverVal

In [17]:
# Spot-check: the first (sender, receiver) pair should equal the
# sender_id / receiver_id of the first row of `val`.
sender[0], receiver[0]

(96, 12)

In [18]:
# Show the validation frame to compare its sender_id / receiver_id columns
# with the index arrays above.
val

Unnamed: 0,sender_id,receiver_id,corrected,value
0,96,12,1,3
1,247,151,0,3
2,30,192,0,3
3,21,67,0,4
4,3,227,1,3
5,57,34,0,4
6,165,200,1,1
7,170,140,1,3
8,247,376,1,3
9,34,269,1,1


In [19]:
# Display the full sender index array (NumPy abbreviates long arrays).
sender

array([ 96, 247,  30, ..., 123,  34, 101])

In [20]:
# Predicted value for each validation record: posterior-mean ability of the
# receiver plus posterior-mean bias of the sender (fancy indexing, one entry
# per record).
valueEst = ability[receiver] + bias[sender]

In [21]:
# Root-mean-squared error between the observed validation values and the
# predictions in `valueEst`.
# The targets are read from `pcg5gm`, the model built in this notebook; the
# earlier `pcg3gm` is never defined here, so the cell only worked on leftover
# kernel state and raised NameError after Restart & Run All.
# `sqrt` and `mean_squared_error` come from the import cell at the top of the
# notebook, so they are not re-imported here.
rmse = sqrt(mean_squared_error(pcg5gm.valueVal, valueEst))

In [22]:
# Display the validation RMSE computed in the previous cell.
rmse

1.148471524917538

In [23]:
# Score for the binary `corrected` label: the logistic sigmoid (scipy's
# expit) of receiver ability + sender bias + the scalar `noise` term.
correctedEst = expit(ability[receiver]+bias[sender]+noise)

In [24]:
from sklearn import metrics

# ROC curve and area under it for the predicted scores against the observed
# binary labels of the validation split (label 1 is the positive class).
# The labels are read from `pcg5gm`, the model built in this notebook; the
# earlier `pcg3gm` is never defined here, so the cell only worked on leftover
# kernel state and raised NameError after Restart & Run All.
fpr, tpr, thresholds = metrics.roc_curve(pcg5gm.correctedVal, correctedEst, pos_label=1)
auc = metrics.auc(fpr, tpr)

In [25]:
# Display the validation AUC computed in the previous cell.
auc

0.50616023911934094

In [26]:
# Sanity check: the label vector and the score vector must have the same
# length for the ROC computation.
len(pcg5gm.correctedVal), len(correctedEst)

(1050, 1050)