In [1]:
import pandas as pd
from scipy import interp
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score
from scipy import stats

In [2]:
# Read the output from JAGS
model = pd.read_csv('dataset/model.csv')
model = model[model.columns[0:8]]
model.head()

Unnamed: 0,beta0,beta[1],beta[2],beta[3],beta[4],beta[5],beta[6],beta[7]
0,-4.79368,0.013326,0.000791,4.5e-05,0.000557,0.204492,0.230418,0.02662
1,-4.16414,0.010172,0.000816,2.9e-05,0.000439,-0.138506,-0.00452,-0.06479
2,-4.61404,0.01193,0.000726,0.000189,0.000492,0.305254,0.03062,0.042974
3,-4.70308,0.011693,0.000938,0.00014,0.000454,0.17456,0.282329,-0.037926
4,-5.01021,0.016935,0.000637,-1.3e-05,0.000553,0.482652,-0.014611,0.166988


In [3]:
# Read the test set
test = pd.read_csv('dataset/test.csv').drop('Unnamed: 0',axis=1)
cols = ['runtime', 'actor_name_score', 'studio_score','crew_member_score', 
          'season_autumn', 'season_spring','season_summer']
test = test[cols]
test.head()

Unnamed: 0,runtime,actor_name_score,studio_score,crew_member_score,season_autumn,season_spring,season_summer
0,148,1547.013755,1102.121751,6859.013397,1,0,0
1,165,5188.166899,2155.532879,10077.12577,0,0,1
2,132,1336.034036,574.998727,5118.766659,0,1,0
3,141,3169.442028,185.550221,4803.724619,0,1,0
4,151,3423.428882,2154.021479,5211.644889,0,1,0


In [4]:
# Thanks to Pedro Uria and Sean Pelli.

from time import time
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score

USE_MODE, USE_MODE_THRESHOLD = False, 0.5  # This means use mode as coefficient and do regular logistic regression.
# logits greater or equal than THRESHOLD are considered to be fake reviews
USE_SAMPLES = True  # This means getting one z for each sampled betas, then use the mode of distribution and apply
APPLY_MODE, APPLY_MEAN, APPLY_MAX, USE_SAMPLES_THRESHOLD = True, False, False, 0.5  # sigmoid on this (APPLY_MODE)
# Another option is getting one logit for each sample and using the mean (APPLY_MEAN)
# Another option is getting one prediction for each sample and then using the mode (APPLY_MAX)

# TODO: Use logits from USE_SAMPLES APPLY_MEAN before taking the mean, and train a logistic regression with that (crazy I know)

BETA_DIR_NAME = "results"
# This FEATURES needs to correspond with the betas (first element is beta1, second is beta2, etc...)
FEATURES = ['runtime', 'actor_name_score', 'studio_score','crew_member_score', 'season_autumn', 
            'season_spring','season_summer']

data_test = pd.read_csv('dataset/test.csv')
y = data_test["target"].values
priors = pd.read_csv('dataset/model.csv')
betas_samples = {"Intercept": priors["beta0"].values}
for i in range(len(FEATURES)):
    betas_samples[FEATURES[i]] = priors["beta["+str(i+1)+"]"].values

if USE_MODE:

    betas_modes = {}
    for key in betas_samples.keys():
        betas_modes[key] = stats.mode(betas_samples[key])[0][0]

    z = betas_modes["Intercept"]
    for key in FEATURES:
        z += betas_modes[key] * data_test[key].values
    logits = 1 / (1 + np.exp(-z))

    logits_copy = np.copy(logits)
    logits_copy[logits_copy >= USE_MODE_THRESHOLD] = 1
    logits_copy[logits_copy < USE_MODE_THRESHOLD] = 0

    print("Accuracy:", accuracy_score(y, logits_copy), ", Recall:", recall_score(y, logits_copy))
    print(pd.crosstab(y, logits_copy, rownames=['True'], colnames=['Predicted'], margins=True))

    while input("You used {} threshold... Would you like to get the results for another one? ([y]/n) ".format(USE_MODE_THRESHOLD)) != "n":
        USE_MODE_THRESHOLD = float(input("Enter threshold: "))
        logits_copy = np.copy(logits)
        logits_copy[logits_copy >= USE_MODE_THRESHOLD] = 1
        logits_copy[logits_copy < USE_MODE_THRESHOLD] = 0
        print("Accuracy:", accuracy_score(y, logits_copy), ", Recall:", recall_score(y, logits_copy))
        print(pd.crosstab(y, logits_copy, rownames=['True'], colnames=['Predicted'], margins=True))


if USE_SAMPLES:

    start = time()

    # betas_samples (as a Numpy Array) is 15000 x (len(FEATURES) + 1)
    # data_test (as a Numpy Array) is 18463 x len(FEATURES)
    b_samples = np.empty((len(betas_samples[FEATURES[0]]), (1 + len(FEATURES))))
    for i, key in enumerate(betas_samples.keys()):
        b_samples[:, i] = betas_samples[key]
    z_samples = b_samples[:, 0] + np.dot(data_test[FEATURES].values, b_samples[:, 1:].T)

    if APPLY_MODE:
        z_modes = stats.mode(z_samples.T)[0]
        logits = 1 / (1 + np.exp(-z_modes))
    if APPLY_MEAN:
        logits_samples = 1 / (1 + np.exp(-z_samples))
        logits = np.mean(logits_samples, axis=1)
    if APPLY_MAX:
        logits_samples = 1 / (1 + np.exp(-z_samples))
        logits_samples_copy = np.copy(logits_samples)
        logits_samples_copy[logits_samples_copy >= USE_SAMPLES_THRESHOLD] = 1
        logits_samples_copy[logits_samples_copy < USE_SAMPLES_THRESHOLD] = 0
        y_pred = stats.mode(logits_samples_copy.T)[0].reshape(-1)
    else:
        logits_copy = np.copy(logits)
        logits_copy[logits_copy >= USE_SAMPLES_THRESHOLD] = 1
        logits_copy[logits_copy < USE_SAMPLES_THRESHOLD] = 0
        y_pred = logits_copy.reshape(-1)

    print(time() - start)
    print("Accuracy:", accuracy_score(y, y_pred), ", Recall:", recall_score(y, y_pred))
    print(pd.crosstab(y, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

    while input("You used {} threshold... Would you like to get the results for another one? ([y]/n) ".format(USE_SAMPLES_THRESHOLD)) != "n":
        USE_SAMPLES_THRESHOLD = float(input("Enter threshold: "))
        if APPLY_MAX:
            logits_samples_copy = np.copy(logits_samples)
            logits_samples_copy[logits_samples_copy >= USE_SAMPLES_THRESHOLD] = 1
            logits_samples_copy[logits_samples_copy < USE_SAMPLES_THRESHOLD] = 0
            y_pred = stats.mode(logits_samples_copy.T)[0].reshape(-1)
            score = roc_auc_score(y, y_pred)
        else:
            logits_copy = np.copy(logits)
            logits_copy[logits_copy >= USE_SAMPLES_THRESHOLD] = 1
            logits_copy[logits_copy < USE_SAMPLES_THRESHOLD] = 0
            y_pred = logits_copy.reshape(-1)
            score = roc_auc_score(y, y_pred)
        print("Accuracy:", accuracy_score(y, y_pred), ", Recall:", recall_score(y, y_pred))
        print(pd.crosstab(y, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
        print('AUC Score: {}'.format(score))

0.9966771602630615
Accuracy: 0.7340529931305201 , Recall: 0.11643835616438356
Predicted  0.0  1.0   All
True                     
0          714   13   727
1          258   34   292
All        972   47  1019
You used 0.5 threshold... Would you like to get the results for another one? ([y]/n) 0.1
Enter threshold: 0.01
Accuracy: 0.30225711481844947 , Recall: 1.0
Predicted  0.0   1.0   All
True                      
0           16   711   727
1            0   292   292
All         16  1003  1019
AUC Score: 0.5110041265474553
You used 0.01 threshold... Would you like to get the results for another one? ([y]/n) y
Enter threshold: 0.08
Accuracy: 0.718351324828263 , Recall: 0.6643835616438356
Predicted  0.0  1.0   All
True                     
0          538  189   727
1           98  194   292
All        636  383  1019
AUC Score: 0.7022055359801022
You used 0.08 threshold... Would you like to get the results for another one? ([y]/n) n
