In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier,PassiveAggressiveClassifier
from sklearn.metrics import mean_squared_error,f1_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid,RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC,NuSVC,LinearSVC
from sklearn.tree import DecisionTreeClassifier

from functions import to_one_hot
# Scorer names passed to cross_validate(scoring=...). The order is significant:
# the evaluation cells emit one result row per scorer, in this order.
# NOTE(review): some of these (e.g. 'neg_log_loss', 'neg_brier_score') need
# probability estimates, which not every estimator provides — confirm how
# failures are handled by the installed scikit-learn version.
scorings = [
    'accuracy',
    'balanced_accuracy',
    'average_precision',
    'neg_brier_score',
    'f1',
    'f1_micro',
    'f1_macro',
    'f1_weighted',
    'neg_log_loss',
    'precision',
    'recall',
    'jaccard',
    'roc_auc',
    'roc_auc_ovr',
    'roc_auc_ovo',
    'roc_auc_ovr_weighted',
    'roc_auc_ovo_weighted',
]
# Candidate models evaluated on every dataset. Entries are paired by position
# with the display labels in `names`, so keep both lists in the same order.
classifiers = [
    KNeighborsClassifier(3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    RidgeClassifier(),
    SGDClassifier(max_iter=1000, tol=1e-3),
    PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3),
    NearestCentroid(),
    RadiusNeighborsClassifier(radius=1.0),
    # The voting members used to be referenced as clf1/clf2/clf3, which are
    # never defined in this notebook (NameError on a fresh kernel). They are
    # built inline here, following the scikit-learn VotingClassifier example.
    VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=1)),
            ('rf', RandomForestClassifier(n_estimators=50, random_state=1)),
            ('gnb', GaussianNB()),
        ],
        voting='hard',
    ),
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2, and the
    # positional form works with both spellings.
    BaggingClassifier(SVC(),n_estimators=10, random_state=0),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    NuSVC(),
    LinearSVC(random_state=0, tol=1e-5),
]
# Display labels for the models, matched by position with `classifiers`.
names = [
    "Nearest Neighbors",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "Ridge",
    "SGD",
    "PassiveAggressive",
    "NearestCentroid",
    "Radius Neighbors",
    "Voting",
    "Bagging",
    "ExtraTrees",
    "GradientBoosting",
    "Linear SVM",
    "RBF SVM",
    "Nu SVM",
    "Linear SVM2",
]
# Parallel result columns, filled one entry per (dataset, classifier, scorer)
# by the evaluation cells and later assembled into the `results` DataFrame.
# Each name is bound to its own, independent empty list.
classifier, score_name, scoring, scoring_avg, data = [], [], [], [], []

In [None]:
# Author's shortlist for this dataset: QDA, Naive Bayes, AdaBoost, Neural Net
# NOTE(review): presumably the best-performing models — confirm.
per_player=pd.read_csv('per_player.csv')

# Histogram of the 'MSE' column.
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('MSE individuals per player')
ax.grid()
ax.set_xticks(np.arange(0,1, 0.05))
sns.histplot(data=per_player, x='MSE',bins=210, ax=ax)

# Features/target are loop-invariant, so build them once instead of once per model.
features=per_player.drop(columns=['good_bad','MSE','MSE_var'])
target=per_player['good_bad']

# Cross-validate every model and record one row per (model, scorer).
# Rows are appended to the shared result lists, so re-running this cell
# adds duplicate rows.
for name, clf in zip(names, classifiers):
    scores=cross_validate(clf, features, target, scoring=scorings)
    for score in scorings:
        fold_scores=scores['test_'+str(score)]
        data.append("individuals per player")
        classifier.append(name)
        score_name.append(score)
        scoring.append(fold_scores)
        # Average over however many folds were run, rather than assuming 5.
        scoring_avg.append(fold_scores.mean())

In [None]:
# Author's shortlist for this dataset: Naive Bayes, Random Forest, QDA
# NOTE(review): presumably the best-performing models — confirm.
per_game=pd.read_csv('per_game.csv')

# Histogram of the 'MSE' column.
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('MSE individuals per game')
ax.grid()
ax.set_xticks(np.arange(0,1, 0.05))
sns.histplot(data=per_game, x='MSE',bins=210, ax=ax)

# Features/target are loop-invariant, so build them once instead of once per model.
features=per_game.drop(columns=['good_bad','MSE','MSE_var'])
target=per_game['good_bad']

# Cross-validate every model and record one row per (model, scorer).
# Rows are appended to the shared result lists, so re-running this cell
# adds duplicate rows.
for name, clf in zip(names, classifiers):
    scores=cross_validate(clf, features, target, scoring=scorings)
    for score in scorings:
        fold_scores=scores['test_'+str(score)]
        data.append("individuals per game")
        classifier.append(name)
        score_name.append(score)
        scoring.append(fold_scores)
        # Average over however many folds were run, rather than assuming 5.
        scoring_avg.append(fold_scores.mean())

In [None]:
# Author's shortlist for this dataset: AdaBoost, Random Forest, Decision Tree, Gaussian Process
# NOTE(review): presumably the best-performing models — confirm.
real=pd.read_csv('real.csv')
# Keep only games with GameID < 211 or 270 < GameID < 481.
# NOTE(review): the meaning of these GameID ranges is not visible here — confirm.
real=real[(real['GameID']<211)|((270<real['GameID'])&(real['GameID']<481))]

# Histogram of the 'MSE' column.
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('aggregate')
ax.grid()
ax.set_xticks(np.arange(0,1, 0.01))
sns.histplot(data=real, x='MSE',bins=210, ax=ax)

# Features/target are loop-invariant, so build them once instead of once per model.
features=real.drop(columns=['good_bad','MSE'])
target=real['good_bad']

# Cross-validate every model and record one row per (model, scorer).
# Rows are appended to the shared result lists, so re-running this cell
# adds duplicate rows.
for name, clf in zip(names, classifiers):
    scores=cross_validate(clf, features, target, scoring=scorings)
    for score in scorings:
        fold_scores=scores['test_'+str(score)]
        data.append("aggregate")
        classifier.append(name)
        score_name.append(score)
        scoring.append(fold_scores)
        # Average over however many folds were run, rather than assuming 5.
        scoring_avg.append(fold_scores.mean())

In [None]:
# Assemble the accumulated result columns into one table and persist it.
# Only rows appended before this cell runs end up in the CSV.
results = pd.DataFrame({
    'classifier': classifier,
    'score_name': score_name,
    'scoring': scoring,
    'scoring avg': scoring_avg,
    'data': data,
})
results.to_csv('results.csv', index=False)

In [None]:
# Author's shortlist for this dataset: Naive Bayes, Neural Net, QDA, Random Forest
# NOTE(review): presumably the best-performing models — confirm.
# NOTE(review): results.csv is written by the preceding cell, so the rows
# appended here are not exported unless that cell is run again afterwards.
df=pd.read_csv('per_game_and_player.csv')

# Histogram of the 'MSE' column.
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('MSE individuals per game and player')
ax.grid()
ax.set_xticks(np.arange(0,1, 0.05))
sns.histplot(data=df, x='MSE',bins=210, ax=ax)

# Features/target are loop-invariant, so build them once instead of once per model.
# The B.* columns, their baselines, the label and the MSE are excluded from the features.
features=df.drop(columns=['B.1','B.1_baseline','B.2','B.2_baseline','B.3','B.3_baseline','B.4','B.4_baseline','B.5','B.5_baseline','good_bad','MSE'])
target=df['good_bad']

# Cross-validate every model and record one row per (model, scorer).
# Rows are appended to the shared result lists, so re-running this cell
# adds duplicate rows.
for name, clf in zip(names, classifiers):
    scores=cross_validate(clf, features, target, scoring=scorings)
    for score in scorings:
        fold_scores=scores['test_'+str(score)]
        data.append("individuals per game and player")
        classifier.append(name)
        score_name.append(score)
        scoring.append(fold_scores)
        # Average over however many folds were run, rather than assuming 5.
        scoring_avg.append(fold_scores.mean())

In [None]:
# Load the test set and keep only rows where player_id_in_group == 2.
df=pd.read_csv('test.csv')
# .copy() makes the filtered frame independent of the full one; new columns are
# assigned to it further down, which on a plain slice triggers pandas'
# SettingWithCopyWarning.
df=df[df['player_id_in_group']==2].copy()
def TR_average(participant_code,hotel_id,df):
    """Mean of 'TR' over the rows of `df` for `hotel_id` from all other participants.

    Rows whose 'participant_code' equals `participant_code` are excluded.
    Returns NaN when no row matches.
    """
    other_participants = df['participant_code'] != participant_code
    same_hotel = df['hotel_id'] == hotel_id
    return df.loc[other_participants & same_hotel, 'TR'].mean()

def TR_majorty(TR_avg):
    """Turn an average 'TR' value into a majority vote.

    Returns 1 above 0.5 and 0 below 0.5. Any other value (exactly 0.5, or a
    value such as NaN that compares false both ways) is resolved by a random
    draw from {0, 1}.
    """
    if TR_avg > 0.5:
        return 1
    if TR_avg < 0.5:
        return 0
    # Tie-break: a single random draw from {0, 1}.
    drawn = random.sample(range(2), 1)
    return drawn[0]
def choice_rate(participant_code,subsession_round_number,column,df):
    """Mean of `column` over the participant's rows in later rounds.

    Only rows of `df` with the same 'participant_code' and a
    'subsession_round_number' strictly greater than the given one are used.
    Returns NaN when there is no such row.
    """
    same_participant = df['participant_code'] == participant_code
    later_round = df['subsession_round_number'] > subsession_round_number
    return df.loc[same_participant & later_round, column].mean()
def to_bin(CR):
    """Map a rate to a bin number 1-4 with cut points 0.25, 0.5 and 0.75.

    A value goes into the first bin whose upper bound it is strictly below;
    anything else (including values that compare false, such as NaN) lands
    in bin 4.
    """
    for bin_number, upper_bound in enumerate((0.25, 0.5, 0.75), start=1):
        if CR < upper_bound:
            return bin_number
    return 4
def is_same(true,pred):
    """Return 1 when `true` equals `pred`, otherwise 0."""
    return 1 if true == pred else 0
def is_same_and_label(same,label):
    """Product of two flags: 1 only when both `same` and `label` are 1 (for 0/1 inputs)."""
    combined = same * label
    return combined
def per_trial(participant_code,subsession_round_number,column,df):
    """Sum of `column` over the participant's rows in later rounds.

    Only rows of `df` with the same 'participant_code' and a
    'subsession_round_number' strictly greater than the given one are used.
    """
    same_participant = df['participant_code'] == participant_code
    later_round = df['subsession_round_number'] > subsession_round_number
    return df.loc[same_participant & later_round, column].sum()
def labels(metric,label,df):
    """Add per-label indicator columns for a prediction column and its ground truth.

    The ground-truth column name is `metric` without its last 8 characters
    (the callers in this notebook pass names ending in '_majorty' or
    '_average'). Three 0/1 columns are written into `df`:

    * '<truth>_is_<label>'            - ground truth equals `label`
    * '<metric>_is_<label>'           - prediction equals `label`
    * '<metric>_is_same_and_<label>'  - '<metric>_is_same' times the previous flag

    `df` must already contain '<metric>_is_same'. It is modified in place and
    also returned.
    """
    label_suffix = '_is_' + str(label)
    truth_column = metric[:len(metric) - 8]
    predicted_flag = metric + label_suffix

    df[truth_column + label_suffix] = df.apply(
        lambda row: is_same(row[truth_column], label), axis=1)
    df[predicted_flag] = df.apply(
        lambda row: is_same(row[metric], label), axis=1)
    df[metric + '_is_same_and_' + str(label)] = df.apply(
        lambda row: is_same_and_label(row[metric + '_is_same'], row[predicted_flag]), axis=1)
    return df
# 'TR' is the player's answer; the baselines are the mean answer of the other
# participants for the same hotel and the majority vote derived from that mean.
df['TR'] = df['player_answer']
df['TR_average'] = df.apply(
    lambda row: TR_average(row['participant_code'], row['hotel_id'], df), axis=1)
df['TR_majorty'] = df.apply(lambda row: TR_majorty(row['TR_average']), axis=1)

# 'CR*' columns: per-participant mean of the matching 'TR*' column over the later rounds.
for cr_column, tr_column in (('CR', 'TR'), ('CR_average', 'TR_average'), ('CR_majorty', 'TR_majorty')):
    df[cr_column] = df.apply(
        lambda row, tr_column=tr_column: choice_rate(
            row['participant_code'], row['subsession_round_number'], tr_column, df),
        axis=1)

# 'CR_bin*' columns: each rate mapped to its bin number.
for bin_column, cr_column in (('CR_bin', 'CR'), ('CR_bin_average', 'CR_average'), ('CR_bin_majorty', 'CR_majorty')):
    df[bin_column] = df.apply(lambda row, cr_column=cr_column: to_bin(row[cr_column]), axis=1)
# Evaluate the majority baseline against the actual 'TR' answers.
TR_F1 = 0
TR_bins_F1 = []

# Per-row hit flag, then the number of hits over each participant's later rounds.
df['TR_majorty_is_same'] = df.apply(
    lambda row: is_same(row['TR'], row['TR_majorty']), axis=1)
df['TR_majorty_is_same_per_trial'] = df.apply(
    lambda row: per_trial(
        row['participant_code'], row['subsession_round_number'], 'TR_majorty_is_same', df),
    axis=1)

# Rows from round 10 onwards are left out; the denominator counts 10 - round per row.
no_last_trial = df[df['subsession_round_number'] < 10]
remaining_rounds = (10 - no_last_trial['subsession_round_number']).sum()
TR_accuracy = no_last_trial['TR_majorty_is_same_per_trial'].sum() / remaining_rounds

for label in range(2):
    df = labels('TR_majorty', label, df)
    # Sum each indicator over the later rounds of the same participant.
    for flag_prefix in ('TR_is_', 'TR_majorty_is_', 'TR_majorty_is_same_and_'):
        flag_column = flag_prefix + str(label)
        df[flag_column + '_per_trial'] = df.apply(
            lambda row, flag_column=flag_column: per_trial(
                row['participant_code'], row['subsession_round_number'], flag_column, df),
            axis=1)
    no_last_trial = df[df['subsession_round_number'] < 10]
    correct_hits = no_last_trial['TR_majorty_is_same_and_' + str(label) + '_per_trial'].sum()
    recall = correct_hits / no_last_trial['TR' + '_is_' + str(label) + '_per_trial'].sum()
    precision = correct_hits / no_last_trial['TR_majorty_is_' + str(label) + '_per_trial'].sum()
    F1 = 2 * recall * precision / (recall + precision)
    TR_bins_F1.append(F1)
    # Unweighted mean of the two per-label F1 scores.
    TR_F1 += F1 / 2

print("TR_accuracy:", TR_accuracy, "TR_F1", TR_F1)
print("TR_bins_F1, hotel=0,hotel=1:", TR_bins_F1)
# Evaluate the two choice-rate baselines (average, majority) against 'CR':
# RMSE on the raw rates, and per-bin plus macro-averaged F1 on the binned rates.
CR_RMSE=[]
CR_F1=[0,0]
CR_bins_F1=[[],[]]
metrics=['CR_bin_average','CR_bin_majorty']
for i, metric in enumerate(metrics):
    no_last_trial=df[df['subsession_round_number']<10]
    # 'CR_bin_average' -> 'CR_average', 'CR_bin_majorty' -> 'CR_majorty'
    rate_column='CR_'+metric[len('CR_bin_'):]
    # Take the root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and later removed in scikit-learn,
    # while this form works on every version.
    CR_RMSE.append(np.sqrt(mean_squared_error(no_last_trial[rate_column],no_last_trial['CR'])))
    df[metric+'_is_same']=df.apply(lambda x:is_same(x['CR_bin'],x[metric]),axis=1)
    for label in range(1,5):
        df=labels(metric,label,df)
        no_last_trial=df[df['subsession_round_number']<10]
        correct_hits=no_last_trial[metric+'_is_same_and_'+str(label)].sum()
        recall=correct_hits/no_last_trial['CR_bin_is_'+str(label)].sum()
        precision=correct_hits/no_last_trial[metric+'_is_'+str(label)].sum()
        F1=2*recall*precision/(recall+precision)
        CR_bins_F1[i].append(F1)
        # Unweighted mean of the four per-bin F1 scores.
        CR_F1[i]+=F1/4
# The reported error is a root-mean-squared error, so it is labelled as such.
print("CR_RMSE, avg vs majority:",CR_RMSE,"CR_F1, avg vs majority:",CR_F1)
print('CR_bins_F1, avg vs majority: CR<0.25, 0.25<CR<0.5, 0.5<CR<0.75, 0.75<CR',CR_bins_F1)