In [8]:

import json
import os
import random

import pandas as pd

from utils import show_rank_metrics

# Seed Python's built-in `random` module so the `random.random()` draws made
# later in this notebook are repeatable when the cells run top to bottom.
# NOTE(review): this seeds only the stdlib generator; NumPy/pandas sampling
# (e.g. `DataFrame.sample`) is not controlled by this call.
random.seed(0)

In [9]:
# Load the train / test / valid splits. Each CSV's first column becomes the
# index; the 'correct' column is the label and 'name' is the only feature kept.
#
# Train and test rows are shuffled. A fixed `random_state` is passed because
# `random.seed(...)` does not control pandas sampling: without it the row
# order (and anything downstream that is order-sensitive) would change on
# every Restart & Run All.
train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=0)
y_train = train['correct']
X_train = train[['name']]

test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=0)
y_test = test['correct']
X_test = test[['name']]


# The validation split is kept in file order (no shuffling).
valid = pd.read_csv('valid.csv', index_col=0)
y_valid = valid['correct']
X_valid = valid[['name']]


In [10]:
# Parse the JSON lookup file from the current working directory (a relative
# path resolves against the cwd). The code that consumes this object iterates
# its keys and treats each value as an ordered sequence of names.
with open('FirstCharToNameList.txt') as f:
    firstCharToNameList = json.load(f)

# Number of top-level keys in the mapping.
len(firstCharToNameList)

25

In [11]:
# Turn each name's position within its list into a score in (0, 1]:
# the first name of a list of length N gets 1.0, the last gets 1/N.
# A name that occurs more than once keeps the score of its last occurrence,
# because later assignments overwrite earlier ones.
nameToProbability = {}
for ch, names in firstCharToNameList.items():
    N = len(names)
    for i, name in enumerate(names):
        nameToProbability[name] = 1.0 - float(i) / N


def getProba(name):
    """Return the score stored for `name` in `nameToProbability`.

    Names that are not present in the lookup table score 0.
    """
    return nameToProbability.get(name, 0)


In [12]:
print("Popular test")
# Lookup-table baseline on the test split: score every test name with
# getProba, then pass group ids, scores and labels to show_rank_metrics as
# three single-column frames that share a fresh 0..n-1 index (going through
# plain lists drops the shuffled CSV index).
show_rank_metrics(
    df_group=pd.DataFrame({'group': test['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': list(map(getProba, X_test['name']))}),
    df_true=pd.DataFrame({'correct': y_test.tolist()}),
)

Popular test
(218764, 3)

mean = 13.347278345614452

top0 = 0.0
top1 = 0.24934632754932257
top2 = 0.2531495127169004
top3 = 0.2590919895412408
top4 = 0.2703826955074875
top5 = 0.2840503922034704
top6 = 0.29985738055621586
top7 = 0.31863560732113144
top8 = 0.34133586879011174
top9 = 0.3637984311861184
top10 = 0.39279771808889946
top11 = 0.41894461611599715
top12 = 0.4462800095079629
top13 = 0.47777513667696697
top14 = 0.5093891133824578
top15 = 0.5394580461136201
top16 = 0.5754694556691229
top17 = 0.6099358212502971
top18 = 0.644639885904445
top19 = 0.6794628000950796
top20 = 0.7136914666032802
top21 = 0.7455431423817447
top22 = 0.7770382695507487
top23 = 0.8038982647967673
top24 = 0.8301640123603518
top25 = 0.848585690515807
top26 = 1.0


In [13]:
print("Popular valid")
# Same lookup-table baseline, evaluated on the validation split: score each
# validation name with getProba and pass group ids, scores and labels to
# show_rank_metrics as three single-column frames with a fresh 0..n-1 index.
show_rank_metrics(
    df_group=pd.DataFrame({'group': valid['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': list(map(getProba, X_valid['name']))}),
    df_true=pd.DataFrame({'correct': y_valid.tolist()}),
)

Popular valid
(113854, 3)

mean = 13.08312400091345

top0 = 0.0
top1 = 0.27289335464717973
top2 = 0.275405343685773
top3 = 0.2845398492806577
top4 = 0.2961863439141357
top5 = 0.3080612011874857
top6 = 0.3256451244576387
top7 = 0.34117378396894266
top8 = 0.36172642155743323
top9 = 0.37999543274720254
top10 = 0.40237497145467
top11 = 0.4272664992007308
top12 = 0.4507878511075588
top13 = 0.47978990637131763
top14 = 0.5122174012331583
top15 = 0.5432747202557662
top16 = 0.5754738524777346
top17 = 0.607444622059831
top18 = 0.6414706554007764
top19 = 0.6773235898606987
top20 = 0.7161452386389586
top21 = 0.7490294587805435
top22 = 0.7814569536423841
top23 = 0.8127426353048641
top24 = 0.8369490751313086
top25 = 0.857501712719799
top26 = 1.0


In [14]:
# Random baseline: one uniform draw from [0, 1) per test row. The names
# themselves are not used, only how many of them there are.
y_test_pred_random = [random.random() for _ in range(len(X_test['name']))]

In [15]:
print("Random")
# Evaluate the random scores on the test split, using the same
# three-single-column-frame layout as the lookup-table baseline so the
# reported numbers are directly comparable.
show_rank_metrics(
    df_group=pd.DataFrame({'group': test['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': y_test_pred_random}),
    df_true=pd.DataFrame({'correct': y_test.tolist()}),
)

Random
(218764, 3)

mean = 13.495483717613501

top0 = 0.0
top1 = 0.040646541478488236
top2 = 0.0801045875921084
top3 = 0.12170192536249108
top4 = 0.15818873306394104
top5 = 0.19954837176135012
top6 = 0.23639172807226053
top7 = 0.2765628714048015
top8 = 0.31138578559543617
top9 = 0.35001188495364866
top10 = 0.386260993582125
top11 = 0.4229855003565486
top12 = 0.4641074399809841
top13 = 0.49762300927026387
top14 = 0.533990967435227
top15 = 0.5730924649393867
top16 = 0.6142144045638221
top17 = 0.6505823627287853
top18 = 0.6880199667221298
top19 = 0.7233182790587117
top20 = 0.7650344663655811
top21 = 0.8043736629427145
top22 = 0.8413358687901117
top23 = 0.8798431186118374
top24 = 0.9225101022106014
top25 = 0.9626812455431424
top26 = 1.0
