In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import ARDRegression, RidgeClassifier
from sklearn.svm import LinearSVR
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from utils import show_rank_metrics
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Load the train / test / validation splits from CSV (first column is the index).
# The columns listed in NON_FEATURE_COLUMNS are excluded from the feature
# matrices; 'correct' is the prediction target and 'group' is kept on the
# original frames because the ranking metrics below read it from there.
RANDOM_STATE = 42
NON_FEATURE_COLUMNS = ['name', 'firstChar', 'group', 'correct']

# Shuffle the training rows with a fixed seed: later cells fit and evaluate on
# the first 10 ** 5 rows, so an unseeded shuffle would select a different
# subset (and produce different metrics) on every Restart & Run All.
train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=RANDOM_STATE)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLUMNS)

test = pd.read_csv('test.csv', index_col=0)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

valid = pd.read_csv('valid.csv', index_col=0)
y_valid = valid['correct']
X_valid = valid.drop(columns=NON_FEATURE_COLUMNS)

In [3]:
# Per-column totals of the train and test feature matrices.
s_tr, s_te = X_train.sum(axis=0), X_test.sum(axis=0)

In [4]:
# Median per-column total, expressed as a percentage of the number of rows
# in the corresponding split.
for split_name, col_sums, features in (('train', s_tr, X_train), ('test', s_te, X_test)):
    print(f'{split_name} median {col_sums.median() / features.shape[0] * 100} %')

train median 1.039040675887443 %
test median 0.9542356377799415 %


In [5]:
# Restrict fitting to the first 100,000 training rows (features and target
# are sliced identically so they stay aligned).
xs = X_train.iloc[:100_000]
ys = y_train.iloc[:100_000]

In [6]:
# Fit an ARD linear regressor on the training subset, allowing up to 1,000
# iterations. Its continuous predictions are used below as ranking scores.
clf = ARDRegression(n_iter=1_000).fit(
    X=xs,
    y=ys,
)

In [7]:
#clf.predict(xs)

In [8]:
#metrics.plot_roc_curve(clf, X_test, y_test)

In [9]:
#metrics.plot_roc_curve(clf, xs, ys)

In [10]:
# Ranking metrics of the ARD model on the same 10 ** 5 rows it was fitted on.
# Each argument is a fresh single-column frame with a default integer index,
# so group, score and label are aligned purely by position.
show_rank_metrics(
    df_group=pd.DataFrame({'group': train['group'].iloc[:10 ** 5].tolist()}),
    df_proba=pd.DataFrame({'proba': clf.predict(xs)}),
    df_true=pd.DataFrame({'correct': ys.tolist()}),
)

(100000, 3)

mean = 2.947962778932153

top0 = 0.0
top1 = 0.28195862318185927
top2 = 0.42759056825368147
top3 = 0.6190261089529316
top4 = 0.8109133616406179
top5 = 0.9323335441322612
top6 = 0.9836480260186106
top7 = 0.9967476736832596
top8 = 0.9998193152046255
top9 = 1.0


In [11]:
# Ranking metrics of the ARD model on the full test split.
# Each argument is a fresh single-column frame with a default integer index,
# so group, score and label are aligned purely by position.
show_rank_metrics(
    df_group=pd.DataFrame({'group': test['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': clf.predict(X_test)}),
    df_true=pd.DataFrame({'correct': y_test.tolist()}),
)

(236210, 3)

mean = 17.548816730875068

top0 = 0.0
top1 = 0.14980737479361586
top2 = 0.18425976884975234
top3 = 0.2042927903137039
top4 = 0.22168409466153
top5 = 0.2341221794166208
top6 = 0.24358833241607045
top7 = 0.2526141992294992
top8 = 0.26340121078701156
top9 = 0.2746285085305449
top10 = 0.2847550908090259
top11 = 0.2929003852504128
top12 = 0.3003852504127683
top13 = 0.30434782608695654
top14 = 0.30787011557512384
top15 = 0.31161254815630157
top16 = 0.3157952669235003
top17 = 0.3200880572372042
top18 = 0.33010456796918
top19 = 0.3477160154100165
top20 = 0.3767749036873968
top21 = 0.4133186571271326
top22 = 0.472757292239956
top23 = 0.554210236653825
top24 = 0.6736378646119978
top25 = 0.8165107319757843
top26 = 1.0


In [12]:
# Visual separator in the output between the ARD experiment above and the
# ridge-classifier experiment below.
print('##########################################################')

##########################################################


In [12]:
# RidgeClassifier has no predict_proba of its own; wrapping it in
# CalibratedClassifierCV provides the probability estimates used for ranking.
ridge_base = RidgeClassifier()
clf_ridge = CalibratedClassifierCV(ridge_base)
clf_ridge.fit(xs, ys)

CalibratedClassifierCV(base_estimator=RidgeClassifier())

In [13]:
# Ranking metrics of the calibrated ridge classifier on the test split.
# Column 1 of predict_proba is the probability of the second entry in
# clf_ridge.classes_ (presumably the positive 'correct' label - verify).
show_rank_metrics(
    df_group=pd.DataFrame({'group': test['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': clf_ridge.predict_proba(X_test)[:, 1].tolist()}),
    df_true=pd.DataFrame({'correct': y_test.tolist()}),
)

(236210, 3)

mean = 13.090809025866813

top0 = 0.0
top1 = 0.1717116125481563
top2 = 0.21177765547605945
top3 = 0.23929554210236653
top4 = 0.2572372041827188
top5 = 0.2774903687396808
top6 = 0.29862410566868464
top7 = 0.31964777105118325
top8 = 0.3403412217941662
top9 = 0.3649972482113374
top10 = 0.39020363236103467
top11 = 0.41563015960374244
top12 = 0.4434782608695652
top13 = 0.4745184369840396
top14 = 0.5079801871216291
top15 = 0.5404512933406714
top16 = 0.5771051183269125
top17 = 0.6165107319757842
top18 = 0.6595487066593285
top19 = 0.7025866813428728
top20 = 0.7476059438635113
top21 = 0.7910842047330765
top22 = 0.8309301045679692
top23 = 0.8728673637864612
top24 = 0.9093010456796918
top25 = 0.9482663731425427
top26 = 1.0


In [15]:
# Visual separator in the output between the ridge-classifier experiment
# above and the naive Bayes experiment below.
print('#########################################################')

#########################################################


In [2]:
# Load the pickled train / test / validation frames and split each one into
# a feature matrix and the 'correct' target. 'group' stays on the original
# frames for the ranking metrics.
# NOTE: this rebinds train/test/valid and the X_*/y_* names to different data
# than the CSV-based cells earlier in the notebook.
# NOTE(security): pd.read_pickle can execute arbitrary code while loading;
# only open pickle files that were produced by a trusted source.
NON_FEATURE_PICKLE_COLUMNS = ['group', 'correct']

train = pd.read_pickle('df_train_intr_s')
X_train, y_train = train.drop(columns=NON_FEATURE_PICKLE_COLUMNS), train['correct']

test = pd.read_pickle('df_test_intr_s')
X_test, y_test = test.drop(columns=NON_FEATURE_PICKLE_COLUMNS), test['correct']

valid = pd.read_pickle('df_valid_intr_s')
X_valid, y_valid = valid.drop(columns=NON_FEATURE_PICKLE_COLUMNS), valid['correct']

In [6]:
# First 100,000 rows of the pickled training split (features and target
# sliced identically so they stay aligned).
# NOTE(review): the naive Bayes cell below fits on the full X_train / y_train,
# so this subset appears unused in the cells that follow - confirm.
xs = X_train.iloc[:100_000]
ys = y_train.iloc[:100_000]

In [3]:
# Fit a multinomial naive Bayes classifier on the full training split.
# NOTE(review): despite the "bern" in its name, this is MultinomialNB, not
# BernoulliNB; the name is kept because later cells reference it.
clf_bern = MultinomialNB().fit(
    X=X_train,
    y=y_train,
)

In [4]:
# Ranking metrics of the naive Bayes model on the test split, then on the
# validation split. Column 1 of predict_proba is the probability of the
# second entry in clf_bern.classes_ (presumably the positive label - verify).
for split_label, split_frame, split_features, split_target in (
    ('test', test, X_test, y_test),
    ('valid', valid, X_valid, y_valid),
):
    show_rank_metrics(
        df_group=pd.DataFrame({'group': split_frame['group'].tolist()}),
        df_proba=pd.DataFrame({'proba': clf_bern.predict_proba(split_features)[:, 1].tolist()}),
        df_true=pd.DataFrame({'correct': split_target.tolist()}),
        label=split_label,
    )

test
(227344, 3)

mean = 6.1282021957914

top0 = 0.0
top1 = 0.38311985361390666
top2 = 0.4782708142726441
top3 = 0.5385407136322049
top4 = 0.5870311070448307
top5 = 0.6248856358645929
top6 = 0.6579368709972553
top7 = 0.6852698993595608
top8 = 0.7089432753888381
top9 = 0.7333028362305581
top10 = 0.7596065873741995
top11 = 0.7855672461116194
top12 = 0.8084400731930467
top13 = 0.8309698078682525
top14 = 0.8532708142726441
top15 = 0.8777447392497713
top16 = 0.8955855443732845
top17 = 0.9115965233302836
top18 = 0.9298947849954254
top19 = 0.9469350411710887
top20 = 0.9586001829826166
top21 = 0.9693504117108874
top22 = 0.9776989935956084
top23 = 0.9852470265324794
top24 = 0.9898215919487648
top25 = 0.994167429094236
top26 = 1.0
valid
(121316, 3)

mean = 6.345906558079726

top0 = 0.0
top1 = 0.356408058294042
top2 = 0.454564937848264
top3 = 0.5124303471924561
top4 = 0.5593656236605229
top5 = 0.5968709815687956
top6 = 0.6343763394770682
top7 = 0.6660951564509215
top8 = 0.6948135447921132
top9 = 