In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import ARDRegression, RidgeClassifier
from sklearn.svm import LinearSVR
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from utils import show_rank_metrics


In [2]:
# Load the train/test CSVs, shuffle the rows, and split target from features.
#
# The shuffle is seeded: later cells train on head-slices of `train`, so an
# unseeded `.sample(frac=1)` would pick a different training subset (and give
# different metrics) on every Restart & Run All.
SEED = 42

# Columns removed from the feature matrix; this includes the `group` key that
# is passed to `show_rank_metrics` and the `correct` target.
NON_FEATURE_COLS = ['name', 'firstChar', 'group', 'correct']

train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLS)

test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLS)

In [3]:
# Per-column totals of the train and test feature matrices
# (one value per feature column).
s_tr, s_te = X_train.sum(), X_test.sum()

In [4]:
# Median of the per-column totals divided by the row count of each split,
# printed as a percentage.
train_median_share = s_tr.median() / len(X_train)
test_median_share = s_te.median() / len(X_test)

print(f'train median {train_median_share * 100} %')
print(f'test median {test_median_share * 100} %')

train median 1.01838656341026 %
test median 0.9055420453091002 %


In [5]:
# Training subset: the first 100 000 rows of the (already shuffled) training
# data, features and target sliced identically so they stay aligned.
xs, ys = X_train.iloc[:100_000], y_train.iloc[:100_000]

In [6]:
# Fit an ARD regression on the training subset. Despite the name, `clf` is a
# regressor: its continuous predictions are used further down as the `proba`
# ranking score.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
ard = ARDRegression(n_iter=1000)
clf = ard.fit(xs, ys)

In [7]:
# Quick look at the regressor's raw predictions on the training subset `xs`
# (displayed as the cell's output).
clf.predict(xs)

array([0.10837171, 0.10837171, 0.11683745, ..., 0.10837171, 0.10837171,
       0.10837171])

In [8]:
#metrics.plot_roc_curve(clf, X_test, y_test)

In [9]:
#metrics.plot_roc_curve(clf, xs, ys)

In [10]:
# Rank metrics of the ARD model on the 100 000-row training subset.
# The group slice must match the slice used to build `xs` / `ys`.
# NOTE(review): rows were shuffled individually at load time, so this head
# slice presumably holds only fragments of each group; the numbers here are
# then not directly comparable with the full test-set metrics. Confirm.
train_groups = train['group'].iloc[:10 ** 5].tolist()
train_scores = clf.predict(xs)
train_labels = ys.tolist()

show_rank_metrics(
    df_group=pd.DataFrame(data=train_groups, columns=['group']),
    df_proba=pd.DataFrame(data=train_scores, columns=['proba']),
    df_true=pd.DataFrame(data=train_labels, columns=['correct']),
)

(100000, 3)

mean = 2.9508778152154638

top0 = 0.0
top1 = 0.2619258733818053
top2 = 0.41948927114736656
top3 = 0.6228941301649229
top4 = 0.8203582195424721
top5 = 0.9404149671927646
top6 = 0.9863450966483419
top7 = 0.9976946267068629
top8 = 1.0


In [11]:
# Rank metrics of the ARD model on the full test set; the regressor's
# predictions serve as the ranking score.
test_groups = test['group'].tolist()
ard_test_scores = clf.predict(X_test)
test_labels = y_test.tolist()

show_rank_metrics(
    df_group=pd.DataFrame(data=test_groups, columns=['group']),
    df_proba=pd.DataFrame(data=ard_test_scores, columns=['proba']),
    df_true=pd.DataFrame(data=test_labels, columns=['correct']),
)

(218764, 3)

mean = 18.162110767768006

top0 = 0.0
top1 = 0.1389351081530782
top2 = 0.16627050154504397
top3 = 0.18552412645590682
top4 = 0.20073686712621822
top5 = 0.21190872355597812
top6 = 0.21951509389113383
top7 = 0.224982172569527
top8 = 0.22926075588305206
top9 = 0.23330164012360352
top10 = 0.23698597575469454
top11 = 0.23948181602091753
top12 = 0.2426907535060613
top13 = 0.24578084145471832
top14 = 0.24887092940337532
top15 = 0.25386260993582127
top16 = 0.25968623722367484
top17 = 0.2696695982885667
top18 = 0.28512003803185165
top19 = 0.3117423342048966
top20 = 0.35167577846446396
top21 = 0.4108628476348942
top22 = 0.49346327549322555
top23 = 0.5967435227002614
top24 = 0.7224863323033041
top25 = 0.8583313525077252
top26 = 1.0


In [12]:
# Visual separator in the output between the ARD experiment above and the
# ridge-classifier experiment below.
print('##########################################################')

##########################################################


In [13]:
# Ridge classifier wrapped in CalibratedClassifierCV so that `predict_proba`
# is available for ranking. Fitted on the current `xs` / `ys` subset.
ridge = RidgeClassifier()
clf_ridge = CalibratedClassifierCV(ridge)
clf_ridge.fit(xs, ys)

CalibratedClassifierCV(base_estimator=RidgeClassifier())

In [14]:
# Rank metrics of the calibrated ridge classifier on the full test set.
# Column 1 of `predict_proba` is used as the ranking score.
test_groups = test['group'].tolist()
ridge_test_scores = clf_ridge.predict_proba(X_test)[:, 1].tolist()
test_labels = y_test.tolist()

show_rank_metrics(
    df_group=pd.DataFrame(data=test_groups, columns=['group']),
    df_proba=pd.DataFrame(data=ridge_test_scores, columns=['proba']),
    df_true=pd.DataFrame(data=test_labels, columns=['correct']),
)

(218764, 3)

mean = 13.469693368195864

top0 = 0.0
top1 = 0.17149988115046352
top2 = 0.20929403375326835
top3 = 0.23520323270739243
top4 = 0.2516044687425719
top5 = 0.2700261468980271
top6 = 0.28725932968861423
top7 = 0.30425481340622773
top8 = 0.32505348229141906
top9 = 0.34359400998336104
top10 = 0.3655811742334205
top11 = 0.3920846208699786
top12 = 0.42464939386736394
top13 = 0.45947230805799855
top14 = 0.49275017827430473
top15 = 0.5276919420014262
top16 = 0.5626337057285477
top17 = 0.6045875921083907
top18 = 0.642738293320656
top19 = 0.6824340385072498
top20 = 0.7210601378654623
top21 = 0.7662229617304492
top22 = 0.8069883527454242
top23 = 0.8515569289279772
top24 = 0.8938673639172807
top25 = 0.93819824102686
top26 = 1.0


In [15]:
# Visual separator in the output between the ridge-classifier experiment above
# and the linear-SVM experiment below.
print('#########################################################')

#########################################################


In [16]:
# Smaller training subset (first 30 000 shuffled rows) for the SVM experiment.
# NOTE: this rebinds `xs` / `ys`, which earlier cells created with 100 000
# rows. Re-running those earlier cells after this one would silently use the
# 30 000-row subset instead.
xs, ys = X_train.iloc[:30_000], y_train.iloc[:30_000]

In [17]:
# Linear SVM with a raised iteration cap, wrapped in CalibratedClassifierCV so
# that `predict_proba` is available for ranking. Fitted on the current
# `xs` / `ys` subset.
svm = LinearSVC(max_iter=10_000)
clf_svm = CalibratedClassifierCV(svm)
clf_svm.fit(xs, ys)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=10000))

In [18]:
# Rank metrics of the calibrated SVM on its own 30 000-row training subset.
# The group slice must match the slice used to build `xs` / `ys`.
svm_train_groups = train['group'].iloc[:3 * 10 ** 4].tolist()
svm_train_scores = clf_svm.predict_proba(xs)[:, 1].tolist()
svm_train_labels = ys.tolist()

show_rank_metrics(
    df_group=pd.DataFrame(data=svm_train_groups, columns=['group']),
    df_proba=pd.DataFrame(data=svm_train_scores, columns=['proba']),
    df_true=pd.DataFrame(data=svm_train_labels, columns=['correct']),
)

(30000, 3)

mean = 1.3836347465164542

top0 = 0.0
top1 = 0.6952268010672992
top2 = 0.9335902757189446
top3 = 0.9884375926474949
top4 = 0.9991105840498073
top5 = 1.0


In [19]:
# Rank metrics of the calibrated SVM on the full test set.
# Column 1 of `predict_proba` is used as the ranking score.
test_groups = test['group'].tolist()
svm_test_scores = clf_svm.predict_proba(X_test)[:, 1].tolist()
test_labels = y_test.tolist()

show_rank_metrics(
    df_group=pd.DataFrame(data=test_groups, columns=['group']),
    df_proba=pd.DataFrame(data=svm_test_scores, columns=['proba']),
    df_true=pd.DataFrame(data=test_labels, columns=['correct']),
)

(218764, 3)

mean = 13.358687901117186

top0 = 0.0
top1 = 0.15379130021392917
top2 = 0.20168766341811267
top3 = 0.23864986926550985
top4 = 0.2653910149750416
top5 = 0.2823864986926551
top6 = 0.29985738055621586
top7 = 0.3194675540765391
top8 = 0.33895887806037556
top9 = 0.36047064416448776
top10 = 0.38019966722129783
top11 = 0.4032564772997385
top12 = 0.4308295697646779
top13 = 0.45792726408367007
top14 = 0.488233895887806
top15 = 0.5230568100784407
top16 = 0.5591870691704303
top17 = 0.5993582125029713
top18 = 0.6414309484193012
top19 = 0.6845733301640123
top20 = 0.7283099595911576
top21 = 0.7722842880912765
top22 = 0.8147135726170668
top23 = 0.857261706679344
top24 = 0.8981459472308058
top25 = 0.941882576657951
top26 = 1.0
