In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from utils import show_rank_metrics

In [2]:
# Columns removed from both splits to form the feature matrices: 'correct' is
# used as the target below and 'group' is later passed to show_rank_metrics.
NON_FEATURE_COLS = ['name', 'firstChar', 'group', 'correct']

# Fixed seed for the row shuffles. Without it every kernel restart produces a
# different row order, so the "first 40 000 rows" training subset taken later
# (and every metric derived from it) would not be reproducible.
SHUFFLE_SEED = 0

# Training split: the first CSV column is the index; rows are shuffled.
train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLS)

# Test split, loaded and prepared exactly like the training split.
test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLS)

In [3]:
# Column-wise totals of every feature, computed separately for each split.
s_tr, s_te = (features.sum() for features in (X_train, X_test))

In [4]:
# Median of the per-column sums, expressed as a percentage of the number of
# rows in the corresponding split.
train_median_pct = s_tr.median() / len(X_train) * 100
test_median_pct = s_te.median() / len(X_test) * 100

print(f'train median {train_median_pct} %')
print(f'test median {test_median_pct} %')

train median 1.01838656341026 %
test median 0.9055420453091002 %


In [5]:
# Fit on the leading 40 000 rows of the (already shuffled) training data only.
xs = X_train.head(40_000)
ys = y_train.head(40_000)

In [6]:
# Logistic regression fitted with the SAGA solver on the training subset.
# C is scikit-learn's inverse regularisation strength, so 1e-3 regularises
# heavily; no penalty is given, so the library default applies.
clf = LogisticRegression(
    C=1e-3,
    solver='saga',
    max_iter=1000,
    n_jobs=5,
    random_state=0,
).fit(xs, ys)

In [7]:
#clf.predict_proba(xs)

In [8]:
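#metrics.plot_roc_curve(clf, X_test, y_test)  # NOTE: plot_roc_curve was removed in scikit-learn 1.2; use metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)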

In [9]:
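#metrics.plot_roc_curve(clf, xs, ys)  # NOTE: plot_roc_curve was removed in scikit-learn 1.2; use metrics.RocCurveDisplay.from_estimator(clf, xs, ys)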

In [10]:
# Rank metrics on the rows the model was fitted on. The group labels are cut
# to the same leading 40 000 rows as xs / ys, and all three frames are rebuilt
# from plain values so they share a fresh 0..n-1 index.
# NOTE(review): train was shuffled row-wise before this truncation, so the
# groups seen here may be incomplete — confirm these numbers are comparable
# with the metrics computed on the full test set.
fit_groups = pd.DataFrame(train['group'].iloc[:4 * 10 ** 4].tolist(), columns=['group'])
# Column 1 of predict_proba — presumably the positive ('correct') class; verify
# against clf.classes_.
fit_scores = pd.DataFrame(clf.predict_proba(xs)[:, 1], columns=['proba'])
fit_labels = pd.DataFrame(ys.tolist(), columns=['correct'])

show_rank_metrics(df_group=fit_groups, df_proba=fit_scores, df_true=fit_labels)

(40000, 3)

mean = 1.5373482726423904

top0 = 0.0
top1 = 0.6157796451914099
top2 = 0.8744164332399627
top3 = 0.976657329598506
top4 = 0.9964985994397759
top5 = 0.9992997198879552
top6 = 1.0


In [11]:
# Rank metrics of the current clf on the full test split. Each input is
# rebuilt from plain values so the three frames share a fresh 0..n-1 index.
test_groups = pd.DataFrame(test['group'].tolist(), columns=['group'])
# Column 1 of predict_proba — presumably the positive ('correct') class; verify
# against clf.classes_.
test_scores = pd.DataFrame(clf.predict_proba(X_test)[:, 1], columns=['proba'])
test_labels = pd.DataFrame(y_test.tolist(), columns=['correct'])

show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

(218764, 3)

mean = 13.173282624197766

top0 = 0.0
top1 = 0.1649631566436891
top2 = 0.2037081055383884
top3 = 0.23080579985738056
top4 = 0.24815783218445447
top5 = 0.268837651533159
top6 = 0.28892322319942954
top7 = 0.31281198003327787
top8 = 0.33705728547658664
top9 = 0.36344188257665794
top10 = 0.3845971000713097
top11 = 0.4121701925362491
top12 = 0.43807939149037317
top13 = 0.4676729260755883
top14 = 0.5034466365581174
top15 = 0.5424292845257903
top16 = 0.5842643213691466
top17 = 0.6284763489422391
top18 = 0.6697171381031614
top19 = 0.7054908485856906
top20 = 0.7481578321844545
top21 = 0.7901117185642976
top22 = 0.8261231281198004
top23 = 0.8633230330401711
top24 = 0.90171143332541
top25 = 0.9422391252674115
top26 = 1.0


In [12]:
# Second model: weaker regularisation (C=1e-1) and no explicit solver, so
# scikit-learn's default solver is used (the next cell labels it "lbfgs").
# Rebinds clf, replacing the SAGA model fitted earlier.
clf = LogisticRegression(
    C=1e-1,
    max_iter=1000,
    n_jobs=4,
    random_state=0,
).fit(xs, ys)

In [13]:
# Label the output so this run can be told apart from the other models.
# NOTE(review): the label says "lin reg" although clf is a LogisticRegression;
# the text is kept unchanged to match the recorded output.
print("lin reg lbfgs\n")

# Rank metrics of the current clf on the full test split.
test_groups = pd.DataFrame(test['group'].tolist(), columns=['group'])
# Column 1 of predict_proba — presumably the positive ('correct') class; verify
# against clf.classes_.
test_scores = pd.DataFrame(clf.predict_proba(X_test)[:, 1], columns=['proba'])
test_labels = pd.DataFrame(y_test.tolist(), columns=['correct'])

show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

lin reg lbfgs

(218764, 3)

mean = 13.225814119324935

top0 = 0.0
top1 = 0.1743522700261469
top2 = 0.213216068457333
top3 = 0.23829332065604944
top4 = 0.25802234371285954
top5 = 0.277513667696696
top6 = 0.29676729260755885
top7 = 0.315188970763014
top8 = 0.3376515331590207
top9 = 0.3594009983361065
top10 = 0.38233895887806035
top11 = 0.40860470644164487
top12 = 0.43903018778226766
top13 = 0.4688614214404564
top14 = 0.5009507962918944
top15 = 0.5388637984311861
top16 = 0.5739244116947944
top17 = 0.6158782980746375
top18 = 0.6560494414071785
top19 = 0.6947943903018778
top20 = 0.7334204896600903
top21 = 0.7753743760399334
top22 = 0.8144758735440932
top23 = 0.8563109103874494
top24 = 0.9011171856429759
top25 = 0.9437841692417399
top26 = 1.0


In [14]:
# Third model: L1-penalised logistic regression. The SAGA solver is requested
# explicitly alongside penalty='l1'. Rebinds clf once more.
clf = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1e-1,
    max_iter=1000,
    n_jobs=4,
    random_state=0,
).fit(xs, ys)

# Rank metrics of the L1 model on the full test split.
test_groups = pd.DataFrame(test['group'].tolist(), columns=['group'])
# Column 1 of predict_proba — presumably the positive ('correct') class; verify
# against clf.classes_.
test_scores = pd.DataFrame(clf.predict_proba(X_test)[:, 1], columns=['proba'])
test_labels = pd.DataFrame(y_test.tolist(), columns=['correct'])

show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

(218764, 3)

mean = 16.35512241502258

top0 = 0.0
top1 = 0.14761112431661516
top2 = 0.16936058949370097
top3 = 0.18302828618968386
top4 = 0.19063465652483955
top5 = 0.1945566912289042
top6 = 0.19931067268837652
top7 = 0.20335155692892798
top8 = 0.20881863560732114
top9 = 0.2144045638222011
top10 = 0.22165438554789638
top11 = 0.23080579985738056
top12 = 0.24494889469931067
top13 = 0.26337057285476584
top14 = 0.29153791300213927
top15 = 0.3277870216306156
top16 = 0.37722842880912766
top17 = 0.425837889232232
top18 = 0.48336106489184694
top19 = 0.5399334442595674
top20 = 0.59828856667459
top21 = 0.6608034228666508
top22 = 0.7224863323033041
top23 = 0.7869027810791538
top24 = 0.846684097932018
top25 = 0.9121701925362491
top26 = 1.0
