In [1]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from utils import show_rank_metrics

In [2]:
# Seed for the row shuffles below. Without it every kernel restart produced a
# different row order, so the leading-rows training subset taken later in the
# notebook (and every metric computed from it) changed from run to run.
SHUFFLE_SEED = 0

# Columns removed to form the feature matrices: 'correct' is the target, and
# 'group' is passed separately to show_rank_metrics later in the notebook.
NON_FEATURE_COLUMNS = ['name', 'firstChar', 'group', 'correct']

# Load each split, shuffle its rows deterministically, then separate the
# target column from the feature columns.
train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLUMNS)

test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

In [3]:
# Column-wise totals of each feature matrix (axis=0 is the DataFrame.sum default).
s_tr, s_te = X_train.sum(axis=0), X_test.sum(axis=0)

In [4]:
# For each split: median of the per-column sums, as a percentage of that
# split's row count.
for label, col_sums, frame in (('train', s_tr, X_train), ('test', s_te, X_test)):
    pct = col_sums.median() / frame.shape[0] * 100
    print(f'{label} median {pct} %')

train median 1.01838656341026 %
test median 0.9055420453091002 %


In [5]:
# Training subset: the first 40 000 rows of the (already shuffled) training
# features and the matching target values.
xs = X_train.head(40_000)
ys = y_train.head(40_000)

In [6]:
# First model: logistic regression with the SAGA solver and C=1e-3 (penalty
# left at the scikit-learn default), fitted on the xs / ys subset.
estimator = LogisticRegression(
    solver='saga',
    C=1e-3,
    max_iter=10 ** 3,
    random_state=0,
    n_jobs=5,
)
clf = estimator.fit(xs, ys)  # fit() returns the estimator itself

In [7]:
#clf.predict_proba(xs)

In [8]:
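#metrics.plot_roc_curve(clf, X_test, y_test)
# NOTE: plot_roc_curve was removed in scikit-learn 1.2; the replacement is
# metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)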

In [9]:
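#metrics.plot_roc_curve(clf, xs, ys)
# NOTE: plot_roc_curve was removed in scikit-learn 1.2; the replacement is
# metrics.RocCurveDisplay.from_estimator(clf, xs, ys)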

In [10]:
# Rank metrics on the rows the current `clf` was fitted on (xs / ys).
# The group column is sliced to len(xs) instead of repeating the subset-size
# literal, so it always has exactly as many rows as the probabilities
# computed from xs even if the subset size is changed where xs is defined.
n_fit = len(xs)
show_rank_metrics(
    df_group=pd.DataFrame(data=train['group'].iloc[:n_fit].tolist(), columns=['group']),
    # Column 1 of predict_proba; presumably the positive ('correct') class —
    # confirm against clf.classes_.
    df_proba=pd.DataFrame(data=clf.predict_proba(xs)[:, 1], columns=['proba']),
    df_true=pd.DataFrame(data=ys.tolist(), columns=['correct'])
)

(40000, 3)

mean = 1.5487452809238285

top0 = 0.0
top1 = 0.6087053075727293
top2 = 0.8758605374194981
top3 = 0.9706862091938707
top4 = 0.9964468132356207
top5 = 0.9995558516544526
top6 = 1.0


In [11]:
# Rank metrics for the current `clf` on the full test split. The three
# single-column frames are built first, in the same order as the original
# keyword arguments, and then handed to the helper.
test_groups = pd.DataFrame(data=test['group'].tolist(), columns=['group'])
test_scores = pd.DataFrame(data=clf.predict_proba(X_test)[:, 1], columns=['proba'])
test_labels = pd.DataFrame(data=y_test.tolist(), columns=['correct'])
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

(218764, 3)

mean = 13.249346327549322

top0 = 0.0
top1 = 0.15533634418825767
top2 = 0.19681483242215356
top3 = 0.2265272165438555
top4 = 0.24958402662229617
top5 = 0.2694318992155931
top6 = 0.29248870929403376
top7 = 0.31269313049679104
top8 = 0.3352745424292845
top9 = 0.35880675065367246
top10 = 0.3813881625861659
top11 = 0.4089612550511053
top12 = 0.4372474447349655
top13 = 0.46707867839315426
top14 = 0.5034466365581174
top15 = 0.537675303066318
top16 = 0.5795103399096744
top17 = 0.6169479439030188
top18 = 0.6568813881625861
top19 = 0.697171381031614
top20 = 0.7407891609222724
top21 = 0.7808414547183266
top22 = 0.8237461373900642
top23 = 0.8658188733063941
top24 = 0.9090801045875921
top25 = 0.9471119562633705
top26 = 1.0


In [12]:
# Second model: C=1e-1 with no explicit solver or penalty (scikit-learn
# defaults apply), fitted on the same xs / ys subset. Rebinds `clf`.
estimator = LogisticRegression(
    C=1e-1,
    max_iter=10 ** 3,
    random_state=0,
    n_jobs=4,
)
clf = estimator.fit(xs, ys)  # fit() returns the estimator itself

In [13]:
# Header for the output below. NOTE(review): the label says "lin reg", but
# the estimator bound to `clf` here is a LogisticRegression; the text is kept
# unchanged so the recorded output still matches.
print("lin reg lbfgs\n")

# Rank metrics for the current `clf` on the full test split.
test_groups = pd.DataFrame(data=test['group'].tolist(), columns=['group'])
test_scores = pd.DataFrame(data=clf.predict_proba(X_test)[:, 1], columns=['proba'])
test_labels = pd.DataFrame(data=y_test.tolist(), columns=['correct'])
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

lin reg lbfgs

(218764, 3)

mean = 13.280484906108866

top0 = 0.0
top1 = 0.15295935345852152
top2 = 0.19776562871404801
top3 = 0.23223199429522226
top4 = 0.2598050867601616
top5 = 0.28309959591157596
top6 = 0.30425481340622773
top7 = 0.32315188970763015
top8 = 0.3409793201806513
top9 = 0.36189683860232946
top10 = 0.38008081768481106
top11 = 0.40099833610648916
top12 = 0.4277394818160209
top13 = 0.4582838126931305
top14 = 0.49417637271214643
top15 = 0.532089374851438
top16 = 0.5720228191110055
top17 = 0.6086284763489422
top18 = 0.6474922747801284
top19 = 0.691585452816734
top20 = 0.7325885429046827
top21 = 0.7775136676966959
top22 = 0.8219633943427621
top23 = 0.8639172807226052
top24 = 0.906108866175422
top25 = 0.9481816020917518
top26 = 1.0


In [14]:
# Third model: L1-penalised logistic regression (SAGA solver, C=1e-1) fitted
# on the xs / ys subset. Rebinds `clf` once more.
estimator = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1e-1,
    max_iter=10 ** 3,
    random_state=0,
    n_jobs=4,
)
clf = estimator.fit(xs, ys)  # fit() returns the estimator itself

# Rank metrics for this model on the full test split.
test_groups = pd.DataFrame(data=test['group'].tolist(), columns=['group'])
test_scores = pd.DataFrame(data=clf.predict_proba(X_test)[:, 1], columns=['proba'])
test_labels = pd.DataFrame(data=y_test.tolist(), columns=['correct'])
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

(218764, 3)

mean = 16.351794628000953

top0 = 0.0
top1 = 0.13477537437603992
top2 = 0.1587829807463751
top3 = 0.17185642975992393
top4 = 0.1804135963869741
top5 = 0.18599952460185407
top6 = 0.191585452816734
top7 = 0.19657713334917995
top8 = 0.20335155692892798
top9 = 0.2096505823627288
top10 = 0.21761350130734491
top11 = 0.22593296886142145
top12 = 0.23971951509389114
top13 = 0.2590919895412408
top14 = 0.2859519847872593
top15 = 0.3197052531495127
top16 = 0.364036130259092
top17 = 0.415973377703827
top18 = 0.4770620394580461
top19 = 0.5377941526028048
top20 = 0.6096981221773236
top21 = 0.6843356310910388
top22 = 0.7551699548371761
top23 = 0.8210125980508676
top24 = 0.8773472783456144
top25 = 0.9247682434038508
top26 = 1.0


In [15]:
# Persist the estimator currently bound to `clf` (joblib's `dump` is imported
# with the other imports at the top of the notebook).
# In top-to-bottom order `clf` is the last model fitted above: the L1 / SAGA
# LogisticRegression, not either of the earlier fits.
# NOTE(review): in the recorded outputs that model has the lowest test top1
# of the three runs (0.135 vs 0.155 / 0.153) — confirm it is the one meant to
# be saved. The file name says "linreg" although the model is a logistic
# regression; it is left unchanged because other code may load this path.
dump(clf, 'clf_linreg.joblib')
# Author's note on the result: 17 kb (presumably the size of the dumped file).

['clf_linreg.joblib']