In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV


In [2]:
# Fixed seed for the row shuffle: later cells fit on the first rows of the shuffled
# training frame, so an unseeded shuffle would change the fitted model on every run.
SHUFFLE_SEED = 0
# Columns removed from the feature matrices ('correct' is kept separately as the target).
NON_FEATURE_COLUMNS = ['name', 'firstChar', 'group', 'correct']

train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLUMNS)

test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

In [3]:
def show_rank_metrics(df_group: pd.DataFrame, df_proba: pd.DataFrame, df_true: pd.DataFrame):
    """Print ranking metrics for the rows labelled ``correct == 1``.

    The three frames are joined column-wise (``pd.concat`` aligns them on their
    index) and sorted by ``group``, then by descending ``proba``; ties in
    ``proba`` are broken by ascending ``correct``, so a correct row that ties
    with an incorrect one receives the worse rank. Every row with
    ``correct == 1`` contributes its 1-based position inside its group.

    Printed output, in order:
      * the shape of the joined frame,
      * the mean position of the correct rows,
      * ``top{i}`` for ``i = 0 .. max(8, worst position)``: the fraction of
        correct rows whose position is ``<= i`` (``top0`` is therefore always 0).

    Args:
        df_group: frame with a ``group`` column identifying each row's group.
        df_proba: frame with a ``proba`` column holding the ranking score.
        df_true: frame with a ``correct`` column; rows equal to 1 are ranked.

    Returns:
        None. If no row has ``correct == 1`` a short notice is printed instead
        of the metrics.
    """
    df_metric = pd.concat([df_group, df_proba, df_true], axis=1)
    df_metric = df_metric.sort_values(by=['group', 'proba', 'correct'], ascending=[True, False, True])
    print(df_metric.shape)

    positions = []
    # A private sentinel (rather than a numeric placeholder such as -1) so that no
    # real group id can be mistaken for "no group seen yet".
    no_group = object()
    cur_group = no_group
    cur_pos = 0
    for row in df_metric.itertuples():
        if cur_group is no_group or row.group != cur_group:
            # First row of a new group restarts the rank counter.
            cur_pos = 1
            cur_group = row.group
        else:
            cur_pos += 1
        if row.correct == 1:
            positions.append(cur_pos)

    if not positions:
        # Mean and cumulative fractions are undefined without any correct row.
        print('\nno rows with correct == 1, rank metrics are undefined\n')
        return

    print(f'\nmean = {np.mean(positions)}\n')

    # Keep at least the historical 9 buckets (top0..top8) and grow when a correct
    # row is ranked deeper than 8th, instead of failing with an IndexError.
    count = [0] * max(9, max(positions) + 1)
    for p in positions:
        count[p] += 1
    acc = 0
    sum_all = sum(count)
    for i, c in enumerate(count):
        acc += c
        print(f'top{i} = {acc / sum_all}')

In [4]:
# Column-wise sums of the train and test feature matrices.
s_tr, s_te = X_train.sum(), X_test.sum()

In [5]:
# Median column sum expressed as a percentage of the number of rows in each set.
train_median_pct = s_tr.median() / X_train.shape[0] * 100
test_median_pct = s_te.median() / X_test.shape[0] * 100
print(f'train median {train_median_pct} %')
print(f'test median {test_median_pct} %')

train median 1.029810298102981 %
test median 1.0040160642570282 %


In [6]:
# Fit on the first 40,000 rows of the (already shuffled) training data only.
fit_rows = slice(None, 40_000)
xs = X_train.iloc[fit_rows, :]
ys = y_train.iloc[fit_rows]

In [7]:
# Logistic regression with the saga solver and C=1e-3, fitted on the training subset.
clf = LogisticRegression(
    C=1e-3,
    solver='saga',
    max_iter=1000,
    random_state=0,
    n_jobs=4,
)
clf.fit(xs, ys)

In [8]:
#clf.predict_proba(xs)

In [9]:
#metrics.plot_roc_curve(clf, X_test, y_test)

In [10]:
#metrics.plot_roc_curve(clf, xs, ys)

In [11]:
# Rank metrics on the rows the model was fitted on. Fresh frames are built from
# plain lists/arrays so all three share the same 0..n-1 index when concatenated.
train_groups = pd.DataFrame({'group': train['group'].iloc[:4 * 10 ** 4].tolist()})
train_scores = pd.DataFrame({'proba': clf.predict_proba(xs)[:, 1]})  # second predict_proba column
train_labels = pd.DataFrame({'correct': ys.tolist()})
show_rank_metrics(df_group=train_groups, df_proba=train_scores, df_true=train_labels)

(40000, 3)

mean = 1.5411908646003263

top0 = 0.0
top1 = 0.6194942903752039
top2 = 0.8696982055464927
top3 = 0.9724714518760196
top4 = 0.9971451876019576
top5 = 1.0
top6 = 1.0
top7 = 1.0
top8 = 1.0


In [12]:
# Rank metrics on the full test set. Fresh frames are built from plain
# lists/arrays so all three share the same 0..n-1 index when concatenated.
test_groups = pd.DataFrame({'group': test['group'].tolist()})
test_scores = pd.DataFrame({'proba': clf.predict_proba(X_test)[:, 1]})  # second predict_proba column
test_labels = pd.DataFrame({'correct': y_test.tolist()})
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

(107568, 3)

mean = 4.395656700877584

top0 = 0.0
top1 = 0.24029451137884872
top2 = 0.3179384203480589
top3 = 0.3946154990331697
top4 = 0.4865387475829243
top5 = 0.596236799048044
top6 = 0.7187267588874015
top7 = 0.8499925628439685
top8 = 1.0


In [13]:
# Refit with weaker regularisation (C=1e-1) and no explicit solver argument.
clf = LogisticRegression(
    C=1e-1,
    max_iter=1000,
    random_state=0,
    n_jobs=4,
)
clf.fit(xs, ys)

In [14]:
# Labelled test-set evaluation of the current `clf`.
# NOTE(review): the label reads "lin reg" although the estimator is LogisticRegression.
print("lin reg lbfgs\n")
test_groups = pd.DataFrame({'group': test['group'].tolist()})
test_scores = pd.DataFrame({'proba': clf.predict_proba(X_test)[:, 1]})  # second predict_proba column
test_labels = pd.DataFrame({'correct': y_test.tolist()})
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

lin reg lbfgs

(107568, 3)

mean = 4.40450691655511

top0 = 0.0
top1 = 0.24133571322326342
top2 = 0.32165699836382566
top3 = 0.39736724676483715
top4 = 0.48750557786702364
top5 = 0.5933363081957459
top6 = 0.7104715156923992
top7 = 0.8438197233377956
top8 = 1.0


In [15]:
# L1-penalised logistic regression (saga solver, C=1e-1), then test-set rank metrics.
clf = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1e-1,
    max_iter=1000,
    random_state=0,
    n_jobs=4,
)
clf.fit(xs, ys)

test_groups = pd.DataFrame({'group': test['group'].tolist()})
test_scores = pd.DataFrame({'proba': clf.predict_proba(X_test)[:, 1]})  # second predict_proba column
test_labels = pd.DataFrame({'correct': y_test.tolist()})
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_labels)

(107568, 3)

mean = 5.414398334077049

top0 = 0.0
top1 = 0.20422430462591104
top2 = 0.24564926372155288
top3 = 0.2788933511825078
top4 = 0.3158560166592295
top5 = 0.3716346868957311
top6 = 0.4811096236799048
top7 = 0.6882344191581139
top8 = 1.0
