In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import ARDRegression, RidgeClassifier
from sklearn.svm import LinearSVR



In [2]:
# Fixed seed for the row shuffles below, so that the shuffled order (and any
# later positional subset such as `iloc[:n]`) is the same on every
# Restart & Run All.
SEED = 42

# Columns removed from the feature matrices. 'correct' is the target; 'group'
# is read again further down for per-group ranking. 'name' and 'firstChar'
# are presumably identifiers/metadata -- TODO confirm against the CSV schema.
NON_FEATURE_COLUMNS = ['name', 'firstChar', 'group', 'correct']

# frac=1 returns every row in shuffled order (a permutation of the frame).
train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLUMNS)

test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

In [3]:
def show_rank_metrics(df_group: pd.DataFrame, df_proba: pd.DataFrame, df_true: pd.DataFrame,
                      min_top_k: int = 8) -> None:
    """Print where the rows with ``correct == 1`` rank inside their group.

    The three frames are concatenated column-wise (``pd.concat`` aligns them
    on their index, so callers should pass frames sharing the same index) and
    sorted by ``group`` ascending, ``proba`` descending and ``correct``
    ascending. Because of the last key, a row with ``correct == 1`` that ties
    on ``proba`` is placed after the tied rows with a smaller ``correct``
    value, i.e. ties are resolved against it.

    Printed output, in order: the shape of the combined frame, the mean
    1-based rank of the ``correct == 1`` rows, and one ``top{i} = fraction``
    line per ``i`` giving the share of those rows ranked at position ``i`` or
    better. ``top0`` is always ``0.0`` because ranks start at 1; it is kept so
    the output format stays unchanged.

    Args:
        df_group: frame with a ``group`` column.
        df_proba: frame with a ``proba`` column (the score to rank by).
        df_true: frame with a ``correct`` column.
        min_top_k: the table always goes at least up to ``top{min_top_k}``;
            it is extended automatically when a rank exceeds this value.

    Returns:
        None. Everything is reported through ``print``.
    """
    df_metric = pd.concat([df_group, df_proba, df_true], axis=1)
    df_metric = df_metric.sort_values(by=['group', 'proba', 'correct'], ascending=[True, False, True])
    print(df_metric.shape)

    # Walk the sorted rows and restart the rank counter whenever the group
    # changes. A unique sentinel (rather than a magic id such as -1) makes
    # sure the very first row always starts a new group.
    unset = object()
    cur_group = unset
    cur_pos = 0
    positions = []
    for row in df_metric.itertuples(index=False):
        if cur_group is unset or row.group != cur_group:
            cur_group = row.group
            cur_pos = 0
        cur_pos += 1
        if row.correct == 1:
            positions.append(cur_pos)

    # Nothing to rank: avoid the mean of an empty list and a division by zero.
    if not positions:
        print('\nno rows with correct == 1; nothing to rank\n')
        return

    print(f'\nmean = {np.mean(positions)}\n')

    # One bucket per rank 0..N, where N covers both the requested minimum and
    # the worst rank actually observed (so large groups cannot overflow).
    count = [0] * (max(min_top_k, max(positions)) + 1)
    for p in positions:
        count[p] += 1

    acc = 0
    sum_all = len(positions)
    for i, c in enumerate(count):
        acc += c
        print(f'top{i} = {acc / sum_all}')

In [4]:
# Per-column totals of the train and test feature frames (one value per column).
s_tr, s_te = X_train.sum(axis=0), X_test.sum(axis=0)

In [5]:
# For each split, report the median of the per-column sums as a percentage of
# the number of rows in that split.
for split_name, col_sums, frame in (('train', s_tr, X_train), ('test', s_te, X_test)):
    print(f'{split_name} median {col_sums.median() / frame.shape[0] * 100} %')

train median 1.029810298102981 %
test median 1.0040160642570282 %


In [6]:
# Fitting subset: the first 10**5 rows of the training features and the
# targets at the same positions.
ys = y_train.head(10 ** 5)
xs = X_train.head(10 ** 5)

In [7]:
# Fit an ARD regression on the subset, allowing up to 1000 iterations.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`
# for this estimator -- confirm against the installed version before changing.
clf = ARDRegression(
    n_iter=1_000,
).fit(X=xs, y=ys)

In [8]:
# Display the model's predictions for the same rows it was fitted on
# (in-sample); the notebook shows the truncated array repr as the cell output.
clf.predict(xs)

array([0.11955742, 0.12917517, 0.11955742, ..., 0.11955742, 0.11955742,
       0.11955742])

In [9]:
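# Disabled ROC plot on the test split, kept for reference:
#     metrics.plot_roc_curve(clf, X_test, y_test)
# NOTE(review): plot_roc_curve expects a fitted classifier and no longer exists
# in current scikit-learn (RocCurveDisplay replaces it) -- confirm before re-enabling.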

In [10]:
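# Disabled ROC plot on the fitting subset, kept for reference:
#     metrics.plot_roc_curve(clf, xs, ys)
# NOTE(review): plot_roc_curve expects a fitted classifier and no longer exists
# in current scikit-learn (RocCurveDisplay replaces it) -- confirm before re-enabling.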

In [11]:
# Rank metrics on the rows used for fitting. The group labels are sliced with
# len(xs) instead of a repeated literal, so they always cover exactly the
# same leading rows of `train` as `xs`/`ys`, even if the subset size changes.
# Each frame is rebuilt from plain lists/arrays so all three share a fresh
# 0..n-1 index, which is what the column-wise concat inside
# show_rank_metrics aligns on.
show_rank_metrics(
    df_group=pd.DataFrame(data=train['group'].iloc[:len(xs)].tolist(), columns=['group']),
    df_proba=pd.DataFrame(data=clf.predict(xs), columns=['proba']),
    df_true=pd.DataFrame(data=ys.tolist(), columns=['correct'])
)

(100000, 3)

mean = 2.9092154019655228

top0 = 0.0
top1 = 0.27597873368777187
top2 = 0.42798453359110683
top3 = 0.6273562107298212
top4 = 0.8260834541646528
top5 = 0.9450620267439988
top6 = 0.9889640728210085
top7 = 0.9993555662961173
top8 = 1.0


In [12]:
# Rank metrics on the full test split. The frames are built from plain
# lists/arrays so that all three carry the same fresh 0..n-1 index.
test_groups = pd.DataFrame({'group': test['group'].tolist()})
test_scores = pd.DataFrame({'proba': clf.predict(X_test)})
test_truth = pd.DataFrame({'correct': y_test.tolist()})
show_rank_metrics(df_group=test_groups, df_proba=test_scores, df_true=test_truth)

(107568, 3)

mean = 5.649189349992563

top0 = 0.0
top1 = 0.19076305220883535
top2 = 0.24624423620407557
top3 = 0.2771828052952551
top4 = 0.3017997917596311
top5 = 0.33526699390153203
top6 = 0.4069611780455154
top7 = 0.5925925925925926
top8 = 1.0


In [13]:
# Visual separator in the printed output between the ARDRegression experiment
# above and the other estimators tried below.
print('##########################################################')

##########################################################


In [14]:
# Rebind `clf` to a ridge classifier (default settings) fitted on the same subset.
clf = RidgeClassifier().fit(
    X=xs,
    y=ys,
)

In [15]:
# Display the classifier's predicted labels for the rows it was fitted on.
# NOTE(review): the recorded output shows only 0s in the visible part of the
# array -- check whether the model ever predicts the positive class.
clf.predict(xs)

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
# Rebind `clf` to a linear support-vector regressor fitted on the same subset.
# LinearSVR's solver shuffles the data internally, so random_state is pinned
# to make the fitted coefficients (and the predictions shown below)
# repeatable across runs.
clf = LinearSVR(random_state=0).fit(xs, ys)
# Display in-sample predictions for the fitting subset.
clf.predict(xs)

array([-4.38046265e-07, -6.60178538e-07, -3.97882084e-07, ...,
       -1.51166801e-06, -4.26576138e-07, -5.22718748e-07])