In [25]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import ARDRegression, RidgeClassifier
from sklearn.svm import LinearSVR



In [2]:
# Columns that are not model inputs: 'correct' is the target, 'group' is used
# later for per-group ranking, and 'name' / 'firstChar' are dropped as well.
NON_FEATURE_COLUMNS = ['name', 'firstChar', 'group', 'correct']
# Fixed seed so the row shuffle is reproducible; later cells train on the first
# 10 ** 5 shuffled rows, so an unseeded shuffle changes the training set per run.
SEED = 42

train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLUMNS)

test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

In [3]:
def show_rank_metrics(df_group: pd.DataFrame, df_proba: pd.DataFrame, df_true: pd.DataFrame,
                      min_top: int = 8) -> None:
    """Print how highly the rows with ``correct == 1`` are ranked inside their group.

    The three frames are joined column-wise (aligned on their index), sorted by
    ``group`` and then by descending ``proba``, and every row receives a 1-based
    position within its group.  Rows with equal ``proba`` are ordered with
    ``correct == 0`` first, so ties are resolved against the correct row.

    Printed output, in order:
      * the shape of the joined frame,
      * ``mean``: the average position of the rows with ``correct == 1``,
      * ``top{k}`` for k = 0..K: the fraction of those rows at position <= k,
        where K is ``min_top`` or the worst observed position, whichever is larger.

    If no row has ``correct == 1`` the mean is printed as ``nan`` and the
    ``top{k}`` table is skipped.

    Args:
        df_group: frame with a ``group`` column.
        df_proba: frame with a ``proba`` column (the score used for ranking).
        df_true: frame with a ``correct`` column.
        min_top: the ``top{k}`` table is always printed at least up to this k.

    Returns:
        None; the metrics are only printed.
    """
    df_metric = pd.concat([df_group, df_proba, df_true], axis=1)
    df_metric = df_metric.sort_values(by=['group', 'proba', 'correct'], ascending=[True, False, True])
    print(df_metric.shape)

    positions = []
    cur_group = None
    cur_pos = 0  # 0 means "no row seen yet", so no group id can collide with a sentinel
    for row in df_metric.itertuples():
        if cur_pos == 0 or row.group != cur_group:
            # First row of a new group: restart the position counter.
            cur_group = row.group
            cur_pos = 1
        else:
            cur_pos += 1
        if row.correct == 1:
            positions.append(cur_pos)

    if not positions:
        # Nothing to rank: report an undefined mean instead of dividing by zero.
        print('\nmean = nan\n')
        return

    print(f'\nmean = {np.mean(positions)}\n')

    # One counter per position; sized from the data so large groups do not overflow it.
    count = [0] * (max(min_top, max(positions)) + 1)
    for p in positions:
        count[p] += 1
    acc = 0
    sum_all = len(positions)
    for i, c in enumerate(count):
        acc += c
        print(f'top{i} = {acc / sum_all}')

In [4]:
# Per-column totals of the train and test feature matrices.
s_tr, s_te = X_train.sum(), X_test.sum()

In [5]:
# Median column total, expressed as a percentage of the number of rows.
train_median_pct = s_tr.median() / len(X_train) * 100
test_median_pct = s_te.median() / len(X_test) * 100
print(f'train median {train_median_pct} %')
print(f'test median {test_median_pct} %')

train median 1.0424971210913279 %
test median 0.978129464776349 %


In [14]:
# Work on the first 10 ** 5 rows of the (already shuffled) training data only.
subset_rows = slice(10 ** 5)
xs = X_train.iloc[subset_rows]
ys = y_train.iloc[subset_rows]

In [15]:
# scikit-learn 1.3 renamed ARDRegression's `n_iter` to `max_iter` and 1.5 removed
# the old name, so use whichever keyword the installed version accepts.
ard_iter_kw = 'max_iter' if 'max_iter' in ARDRegression().get_params() else 'n_iter'
clf = ARDRegression(**{ard_iter_kw: 10 ** 3}).fit(xs, ys)

In [16]:
# Scores of the fitted model on the rows it was trained on.
train_scores = clf.predict(xs)
train_scores

array([0.11698988, 0.13773321, 0.11698988, ..., 0.10375774, 0.11698988,
       0.11698988])

In [17]:
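# metrics.plot_roc_curve(clf, X_test, y_test)
# NOTE(review): plot_roc_curve expects a classifier (clf is an ARDRegression here) and was
# removed in scikit-learn 1.2; the replacement is
# metrics.RocCurveDisplay.from_predictions(y_test, clf.predict(X_test)).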

In [18]:
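# metrics.plot_roc_curve(clf, xs, ys)
# NOTE(review): plot_roc_curve expects a classifier (clf is an ARDRegression here) and was
# removed in scikit-learn 1.2; the replacement is
# metrics.RocCurveDisplay.from_predictions(ys, clf.predict(xs)).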

In [19]:
# Ranking quality on the same training subset the model was fitted on.
group_frame = pd.DataFrame({'group': train['group'].iloc[:10 ** 5].tolist()})
proba_frame = pd.DataFrame({'proba': clf.predict(xs)})
true_frame = pd.DataFrame({'correct': ys.tolist()})
show_rank_metrics(df_group=group_frame, df_proba=proba_frame, df_true=true_frame)

(100000, 3)

mean = 2.6605284227381905

top0 = 0.0
top1 = 0.2945556445156125
top2 = 0.48895116092874297
top3 = 0.7103282626100881
top4 = 0.8831865492393915
top5 = 0.9685348278622898
top6 = 0.9944755804643715
top7 = 0.999439551641313
top8 = 1.0


In [20]:
# Ranking quality on the held-out test set.
group_frame = pd.DataFrame({'group': test['group'].tolist()})
proba_frame = pd.DataFrame({'proba': clf.predict(X_test)})
true_frame = pd.DataFrame({'correct': y_test.tolist()})
show_rank_metrics(df_group=group_frame, df_proba=proba_frame, df_true=true_frame)

(72792, 3)

mean = 5.737223870755028

top0 = 0.0
top1 = 0.18056929332893726
top2 = 0.2239806572150786
top3 = 0.2534344433454226
top4 = 0.2821189141663919
top5 = 0.32124409275744586
top6 = 0.4010330805583031
top7 = 0.6003956478733927
top8 = 1.0


In [None]:
# Visual divider between the ARDRegression experiment above and the
# RidgeClassifier / LinearSVR trials below; prints a row of '#' characters.
print('##########################################################')

In [23]:
# Replace the previous model with a default ridge classifier fitted on the same subset.
clf = RidgeClassifier()
clf = clf.fit(xs, ys)

In [26]:
# Predicted labels of the current model on its own training rows.
train_labels = clf.predict(xs)
train_labels

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
# LinearSVR shuffles the data with a pseudo-random generator while fitting, so
# pin random_state to make the fitted model (and the scores shown) repeatable.
clf = LinearSVR(random_state=0).fit(xs, ys)
clf.predict(xs)

array([-3.13309596e-07,  1.41723083e-08,  3.64635371e-08, ...,
        3.05629395e-07, -2.26621427e-08,  5.18936703e-09])