In [1]:
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV


In [2]:
# Seed for the row shuffles below. Without it `.sample(frac=1)` produces a
# different row order on every kernel restart, so results are not reproducible.
SHUFFLE_SEED = 0

# Columns removed from the feature matrices. 'correct' is the target and
# 'group' is the per-query group id consumed by the ranking metrics;
# 'name' and 'firstChar' are excluded from the model inputs as well.
NON_FEATURE_COLUMNS = ['name', 'firstChar', 'group', 'correct']

# Training split: load, shuffle all rows (frac=1), then separate target and features.
train = pd.read_csv('train.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_train = train['correct']
X_train = train.drop(columns=NON_FEATURE_COLUMNS)

# Test split: same treatment as the training split.
test = pd.read_csv('test.csv', index_col=0).sample(frac=1, random_state=SHUFFLE_SEED)
y_test = test['correct']
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

In [3]:
def show_rank_metrics(df_group: pd.DataFrame, df_proba: pd.DataFrame, df_true: pd.DataFrame):
    """Print rank statistics of the rows labelled correct within each group.

    The three frames are concatenated column-wise (aligned on their index) and
    sorted by group, then by descending probability.  Ties in probability are
    broken pessimistically: rows with ``correct == 0`` are placed before rows
    with ``correct == 1``.  Every row then receives a 1-based rank inside its
    group, and the ranks of the rows with ``correct == 1`` are summarised.

    Printed output, in order:
      * the shape of the combined frame;
      * ``mean = <mean rank of the correct rows>``;
      * one ``top{i} = <fraction>`` line for each ``i`` from 0 up to the worst
        observed rank, giving the share of correct rows ranked ``i`` or better.
        Ranks start at 1, so the ``top0`` line is always ``0.0``.

    Args:
        df_group: frame with a ``group`` column.
        df_proba: frame with a ``proba`` column (the score to rank by).
        df_true: frame with a ``correct`` column; rows equal to 1 are counted.

    Returns:
        None.  All results are printed.
    """
    df_metric = pd.concat([df_group, df_proba, df_true], axis=1)
    df_metric = df_metric.sort_values(by=['group', 'proba', 'correct'], ascending=[True, False, True])
    print(df_metric.shape)

    positions = []
    cur_group = None
    cur_pos = 0
    # An explicit flag marks the first row instead of a sentinel group value,
    # so that no real group id (e.g. -1) can be mistaken for "no group yet".
    first_row = True
    for row in df_metric.itertuples():
        if first_row or row.group != cur_group:
            # Entering a new group: restart the rank counter.
            first_row = False
            cur_group = row.group
            cur_pos = 1
        else:
            cur_pos += 1
        if row.correct == 1:
            positions.append(cur_pos)

    if not positions:
        # Nothing to summarise; max() and the mean are undefined on an empty list.
        print('\nno rows with correct == 1, rank metrics are undefined\n')
        return

    print(f'\nmean = {np.mean(positions)}\n')

    # count[r] = number of correct rows that were ranked exactly r.
    count = [0] * (max(positions) + 1)
    for p in positions:
        count[p] += 1

    # Cumulative share of correct rows at each rank cut-off.
    acc = 0
    sum_all = len(positions)
    for i, c in enumerate(count):
        acc += c
        print(f'top{i} = {acc / sum_all}')

In [4]:
# Build a CatBoost classifier with library-default hyper-parameters; only the
# random seed is pinned and training log output is switched off. Then fit it
# on the training features and target.
model = CatBoostClassifier(
    random_seed=0,
    verbose=False,
)
model.fit(X=X_train, y=y_train)

<catboost.core.CatBoostClassifier at 0x7ff5eef8dd00>

In [5]:
#model.predict_proba(X_test)

In [9]:
# Rank metrics on the training split. Each input is rebuilt as a fresh
# single-column frame (default 0..n-1 index) so the three frames line up
# row-by-row; the score is column 1 of predict_proba.
show_rank_metrics(
    df_group=pd.DataFrame({'group': train['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': model.predict_proba(X_train)[:, 1]}),
    df_true=pd.DataFrame({'correct': y_train.tolist()}),
)

(271557, 3)

mean = 1.9128691213999272

top0 = 0.0
top1 = 0.6564809597984953
top2 = 0.7733072614589203
top3 = 0.8503960494481821
top4 = 0.9056772611274981
top5 = 0.945017068239817
top6 = 0.971928545388261
top7 = 0.987803665528784
top8 = 0.996520067610115
top9 = 1.0


In [6]:
# Rank metrics on the held-out test split. Each input is rebuilt as a fresh
# single-column frame (default 0..n-1 index) so the three frames line up
# row-by-row; the score is column 1 of predict_proba.
show_rank_metrics(
    df_group=pd.DataFrame({'group': test['group'].tolist()}),
    df_proba=pd.DataFrame({'proba': model.predict_proba(X_test)[:, 1]}),
    df_true=pd.DataFrame({'correct': y_test.tolist()}),
)

(218764, 3)

mean = 4.780841454718327

top0 = 0.0
top1 = 0.4518659377228429
top2 = 0.5508676016163537
top3 = 0.6082719277394818
top4 = 0.6486807701449965
top5 = 0.6914666032802472
top6 = 0.7262895174708819
top7 = 0.7599239362966485
top8 = 0.7899928690278107
top9 = 0.8163774661278821
top10 = 0.8406227715711909
top11 = 0.8645115284050392
top12 = 0.885072498217257
top13 = 0.9032564772997386
top14 = 0.9204896600903256
top15 = 0.9364154979795579
top16 = 0.9506774423579748
top17 = 0.9619681483242215
top18 = 0.9711195626337057
top19 = 0.9792013311148087
top20 = 0.9853815070121227
top21 = 0.990729736154029
top22 = 0.993225576420252
top23 = 0.9953648680770145
top24 = 0.998098407416211
top25 = 0.9992869027810791
top26 = 1.0


In [12]:
# Show the fitted model's feature importances; prettified=True renders them as
# a table with 'Feature Id' and 'Importances' columns (see the cell output).
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,name_Event,9.254786
1,referenceType_REFERENCE_,7.239060
2,name_Keys,7.038075
3,name_Util,6.866616
4,referenceType_TYPE,5.247700
...,...,...
407,insideStatement1_switch,0.000000
408,insideStatement1_ternary_else,0.000000
409,insideStatement1_ternary_then,0.000000
410,insideStatement1_while_body,0.000000


In [8]:
# Persist the fitted model to a file named "kek" in the working directory.
# No format argument is passed, so CatBoost's default model format is used.
# NOTE(review): the file name is not descriptive and has no extension; consider renaming.
model.save_model("kek")