In [1]:
from catboost import CatBoostClassifier, CatBoostRegressor, FeaturesData
from sklearn.model_selection import train_test_split
from database.async_db import asyncHandler as db
from navec import Navec
import numpy as np

# 1. Sentence embedding

In [2]:
async def get_vector_data(id_t, id_f):
    """Fetch one vector per id and build the matching binary labels.

    Each id is looked up individually, one after another, through
    ``db.get_vector_by_p_id``.

    Args:
        id_t: ids of the positive examples (labelled 1).
        id_f: ids of the negative examples (labelled 0).

    Returns:
        A ``(vectors, labels)`` tuple: the vectors for ``id_t`` followed by
        the vectors for ``id_f``, and a list of 1s/0s in the same order.
    """
    positives = []
    for p_id in id_t:
        positives.append(await db.get_vector_by_p_id(p_id))

    negatives = []
    for p_id in id_f:
        negatives.append(await db.get_vector_by_p_id(p_id))

    labels = [1 for _ in positives] + [0 for _ in negatives]
    return positives + negatives, labels

In [3]:
id_t = [48,115,170,873,2919,1364,380,8986,614,2391,3812,10810]

id_f = [9007,8915,11427,385,5451,9366,5079,3816,3955,2424,3593,49,174,2949,893,120,3843,1382]

x, y = await get_vector_data(id_t, id_f)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

cat = CatBoostClassifier(
    thread_count=8,
    iterations=50,
    random_seed=42,
    learning_rate=0.1,
    custom_loss=['AUC', 'Accuracy', 'PRAUC']
)

cat.fit(
   X_train, y_train,
   eval_set=(X_test, y_test),
   logging_level='Silent',
   plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2ad669c75e0>

# 2. All features

In [4]:
async def get_cat_data(id_t, id_f):
    """Fetch the feature rows for all ids in one call and build binary labels.

    Args:
        id_t: ids of the positive examples (labelled 1).
        id_f: ids of the negative examples (labelled 0).

    Returns:
        A ``(rows, labels)`` tuple, where ``rows`` is whatever
        ``db.get_cat_data_by_list_id`` returns for ``id_t + id_f`` and
        ``labels`` is ``len(id_t)`` ones followed by ``len(id_f)`` zeros.

    NOTE(review): the labels only line up with the rows if the DB call
    returns exactly one row per id, in the order the ids were passed --
    confirm against ``get_cat_data_by_list_id``.
    """
    all_ids = id_t + id_f
    rows = await db.get_cat_data_by_list_id(all_ids)
    labels = [1] * len(id_t) + [0] * len(id_f)
    return rows, labels

In [5]:
id_t = [48,115,170,873,2919,1364,380,8986,614,2391,3812,10810]

id_f = [9007,8915,11427,385,5451,9366,5079,3816,3955,2424,3593,49,174,2949,893,120,3843,1382]

x, y = await get_cat_data(id_t, id_f)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [6]:
# Same hyper-parameters as the sentence-embedding model in section 1,
# including the fixed seed so training is reproducible.
cat = CatBoostClassifier(
    thread_count=8,
    iterations=50,
    random_seed=42,
    learning_rate=0.1,
    custom_loss=['AUC', 'Accuracy', 'PRAUC']
)

# Columns 0 and 7 of each row are handed to CatBoost as embedding features.
# NOTE(review): presumably these are the two vector-valued columns returned by
# db.get_cat_data_by_list_id -- confirm the column layout.
cat.fit(
   X_train, y_train,
   eval_set=(X_test, y_test),
   embedding_features=[0,7],
   logging_level='Silent',
   plot=True
)

  result = asarray(a).shape


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2ad66e1e0e0>

In [7]:
await db.get_cat_data_by_list_id(id_t)


[[array([-1.14320554e-01, -5.17436564e-01,  5.98322414e-02,  9.47088078e-02,
          6.78423882e-01,  6.11604691e-01,  1.99079081e-01, -7.27132142e-01,
         -9.86254886e-02,  2.29392108e-02, -8.32904801e-02,  1.11664847e-01,
          2.59801596e-01,  1.68566015e-02, -2.98564136e-01,  1.33305918e-02,
         -4.09926414e-01, -3.22095662e-01, -2.09995243e-03, -3.97641100e-02,
          4.61944640e-01, -6.15133286e-01,  3.23900640e-01,  4.74772044e-02,
          5.59909008e-02, -2.66976301e-02,  3.09468001e-01,  1.80762067e-01,
         -9.69232976e-01,  4.01544482e-01,  1.49258208e-02,  4.06529486e-01,
         -1.33526877e-01,  3.13795656e-01, -4.89254147e-01,  5.23282945e-01,
         -8.25839564e-02, -4.90869999e-01, -1.29075766e-01, -1.93971872e-01,
         -9.01946843e-01,  4.99536574e-01, -7.50638127e-01, -1.72357425e-01,
          1.89083070e-01,  1.62604079e-01,  6.91436678e-02,  6.69249520e-02,
          1.49134412e-01, -1.04182102e-01, -5.19611835e-02,  5.29051423e-02,

In [21]:
predict = cat.predict(await db.get_cat_data_by_list_id(id_t))
predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [22]:
proba = cat.predict_proba(await db.get_cat_data_by_list_id(id_t))
proba

array([[0.28624402, 0.71375598],
       [0.35256826, 0.64743174],
       [0.3164677 , 0.6835323 ],
       [0.40835465, 0.59164535],
       [0.26011421, 0.73988579],
       [0.38101975, 0.61898025],
       [0.39407825, 0.60592175],
       [0.32260016, 0.67739984],
       [0.26391465, 0.73608535],
       [0.42493846, 0.57506154],
       [0.40954061, 0.59045939],
       [0.34763648, 0.65236352]])

In [23]:
# For every row predicted as class 1, print its class-1 probability.
for label, class_probs in zip(predict, proba):
    if label == 1:
        print(class_probs[1])

0.7137559753529478
0.6474317367215254
0.6835323002486959
0.5916453539327544
0.7398857871859266
0.6189802496566345
0.6059217545041125
0.6773998415053795
0.73608535311474
0.5750615409079621
0.5904593869358257
0.6523635151461498
