In [1]:
import numpy as np 
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import cuda 
from copy import deepcopy
from sklearn.metrics import ndcg_score
from pandarallel import pandarallel

# Initialise pandarallel with its progress bar enabled.
# NOTE(review): no parallel_apply/parallel_map call is visible in this part of
# the notebook — confirm this initialisation is still needed.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Choose the compute device: the GPU when torch reports one, otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Read the preprocessed training set and append a sequential `post_id`
# (0..n-1) that identifies each row.
raw_data = pd.read_json(
    '/kaggle/input/vc-it-cup-ranking/train_preprocessed.json'
).assign(post_id=lambda frame: np.arange(len(frame)))

In [5]:
# Preview the first rows of the loaded frame.
raw_data.head()

Unnamed: 0,comments,score,posts,post_id
0,[go back school ident give founder go back sch...,"[0, 1, 2, 3, 4]",mani summer combin funde decid continu startup...,0
1,[curious head long run cbs tear fit imag tri e...,"[0, 1, 2, 3, 4]",cbs acquir lastfm 280m,1
2,[hate fals claim fiduciari respons public comp...,"[0, 1, 2, 3, 4]",costco becam antiwalmart,2
3,[real point simpl power order sound edgi subve...,"[0, 1, 2, 3, 4]",fortun favor big turd screw money art,3
4,"[look someon hasnt read mythic manmonth, chanc...","[0, 1, 2, 3, 4]",startupweekend 70 founder creat one compani we...,4


# Подбор подходящей модели

In [5]:
# I usually use seed = 42.
# The split is done on the rows of raw_data, before the list columns are
# exploded in a later cell.
train, val = train_test_split(raw_data, random_state=42)

In [6]:
# Drop the reference to the full frame; only the train/val parts are used below.
del raw_data

In [7]:
# Turn each row's parallel `score` / `comments` lists into one row per
# (comment, score) pair; exploding both columns together keeps them aligned.
train_df, val_df = (
    part.explode(['score', 'comments']) for part in (train, val)
)

In [8]:
# The un-exploded frames are no longer referenced after this point.
del train, val

In [9]:
# Training part: feature columns, target (`score`) and group key (`post_id`).
X_train = train_df.drop(columns=['score', 'post_id'])
y_train, posts_id_train = train_df['score'], train_df['post_id']

# Validation part, split the same way.
X_val = val_df.drop(columns=['score', 'post_id'])
y_val, posts_id_val = val_df['score'], val_df['post_id']

In [10]:
# Release the names of the exploded frames now that X/y/group ids are extracted.
del train_df, val_df

In [11]:
# Text fed to the encoder: post title and comment joined by a single space.
for features in (X_train, X_val):
    features["text"] = features['posts'] + " " + features['comments']

In [12]:
# Number of whitespace-separated tokens in every comment.
for features in (X_train, X_val):
    features['sent_length'] = features['comments'].str.split().str.len()

# Получаем эмбеддинги

In [13]:
pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [14]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" sentence-embedding model.
# NOTE(review): the name `model` is rebound to a CatBoostRanker further down,
# so the encoder is no longer reachable once that cell has run.
model = SentenceTransformer("all-MiniLM-L6-v2")


KeyboardInterrupt: 

In [14]:
from torch.utils.data import DataLoader
from tqdm import tqdm 

In [15]:
# Serve the training texts to the encoder in chunks of 1000 strings.
train_dl = DataLoader(X_train['text'].to_numpy(), batch_size=1000)

In [16]:
# Encode every training batch; each element of `res` is one batch of embeddings.
res = [model.encode(chunk, show_progress_bar=False) for chunk in tqdm(train_dl)]

100%|██████████| 331/331 [06:56<00:00,  1.26s/it]


In [17]:
# Stack the per-batch embedding arrays row-wise into one training matrix.
train_emb = np.concatenate(res, axis=0)

In [18]:
# Serve the validation texts to the encoder in chunks of 1000 strings.
val_dl = DataLoader(X_val['text'].to_numpy(), batch_size=1000)

In [19]:
# Encode every validation batch; `res` is reused and now holds validation embeddings.
res = [model.encode(chunk, show_progress_bar=False) for chunk in tqdm(val_dl)]

100%|██████████| 111/111 [02:15<00:00,  1.22s/it]


In [31]:
# Persist the training embeddings; a later cell reloads them with np.load.
np.save('train_emb.npy', train_emb)

In [28]:
# Stack the per-batch embedding arrays row-wise into one validation matrix.
val_emb = np.concatenate(res, axis=0)

In [32]:
# Persist the validation embeddings; a later cell reloads them with np.load.
np.save('val_emb.npy', val_emb)

In [15]:
# Reload the cached training embeddings (replaces any in-memory `train_emb`).
# NOTE(review): presumably the file written by the np.save cell above — this
# holds only if the notebook's working directory is /kaggle/working; confirm.
train_emb = np.load("/kaggle/working/train_emb.npy")

In [16]:
# Reload the cached validation embeddings (replaces any in-memory `val_emb`).
# NOTE(review): presumably the file written by the np.save cell above — this
# holds only if the notebook's working directory is /kaggle/working; confirm.
val_emb = np.load("/kaggle/working/val_emb.npy")

In [17]:
# Sanity check: (rows, embedding width) of the validation embeddings.
val_emb.shape

(110135, 384)

In [18]:
# Display the training feature frame (pandas truncates the rendering).
X_train

Unnamed: 0,comments,posts,text,sent_length
53256,major thing hold back higher educ massiv disru...,schoolless revolut,schoolless revolut major thing hold back highe...,69
53256,biggest problem mooc find lack real engag stud...,schoolless revolut,schoolless revolut biggest problem mooc find l...,64
53256,someon colleg student whos skeptic free onlin ...,schoolless revolut,schoolless revolut someon colleg student whos ...,39
53256,abil doublespe video make mooc sens immedi lov...,schoolless revolut,schoolless revolut abil doublespe video make m...,29
53256,open free access highqual curriculum amaz fore...,schoolless revolut,schoolless revolut open free access highqual c...,144
...,...,...,...,...
15795,nice wonder differ forward googl voic sms 38 v...,notifo yc w10 add googl voic sms notif,notifo yc w10 add googl voic sms notif nice wo...,24
15795,voicemail transcript sent slight differ address,notifo yc w10 add googl voic sms notif,notifo yc w10 add googl voic sms notif voicema...,6
15795,servic top prowl exact manner quit glad servic...,notifo yc w10 add googl voic sms notif,notifo yc w10 add googl voic sms notif servic ...,16
15795,notifo logotyp remind old stall notif project ...,notifo yc w10 add googl voic sms notif,notifo yc w10 add googl voic sms notif notifo ...,12


In [19]:
# Display the validation feature frame (pandas truncates the rendering).
X_val

Unnamed: 0,comments,posts,text,sent_length
34937,grey kill kennedi presid mankind final unit st...,post claim hacker news fix,post claim hacker news fix grey kill kennedi p...,35
34937,divert investor capit startup pick anoth start...,post claim hacker news fix,post claim hacker news fix divert investor cap...,81
34937,guess first question proof look utter bullshit...,post claim hacker news fix,post claim hacker news fix guess first questio...,19
34937,score adjac stori suggest anomali explain karm...,post claim hacker news fix,post claim hacker news fix score adjac stori s...,16
34937,,post claim hacker news fix,post claim hacker news fix,0
...,...,...,...,...
20812,30 activ dislik social situat requir wellworn ...,ask hn aq,ask hn aq 30 activ dislik social situat requir...,37
20812,score 26 anxious wreck hate talk love talk cra...,ask hn aq,ask hn aq score 26 anxious wreck hate talk lov...,35
20812,32 troubl social situat especi discern nonverb...,ask hn aq,ask hn aq 32 troubl social situat especi disce...,29
20812,got 30 remark seem annoy talk convers bother t...,ask hn aq,ask hn aq got 30 remark seem annoy talk conver...,20


In [24]:
# X_train = X_train.drop(['comments', 'posts'], axis=1)
# X_val = X_val.drop(['comments', 'posts'], axis=1)

In [20]:
# Scale the targets by the largest training score so training labels lie in [0, 1].
# `score` comes out of DataFrame.explode, which leaves the column with object
# dtype, so it is cast to float explicitly. The result is bound to a new Series
# instead of dividing in place, which avoids mutating the column taken from the
# exploded frame.
max_target = np.max(y_train)
y_train = y_train.astype(float) / max_target
y_val = y_val.astype(float) / max_target

In [21]:
# Feature matrix layout: column 0 is the comment length, the remaining columns
# are the sentence embedding.
X_train_vals = np.column_stack([X_train['sent_length'].to_numpy(), train_emb])
X_val_vals = np.column_stack([X_val['sent_length'].to_numpy(), val_emb])

In [22]:
# Sanity check: shape of the training feature matrix.
X_train_vals.shape

(330400, 385)

# Добавляем расстояния Яро-Винклера (не используем)

In [59]:
# Read the precomputed Jaro-Winkler distance table and display it.
jaro_winkler_distance = pd.read_csv("/kaggle/input/vc-it-cup-ranking/jaro_winkler_distances_train.csv")
jaro_winkler_distance

Unnamed: 0.1,Unnamed: 0,text,0,1,2,3,4
0,0,mani summer combin funde decid continu startup...,0.548492,0.639405,0.570749,0.620368,0.670622
1,1,cbs acquir lastfm 280m,0.583483,0.497695,0.509736,0.525455,0.484040
2,2,costco becam antiwalmart,0.573232,0.606162,0.542858,0.765909,0.532750
3,3,fortun favor big turd screw money art,0.521665,0.525623,0.561185,0.595107,0.657073
4,4,startupweekend 70 founder creat one compani we...,0.598498,0.549352,0.762029,0.628534,0.626611
...,...,...,...,...,...,...,...
88102,88102,dont upgrad io 801 may experi servic,0.521032,0.551713,0.514161,0.606363,0.622081
88103,88103,ask hn us hner get health insur,0.542282,0.574363,0.579493,0.503783,0.523242
88104,88104,justin gordon use react rail,0.542470,0.511519,0.614266,0.616408,0.565476
88105,88105,io 801 releas broken iphon 6 model withdrawn,0.520738,0.628396,0.553345,0.594541,0.658189


In [60]:
# Keep only the distance columns and turn every row into a plain Python list.
jwd = (
    jaro_winkler_distance
    .drop(columns=['text', 'Unnamed: 0'])
    .to_numpy()
    .tolist()
)

In [61]:
# Split the distance rows with the same seed that was used for the main split.
# NOTE(review): this reproduces the main train/val partition only if `jwd` has
# the same number of rows, in the same order, as the frame split earlier — confirm.
jwd_train, jwd_val = train_test_split(jwd, random_state=42)

In [62]:
# One row per distance value: wrap the per-row lists in a column and explode it.
jwd_train_exp = pd.DataFrame({'jwd': jwd_train}).explode('jwd')

In [63]:
# One row per distance value: wrap the per-row lists in a column and explode it.
jwd_val_exp = pd.DataFrame({'jwd': jwd_val}).explode('jwd')

In [64]:
# Sanity check: row counts of the exploded distance frames.
jwd_train_exp.shape, jwd_val_exp.shape

((330400, 1), (110135, 1))

In [65]:
# Sanity check: shapes of the feature matrices the distances will be attached to.
X_train_vals.shape, X_val_vals.shape

((330400, 386), (110135, 385))

In [66]:
# Pull the exploded distance column out as NumPy arrays.
jwd_train_array = jwd_train_exp['jwd'].to_numpy()
jwd_val_array = jwd_val_exp['jwd'].to_numpy()

In [67]:
# Make the distances a single column of shape (n, 1) for np.hstack.
# reshape(-1, 1) gives the same result as adding a new axis to a 1-D array but,
# unlike `[:, np.newaxis]`, it does not add a further dimension if the cell is
# executed again.
jwd_train_array = jwd_train_array.reshape(-1, 1)

In [68]:
# Make the distances a single column of shape (n, 1) for np.hstack.
# reshape(-1, 1) gives the same result as adding a new axis to a 1-D array but,
# unlike `[:, np.newaxis]`, it does not add a further dimension if the cell is
# executed again.
jwd_val_array = jwd_val_array.reshape(-1, 1)

In [69]:
# Prepend the distance column to the training features.
# The distances come from an exploded column and therefore have object dtype;
# without the cast np.hstack would promote the whole feature matrix to object.
# NOTE(review): this cell rebinds X_train_vals to a widened copy of itself, so
# running it twice adds the column twice.
X_train_vals = np.hstack([jwd_train_array.astype(float), X_train_vals])


In [52]:
# Sanity check: training feature matrix shape after attaching the distance column.
X_train_vals.shape

(330400, 386)

In [76]:
# Prepend the distance column to the validation features.
# The distances come from an exploded column and therefore have object dtype;
# without the cast np.hstack would promote the whole feature matrix to object.
# NOTE(review): this cell rebinds X_val_vals to a widened copy of itself, so
# running it twice adds the column twice.
X_val_vals = np.hstack([jwd_val_array.astype(float), X_val_vals])

In [78]:
# Sanity check: validation feature matrix shape after attaching the distance column.
X_val_vals.shape

(110135, 386)

# Добавляем анализ сентимента

In [25]:
# Read the precomputed sentiment labels, discard the saved index column and
# preview the first rows.
train_sent_df = pd.read_csv(
    '/kaggle/input/vc-it-cup-ranking/train_sent_df.csv'
).drop(columns='Unnamed: 0')
train_sent_df.head()

Unnamed: 0,0,1,2,3,4
0,Irrelevant,Irrelevant,Irrelevant,Positive,Neutral
1,Positive,Negative,Positive,Negative,Irrelevant
2,Negative,Neutral,Negative,Irrelevant,Irrelevant
3,Irrelevant,Neutral,Negative,Irrelevant,Positive
4,Neutral,Irrelevant,Negative,Neutral,Irrelevant


In [28]:
# Split the sentiment rows with the same seed that was used for the main split.
# NOTE(review): this reproduces the main train/val partition only if
# train_sent_df has the same number of rows, in the same order, as the frame
# split earlier — confirm.
train_sent, val_sent = train_test_split(train_sent_df, random_state=42)

In [29]:
# The unsplit sentiment table is not referenced again.
del train_sent_df

In [32]:
# Store every row of sentiment labels as a single list in column 0.
train_sent_exp = pd.DataFrame({0: train_sent.to_numpy().tolist()})

In [33]:
# Store every row of sentiment labels as a single list in column 0.
val_sent_exp = pd.DataFrame({0: val_sent.to_numpy().tolist()})

In [36]:
# Unroll the label lists so that each sentiment label gets its own row.
train_sent_exp, val_sent_exp = (
    wrapped.explode(0) for wrapped in (train_sent_exp, val_sent_exp)
)

In [37]:
from sklearn.preprocessing import OneHotEncoder

# Learn the sentiment categories from the training labels, then one-hot encode
# those same labels (the result is a sparse matrix).
encoder = OneHotEncoder().fit(train_sent_exp)
train_sent_enc = encoder.transform(train_sent_exp)

In [40]:
# Encode the validation labels with the encoder fitted on the training labels.
val_sent_enc = encoder.transform(val_sent_exp)

In [41]:
# Sanity check: rows and number of one-hot columns of the validation encoding.
val_sent_enc.shape

(110135, 4)

In [42]:
# Only the encoded matrices are used from here on.
del train_sent_exp, val_sent_exp, encoder

In [47]:
# Sanity check: the row counts must agree before the two blocks are stacked.
train_sent_enc.shape, X_train_vals.shape

((330400, 4), (330400, 385))

In [54]:
# Peek at the first five one-hot rows in dense form.
train_sent_enc[:5].todense()

matrix([[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.]])

In [55]:
# Prepend the one-hot sentiment columns to the training features.
# toarray() returns a plain ndarray, whereas todense() returns np.matrix and
# would turn the stacked result into an np.matrix as well.
# NOTE(review): this cell rebinds X_train_vals to a widened copy of itself, so
# running it twice adds the columns twice.
X_train_vals = np.hstack([train_sent_enc.toarray(), X_train_vals])

In [57]:
# Sanity check: training feature matrix shape after adding the sentiment columns.
X_train_vals.shape

(330400, 389)

In [58]:
# Prepend the one-hot sentiment columns to the validation features.
# toarray() returns a plain ndarray, whereas todense() returns np.matrix and
# would turn the stacked result into an np.matrix as well.
# NOTE(review): this cell rebinds X_val_vals to a widened copy of itself, so
# running it twice adds the columns twice.
X_val_vals = np.hstack([val_sent_enc.toarray(), X_val_vals])

In [59]:
# Sanity check: validation feature matrix shape after adding the sentiment columns.
X_val_vals.shape

(110135, 389)

In [67]:
# Make sure the training features are a plain ndarray (not np.matrix).
# np.asarray converts without duplicating the buffer, whereas np.array would
# make a second full copy of the feature matrix.
X_train_vals = np.asarray(X_train_vals)

In [68]:
# Make sure the validation features are a plain ndarray (not np.matrix).
# np.asarray converts without duplicating the buffer, whereas np.array would
# make a second full copy of the feature matrix.
X_val_vals = np.asarray(X_val_vals)

In [73]:
# The sparse encodings are already part of the feature matrices.
del train_sent_enc, val_sent_enc

# Добавляем логиты классификатора

In [74]:
# Load the precomputed classifier logits for the train and validation parts.
logits_train = np.load('/kaggle/input/vc-it-cup-ranking/logits_train.npy')
logits_val = np.load('/kaggle/input/vc-it-cup-ranking/logits_val.npy')

In [75]:
# Compare validation row counts of the features and the logits.
X_val_vals.shape, logits_val.shape

((110135, 389), (110134, 5))

In [76]:
# Compare training row counts of the features and the logits.
X_train_vals.shape, logits_train.shape

((330400, 389), (330401, 5))

In [77]:
# Append the classifier logits as extra feature columns.
# The last row of logits_train and the last row of X_val_vals are dropped so
# that the row counts match (the shape checks above show 330401 vs 330400 and
# 110134 vs 110135).
# NOTE(review): trimming only equalises the lengths; nothing here establishes
# that row i of the logits belongs to row i of the features. The logit sizes
# equal what a 75/25 split of all 440535 exploded rows would produce, so the
# logits may come from a different split than the features — confirm how
# the .npy files were generated before trusting these columns.
X_train_with_logits = np.hstack([X_train_vals, logits_train[:-1]])
X_val_with_logits = np.hstack([X_val_vals[:-1], logits_val])

In [78]:
# Keep exactly one group id per row of the validation feature matrix.
# Trimming positionally to the matrix length (instead of `[:-1]`) gives the same
# result on the first run and does not drop further rows if the cell is re-run.
# NOTE(review): the dropped row leaves the last validation group one comment short.
posts_id_val = posts_id_val.iloc[:X_val_with_logits.shape[0]]

In [79]:
# Keep exactly one label per row of the validation feature matrix.
# Trimming positionally to the matrix length (instead of `[:-1]`) gives the same
# result on the first run and does not drop further rows if the cell is re-run.
y_val = y_val.iloc[:X_val_with_logits.shape[0]]

(330400,)

In [82]:
# Wrap features, scaled labels and per-row post ids into CatBoost pools;
# the post id is passed as the group id of each row.
Train_Pool = Pool(data=X_train_with_logits, label=y_train, group_id=posts_id_train)

Val_Pool = Pool(data=X_val_with_logits, label=y_val, group_id=posts_id_val)

In [83]:
# Settings handed to CatBoostRanker: RMSE is the optimised loss, NDCG is
# tracked as an additional metric, and training runs on the GPU.
default_parameters = dict(
    iterations=2000,
    custom_metric=['NDCG'],
    verbose=True,
    random_seed=42,
    task_type="GPU",
    devices='0:1',
    learning_rate=0.08,
    early_stopping_rounds=20,
    loss_function='RMSE',
    train_dir='RMSE',
)


In [84]:
# Build the ranker from `default_parameters` and train it, evaluating on the
# validation pool.
# NOTE(review): `model` previously referred to the SentenceTransformer encoder;
# after this cell it refers to the ranker. `verbose=True` is also already set
# in `default_parameters`.
model = CatBoostRanker(**default_parameters)
model.fit(Train_Pool, eval_set=Val_Pool,verbose=True)

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.3506969	test: 0.3506565	best: 0.3506565 (0)	total: 96.3ms	remaining: 3m 12s
1:	total: 223ms	remaining: 3m 42s
2:	total: 311ms	remaining: 3m 27s
3:	total: 395ms	remaining: 3m 17s
4:	total: 445ms	remaining: 2m 57s
5:	learn: 0.3415734	test: 0.3413845	best: 0.3413845 (5)	total: 511ms	remaining: 2m 49s
6:	total: 614ms	remaining: 2m 54s
7:	total: 703ms	remaining: 2m 55s
8:	total: 748ms	remaining: 2m 45s
9:	total: 831ms	remaining: 2m 45s
10:	learn: 0.3372217	test: 0.3369341	best: 0.3369341 (10)	total: 930ms	remaining: 2m 48s
11:	total: 990ms	remaining: 2m 44s
12:	total: 1.07s	remaining: 2m 44s
13:	total: 1.13s	remaining: 2m 40s
14:	total: 1.18s	remaining: 2m 36s
15:	learn: 0.3351252	test: 0.3347953	best: 0.3347953 (15)	total: 1.28s	remaining: 2m 38s
16:	total: 1.37s	remaining: 2m 39s
17:	total: 1.44s	remaining: 2m 39s
18:	total: 1.54s	remaining: 2m 40s
19:	total: 1.61s	remaining: 2m 39s
20:	learn: 0.3340849	test: 0.3337432	best: 0.3337432 (20)	total: 1.68s	remaining: 2m 37s
21:	to

<catboost.core.CatBoostRanker at 0x7fd8b8756690>

In [86]:
# Feature column indices ordered from the lowest to the highest importance.
np.argsort(model.feature_importances_)

array([105, 273, 269,  38,  39, 159, 259, 160,  49, 110, 241, 237, 163,
        57, 227, 215,  67, 191, 188, 186,  32,  85, 183, 135, 390, 117,
       126,  96, 338, 136, 142,  91, 313,  16, 138, 392, 311, 154, 372,
       236, 344, 173,   8, 280, 290, 253, 233,  30, 151, 152, 257, 224,
        25, 379, 162, 171,  44, 130,  41, 314, 184, 297, 293, 281, 389,
       114, 187,  22, 264,  65, 267, 203, 204, 258, 180,  53, 393, 278,
       137, 249, 101, 354, 145, 321, 363, 309,  37, 340, 276, 291, 327,
        42, 235,  94,  47, 222, 206, 322,  10,   6, 246, 373,  66, 128,
       196, 268,  80, 298, 315, 189, 335, 265, 106, 294, 279,   7, 202,
       122, 229,   9, 355, 120,  98, 248, 384, 164,  18, 356,  23, 300,
        20, 208, 295,  45, 143, 205, 148, 347, 103,  52,  21, 210, 108,
        56,  43,  19, 350, 375, 378, 172, 318, 231, 157, 192,  15,   2,
       243, 127, 365, 252, 131, 262, 211,  78, 251, 100, 234,  74,  54,
       386,  87, 299, 385, 134, 168,  81, 245, 250, 177,  13, 23

In [85]:
# Best metric values reported by the trained ranker.
model.get_best_score()

{'learn': {'RMSE': 0.32964294916335063},
 'validation': {'NDCG:type=Base': 0.8843397027844371,
  'RMSE': 0.33175650973140625}}