In [2]:
import numpy as np 
import pandas as pd
from catboost import CatBoostRanker, Pool
from torch import cuda 
import torch

In [3]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Load the preprocessed training posts and tag every post row with a sequential
# integer id (0..n-1); that id is used further down as the ranking group id.
raw_data = (
    pd.read_json('/kaggle/input/vc-it-cup-ranking/train_preprocessed.json')
    .assign(post_id=lambda frame: np.arange(len(frame)))
)

In [5]:
# Preview the first rows: each post row carries a list of comments and a parallel list of scores.
raw_data.head()

Unnamed: 0,comments,score,posts,post_id
0,[go back school ident give founder go back sch...,"[0, 1, 2, 3, 4]",mani summer combin funde decid continu startup...,0
1,[curious head long run cbs tear fit imag tri e...,"[0, 1, 2, 3, 4]",cbs acquir lastfm 280m,1
2,[hate fals claim fiduciari respons public comp...,"[0, 1, 2, 3, 4]",costco becam antiwalmart,2
3,[real point simpl power order sound edgi subve...,"[0, 1, 2, 3, 4]",fortun favor big turd screw money art,3
4,"[look someon hasnt read mythic manmonth, chanc...","[0, 1, 2, 3, 4]",startupweekend 70 founder creat one compani we...,4


In [6]:
# Flatten to one row per (post, comment) pair by exploding the parallel
# `comments` and `score` list columns together. The original post index is kept,
# so index labels repeat once per comment.
train_df = raw_data.explode(['comments', 'score'])

In [7]:
# Sanity-check the exploded frame: one comment and one score per row.
train_df.head()

Unnamed: 0,comments,score,posts,post_id
0,go back school ident give founder go back scho...,0,mani summer combin funde decid continu startup...,0
0,invari success set fall back origin path that ...,1,mani summer combin funde decid continu startup...,0
0,school connect go real world enter school thin...,2,mani summer combin funde decid continu startup...,0
0,guess depend hungri believ product im 24 still...,3,mani summer combin funde decid continu startup...,0
0,pollground decid go back school get combin fund,4,mani summer combin funde decid continu startup...,0


In [8]:
# Split the exploded frame into its three roles:
#   posts_id_train - group key (which post a comment belongs to)
#   y_train        - target score of each comment
#   X_train        - remaining columns used to build features
posts_id_train = train_df['post_id']
y_train = train_df['score']
X_train = train_df.drop(columns=['score', 'post_id'])

In [9]:
# Text fed to the embedding model: the post text followed by the comment, separated by a space.
X_train["text"] = X_train['posts'] + " " + X_train['comments']

In [10]:
# Comment length feature: number of whitespace-separated tokens in each comment.
X_train['sent_length'] = X_train['comments'].str.split().str.len()

# Получаем эмбеддинги

In [15]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=16d39ceff5fe9f3ea5ad00e11d8e7dcb2c29215dfad468bc48a96532cc2b26d7
  Stored in directory: /root/.cache/pip/wheels/83/71/2b/40d17d21937fed496fb99145227eca8f20b4891240ff60c86f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0mNote: you may need to restart the kernel to use updated packages.


In [16]:
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" sentence-embedding model.
# NOTE(review): the name `model` is rebound to the CatBoost ranker later in the notebook.
model = SentenceTransformer("all-MiniLM-L6-v2")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [17]:
from torch.utils.data import DataLoader
from tqdm import tqdm 

In [18]:
# Iterate over the raw text strings in batches of 1000 for encoding.
train_dl = DataLoader(X_train['text'].values, batch_size=1000)

In [20]:
# Move the sentence-embedding model onto the selected device ('cuda' or 'cpu').
model = model.to(device)

In [21]:
# Encode every text batch and keep the per-batch embedding arrays in order.
# tqdm provides the outer progress bar, so the encoder's own bar is switched off.
res = [
    model.encode(text_batch, show_progress_bar=False)
    for text_batch in tqdm(train_dl)
]

100%|██████████| 441/441 [08:40<00:00,  1.18s/it]


In [24]:
# Stack the per-batch embeddings into a single array (rows follow X_train order).
train_emb = np.concatenate(res)

In [27]:
# Cache the embeddings on disk; encoding the training set took several minutes.
np.save('train_emb.npy', train_emb)

In [11]:
# Reload the cached embeddings instead of re-encoding.
# NOTE(review): the execution counter restarts at this cell, presumably after a
# kernel restart - the cells that define X_train / y_train must be re-run first.
train_emb = np.load('train_emb.npy')

In [12]:
# Scale the labels so the largest score becomes 1.
# The labels are rebuilt from train_df as float64 instead of being divided in
# place: the exploded `score` column is object-dtype, and an in-place `/=` on a
# Series taken from train_df may also alter train_df itself. Building a fresh
# Series keeps train_df untouched and makes this cell safe to re-run.
scores = train_df['score'].astype('float64')
max_target = scores.max()
y_train = scores / max_target

In [13]:
# Base feature matrix: comment length as a single column, followed by the embedding columns.
X_train_features = np.hstack([X_train['sent_length'].values[:, np.newaxis], train_emb])

In [14]:
# Expect (number of comments, 1 + embedding width).
X_train_features.shape

(440535, 385)

# Добавляем фичи сентимента

In [15]:
# Precomputed sentiment labels: one row per post, one column per comment slot (0..4).
# The CSV's saved index column ('Unnamed: 0') is dropped.
# NOTE(review): assumes rows are in the same order as raw_data - confirm.
train_sent_df = pd.read_csv('/kaggle/input/vc-it-cup-ranking/train_sent_df.csv').drop('Unnamed: 0', axis=1)
train_sent_df.head()

Unnamed: 0,0,1,2,3,4
0,Irrelevant,Irrelevant,Irrelevant,Positive,Neutral
1,Positive,Negative,Positive,Negative,Irrelevant
2,Negative,Neutral,Negative,Irrelevant,Irrelevant
3,Irrelevant,Neutral,Negative,Irrelevant,Positive
4,Neutral,Irrelevant,Negative,Neutral,Irrelevant


In [16]:
# Pack each post's row of sentiment labels into one list cell (column 0),
# ready to be exploded into one label per comment.
train_sent_exp = pd.DataFrame({0: train_sent_df.values.tolist()})

In [17]:
# One sentiment label per row, in the same post-by-post order as the exploded comments.
train_sent_exp = train_sent_exp.explode(0)


In [18]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the sentiment label; the fitted encoder is reused for the test set.
# fit_transform returns a sparse matrix here (it is densified in a later cell).
encoder = OneHotEncoder()
train_sent_enc = encoder.fit_transform(train_sent_exp)

In [19]:
# Final training matrix: [sentiment one-hot | comment length | embeddings].
# Built from the source pieces rather than from the previous X_train_features,
# so re-running this cell cannot prepend the sentiment columns a second time.
# toarray() yields a plain ndarray (todense() would give the legacy np.matrix).
X_train_features = np.hstack([
    train_sent_enc.toarray(),
    X_train['sent_length'].values[:, np.newaxis],
    train_emb,
])

In [20]:
# Ensure a plain ndarray (hstack with a dense sparse-matrix result can yield np.matrix).
X_train_features = np.array(X_train_features)

In [21]:
# Column count should have grown by the number of sentiment categories.
X_train_features.shape

(440535, 389)

In [22]:
# Persist the assembled training feature matrix.
np.save('X_train_features.npy', X_train_features)

# Обучение модели

In [27]:
# CatBoost training pool: features, scaled labels, and the post id of each row as its group.
Train_Pool = Pool(data=X_train_features, label=y_train, group_id=posts_id_train)

In [30]:
# Training configuration passed to CatBoostRanker: 1500 boosting iterations on
# GPU (device range '0:1'), RMSE objective, fixed seed for reproducibility.
default_parameters = dict(
    iterations=1500,
    verbose=True,
    random_seed=42,
    task_type="GPU",
    devices='0:1',
    learning_rate=0.08,
    loss_function='RMSE',
    train_dir='RMSE',
)


In [31]:
# Fit the ranker on the training pool.
# NOTE(review): this rebinds `model`, which earlier held the SentenceTransformer.
model = CatBoostRanker(**default_parameters)
model.fit(Train_Pool, verbose=True)

0:	learn: 0.3506837	total: 86.2ms	remaining: 2m 9s
1:	learn: 0.3482347	total: 182ms	remaining: 2m 15s
2:	learn: 0.3461713	total: 225ms	remaining: 1m 52s
3:	learn: 0.3443561	total: 294ms	remaining: 1m 49s
4:	learn: 0.3427726	total: 358ms	remaining: 1m 47s
5:	learn: 0.3414578	total: 407ms	remaining: 1m 41s
6:	learn: 0.3402966	total: 478ms	remaining: 1m 41s
7:	learn: 0.3392947	total: 548ms	remaining: 1m 42s
8:	learn: 0.3384321	total: 617ms	remaining: 1m 42s
9:	learn: 0.3377159	total: 682ms	remaining: 1m 41s
10:	learn: 0.3370970	total: 741ms	remaining: 1m 40s
11:	learn: 0.3365542	total: 807ms	remaining: 1m 40s
12:	learn: 0.3360846	total: 861ms	remaining: 1m 38s
13:	learn: 0.3356699	total: 935ms	remaining: 1m 39s
14:	learn: 0.3353175	total: 1.02s	remaining: 1m 40s
15:	learn: 0.3350011	total: 1.07s	remaining: 1m 39s
16:	learn: 0.3347406	total: 1.15s	remaining: 1m 40s
17:	learn: 0.3345079	total: 1.23s	remaining: 1m 41s
18:	learn: 0.3342907	total: 1.29s	remaining: 1m 40s
19:	learn: 0.3341124	t

<catboost.core.CatBoostRanker at 0x7fd2c5643f10>

In [33]:
# Save the fitted ranker to disk.
model.save_model('final_model.cbm')

# Получение тест сета

In [35]:
# Load the preprocessed test posts and give each post row a sequential integer id,
# mirroring what was done for the training data.
raw_test = (
    pd.read_json('/kaggle/input/vc-it-cup-ranking/test_preprocessed.json')
    .assign(post_id=lambda frame: np.arange(len(frame)))
)

In [36]:
# One row per (post, comment); the test set has no score column to explode.
test_df = raw_test.explode(['comments'])

In [37]:
# Separate the group key from the columns used to build test features.
posts_id_test = test_df['post_id']
X_test = test_df.drop(columns=['post_id'])

In [38]:
# Same text construction as for training: post text, a space, then the comment.
X_test["text"] = X_test['posts'] + " " + X_test['comments']

In [39]:
# Comment length feature: number of whitespace-separated tokens in each comment.
X_test['sent_length'] = X_test['comments'].str.split().str.len()

In [43]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=3f614a0aa55b0b18cc311ef5cb507001e9c42565c20f6b52cf982332812bd1c3
  Stored in directory: /root/.cache/pip/wheels/83/71/2b/40d17d21937fed496fb99145227eca8f20b4891240ff60c86f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [44]:
# Same pretrained embedding model that was used for the training texts.
model_for_embeddings = SentenceTransformer("all-MiniLM-L6-v2")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [46]:
from torch.utils.data import DataLoader


# Iterate over the test text strings in batches of 1000 for encoding.
test_dl = DataLoader(X_test['text'].values, batch_size=1000)

In [48]:
# Move the embedding model onto the selected device ('cuda' or 'cpu').
model_for_embeddings = model_for_embeddings.to(device)

In [50]:
from tqdm import tqdm 

# Encode every test batch and keep the per-batch embedding arrays in order.
# tqdm provides the outer progress bar, so the encoder's own bar is switched off.
res = [
    model_for_embeddings.encode(text_batch, show_progress_bar=False)
    for text_batch in tqdm(test_dl)
]

100%|██████████| 71/71 [01:42<00:00,  1.45s/it]


In [51]:
# Stack the per-batch embeddings into a single array (rows follow X_test order).
test_emb = np.concatenate(res)

In [52]:
# Cache the test embeddings on disk.
np.save('test_emb.npy', test_emb)

In [53]:
# Base test matrix with the same layout as training: comment length, then embeddings.
X_test_features = np.hstack([X_test['sent_length'].values[:, np.newaxis], test_emb])

In [54]:
# Expect (number of test comments, 1 + embedding width).
X_test_features.shape

(70020, 385)

# Добавим анализ сентимента в test set


In [55]:
# Precomputed sentiment labels for the test set: one row per post, one column per comment slot.
# The CSV's saved index column ('Unnamed: 0') is dropped.
# NOTE(review): assumes rows are in the same order as raw_test - confirm.
test_sent_df = pd.read_csv('/kaggle/input/vc-it-cup-ranking/test_sent_df.csv').drop('Unnamed: 0', axis=1)
test_sent_df.head()

Unnamed: 0,0,1,2,3,4
0,Positive,Positive,Positive,Negative,Negative
1,Irrelevant,Negative,Neutral,Irrelevant,Positive
2,Neutral,Negative,Negative,Neutral,Neutral
3,Neutral,Neutral,Neutral,Neutral,Positive
4,Neutral,Neutral,Neutral,Negative,Irrelevant


In [62]:
# Pack each post's row of sentiment labels into one list cell (column 0),
# ready to be exploded into one label per comment.
test_sent_exp = pd.DataFrame({0: test_sent_df.values.tolist()})

In [63]:
# One sentiment label per row, in the same post-by-post order as the exploded comments.
test_sent_exp = test_sent_exp.explode(0)


In [64]:
# Reuse the encoder fitted on the training sentiments so the one-hot columns line up.
test_sent_enc = encoder.transform(test_sent_exp)

In [65]:
# Row counts must match before the two blocks are stacked side by side.
test_sent_enc.shape, X_test_features.shape

((70020, 4), (70020, 385))

In [66]:
# Final test matrix: [sentiment one-hot | comment length | embeddings].
# Built from the source pieces rather than from the previous X_test_features,
# so re-running this cell cannot prepend the sentiment columns a second time.
# toarray() yields a plain ndarray (todense() would give the legacy np.matrix).
X_test_features = np.hstack([
    test_sent_enc.toarray(),
    X_test['sent_length'].values[:, np.newaxis],
    test_emb,
])

In [67]:
# Ensure a plain ndarray (hstack with a dense sparse-matrix result can yield np.matrix).
X_test_features = np.array(X_test_features)

In [69]:
# Column count must equal that of the training feature matrix.
X_test_features.shape

(70020, 389)

In [70]:
# Persist the assembled test feature matrix.
np.save('X_test_features.npy', X_test_features)

In [71]:
# CatBoost inference pool: test features plus the post id of each row as its group (no labels).
Test_Pool = Pool(data=X_test_features, group_id=posts_id_test)

In [72]:
# Score every test comment; `model` is the CatBoostRanker fitted above.
preds = model.predict(Test_Pool)

In [76]:
# Attach predictions positionally; the feature rows were built in test_df row order.
test_df['preds'] = preds

In [78]:
# Spot-check the predicted scores next to their comments.
test_df.head()

Unnamed: 0,comments,posts,post_id,preds
0,ix27m still wait stabil wifi ipad sith io 8 qu...,io 801 releas broken iphon 6 model withdrawn,0,0.63683
0,upgrad need restor optionclick quotupdatequot ...,io 801 releas broken iphon 6 model withdrawn,0,0.54977
0,upgrad short releas suffer consequ abl restor ...,io 801 releas broken iphon 6 model withdrawn,0,0.574822
0,lot pressur healthkit front big flagship io 8 ...,io 801 releas broken iphon 6 model withdrawn,0,0.564362
0,fix alreadi updat httpx2fx2fwwwimorecomx2fios8...,io 801 releas broken iphon 6 model withdrawn,0,0.584238


In [80]:
# Collapse back to one row per post. The exploded frame kept the original post
# index, so grouping on it gathers each post's predictions and comments into lists.
test_result = test_df.groupby(test_df.index).agg({'preds': list, 'comments': list})

In [81]:
# Display the per-post result (pandas truncates the middle rows).
test_result

Unnamed: 0,preds,comments
0,"[0.6368296105108406, 0.5497702587149895, 0.574...",[ix27m still wait stabil wifi ipad sith io 8 q...
1,"[0.6718332744434274, 0.30088086976545014, 0.40...",[employ itx27 better cheaper marketyou allow s...
2,"[0.6162384786570669, 0.38294022925767024, 0.57...",[donx27t understand drug develop public money ...
3,"[0.39678334236116797, 0.2785829999441134, 0.51...",[ix27m physicist imagin excit news excit possi...
4,"[0.6353928840587724, 0.5333344471635506, 0.520...",[someon doesnx27t io develop boggl mind guy bu...
...,...,...
13999,"[0.47446891936365887, 0.6167158743832744, 0.71...",[meanwhil us stubb mayor town alaska 18 yearsh...
14000,"[0.266371367756733, 0.46466024318624477, 0.387...",[radic idea mayb model intellectu properti wro...
14001,"[0.6854467751612496, 0.6150058946210777, 0.652...",[present indepth summari ix27d love read hear ...
14002,"[0.5610288868900568, 0.6786785216049509, 0.611...",[ok want quothearquot trippi neural network th...


In [83]:
# Attach each post's text, aligned on the post index.
# The posts must come from raw_test: the previous code read them from raw_data
# (the training frame), which paired test predictions with unrelated training posts.
test_result['posts'] = raw_test['posts']

In [85]:
# Write the per-post predictions, comments and post text to disk.
test_result.to_csv('test_result.csv')