In [84]:
import re
import json
import yaml
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from gensim.models import Word2Vec
from collections import defaultdict
from catboost import CatBoostRegressor, Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error as MSE, r2_score 

In [85]:
df = pq.read_table(source='D:/mashob/data/lemmatized_texts.parquet').to_pandas()

In [86]:
df.head(3)

Unnamed: 0,id,title,text_markdown,timestamp,author_id,username,rating,pluses,minuses,url,tags
7,2936217,рассада,аня волос нос расти петя аня мочь жопа рассада...,1419527068,453525,Misochkin,-9,5,14,https://pikabu.ru/story/rassada_2936217,"[Петр, Анна, Рассада, Волосы, Текст, Переписка..."
11,6991412,начальник обед,недавно родственница жаловаться дурдом работа ...,1571215335,1699826,FluffyMagpie,641,681,40,https://pikabu.ru/story/pro_nachalnikov_i_obed...,"[Начальник, Руководитель, Обед, Еда, Сотрудник..."
15,6991359,помогать пожалуйста поиск игра,добрый время сутки господин дама подсказывать ...,1571214218,878346,Keeeesha,-8,2,10,https://pikabu.ru/story/pomogite_pozhaluysta_p...,"[Игры, Мобильное приложение, Гонки, Без рейтин..."


## Create function for calculate wilson score for rating

In [87]:
def wilson_score(pluses: int, minuses: int, eps: float = 1e-7) -> float:
    n = pluses + minuses
    if n == 0:
        return 0
    p = pluses / (n + eps)
    return _wilson_score(p, n)

def _wilson_score(p: int, n: int) -> float:
    q = 1.96
    return (p + q**2 / (2*n) - q*np.sqrt((p*(1 - p) + q**2/(4*n)) / n)) / (1 + q**2/n)

In [88]:
df['rating'] = df.apply(lambda row: wilson_score(row['pluses'], row['minuses']), axis=1)

In [89]:
with open('D:/mashob/data/indexes.json', 'r') as idx_file:   
    idx_data = json.load(idx_file) 

## Read file with model`s configuration

In [276]:
with open("D:/mashob/data/finally/configs/text.yaml", "r") as f:
    config_for_text_data = yaml.safe_load(f)

with open("D:/mashob/data/finally/configs/embeddings.yaml", "r") as f:
    config_for_embedding_data = yaml.safe_load(f)

with open("D:/mashob/data/finally/configs/rubert_embeddings.yaml", "r") as f:
    config_for_rubert_data = yaml.safe_load(f)

with open("D:/mashob/data/finally/configs/rubert_tun_embeddings.yaml", "r") as f:
    config_for_rubert_tun_data = yaml.safe_load(f)

In [90]:
train_indexes = idx_data['train']
val_indexes = idx_data['val']
test_indexes = idx_data['test']

train_data = df[df['id'].isin(train_indexes)]
val_data = df[df['id'].isin(val_indexes)]
test_data = df[df['id'].isin(test_indexes)]

In [91]:
len(train_data) + len(val_data) + len(test_data) == len(df)

True

In [92]:
X_train_text = train_data.text_markdown.values
X_val_text = val_data.text_markdown.values
X_test_text = test_data.text_markdown.values
y_train = train_data.rating.values
y_val = val_data.rating.values
y_test = test_data.rating.values

## Catboost on a text features

In [168]:
def calc_metrics(model, X, Y):
    y_pred = model.predict(X)
    rmse = MSE(Y, y_pred)**0.5
    r2 = r2_score(Y, y_pred)
    print("RMSE: %.7f" % rmse)
    print("R^2: %.7f" % r2)
    return rmse, r2

In [156]:
catboost_model = CatBoostRegressor(iterations=30000,
                          learning_rate=0.03,
                          random_state=42,
                          depth=4,
                          bootstrap_type='Bayesian',
                          bagging_temperature=10,
                          l2_leaf_reg=14)

In [157]:
catboost_model.fit(X_train_text, y_train, eval_set=(X_val_text, y_val), text_features=[0], verbose=100)

0:	learn: 0.2664447	test: 0.2706870	best: 0.2706870 (0)	total: 46.2ms	remaining: 23m 4s
100:	learn: 0.2536942	test: 0.2592273	best: 0.2592273 (100)	total: 4.83s	remaining: 23m 49s
200:	learn: 0.2508607	test: 0.2568988	best: 0.2568988 (200)	total: 9.45s	remaining: 23m 21s
300:	learn: 0.2490778	test: 0.2553210	best: 0.2553210 (300)	total: 14.1s	remaining: 23m 11s
400:	learn: 0.2476774	test: 0.2540544	best: 0.2540544 (400)	total: 18.8s	remaining: 23m 9s
500:	learn: 0.2465739	test: 0.2531460	best: 0.2531460 (500)	total: 23.5s	remaining: 23m 5s
600:	learn: 0.2457023	test: 0.2523834	best: 0.2523834 (600)	total: 28.4s	remaining: 23m 6s
700:	learn: 0.2448567	test: 0.2516633	best: 0.2516633 (700)	total: 33.1s	remaining: 23m 1s
800:	learn: 0.2441148	test: 0.2509719	best: 0.2509719 (800)	total: 37.8s	remaining: 22m 58s
900:	learn: 0.2434770	test: 0.2504444	best: 0.2504444 (900)	total: 42.5s	remaining: 22m 51s
1000:	learn: 0.2428766	test: 0.2499523	best: 0.2499523 (1000)	total: 47.3s	remaining: 22

<catboost.core.CatBoostRegressor at 0x1c8ead06230>

In [173]:
catboost_model.save_model('D:/mashob/data/finally/models/catboost_text.cbm')
catboost_new = CatBoostRegressor()

catboost_new.load_model('D:/mashob/data/finally/models/catboost_text.cbm')

<catboost.core.CatBoostRegressor at 0x1c8e86f2fb0>

In [169]:
pool_test = Pool(
    data=X_test_text,
    text_features=[0]
)

In [174]:
rmse_catboost_text, r2_catboost_text = calc_metrics(catboost_new, pool_test, y_test)

RMSE: 0.2353024
R^2: 0.2137780


## Catboost on embeddings: tfidf

In [175]:
v = TfidfVectorizer(norm=None, max_df=0.8, max_features=500, decode_error='replace')
X_train_vector = v.fit_transform(X_train_text)
X_val_vector = v.transform(X_val_text)
X_test_vector = v.transform(X_test_text)

In [176]:
with open("D:/mashob/data/finally/configs/embeddings.yaml", "r") as f:
    config = yaml.safe_load(f)

In [177]:
catboost_model = CatBoostRegressor(**config)

In [178]:
catboost_model.fit(X_train_vector, y_train, eval_set=(X_val_vector, y_val), verbose=1000)

0:	learn: 0.2663754	test: 0.2706432	best: 0.2706432 (0)	total: 81.1ms	remaining: 20m 16s
1000:	learn: 0.2372125	test: 0.2482096	best: 0.2482096 (1000)	total: 39.4s	remaining: 9m 10s
2000:	learn: 0.2308817	test: 0.2460919	best: 0.2460878 (1999)	total: 1m 26s	remaining: 9m 21s
3000:	learn: 0.2263043	test: 0.2454133	best: 0.2454133 (3000)	total: 2m 8s	remaining: 8m 33s
4000:	learn: 0.2224425	test: 0.2449212	best: 0.2449177 (3995)	total: 2m 54s	remaining: 8m
5000:	learn: 0.2191260	test: 0.2446313	best: 0.2446242 (4991)	total: 3m 37s	remaining: 7m 14s
6000:	learn: 0.2161220	test: 0.2444162	best: 0.2444162 (6000)	total: 4m 24s	remaining: 6m 36s
7000:	learn: 0.2134660	test: 0.2443732	best: 0.2443526 (6840)	total: 5m 4s	remaining: 5m 48s
8000:	learn: 0.2109797	test: 0.2442563	best: 0.2442548 (7998)	total: 5m 44s	remaining: 5m 1s
9000:	learn: 0.2086868	test: 0.2442050	best: 0.2441905 (8655)	total: 6m 24s	remaining: 4m 16s
10000:	learn: 0.2065675	test: 0.2441855	best: 0.2441736 (9183)	total: 7m 

<catboost.core.CatBoostRegressor at 0x1c8eabe60e0>

In [179]:
catboost_model.save_model('D:/mashob/data/finally/models/catboost_tfidf.cbm')
catboost_tfidf = CatBoostRegressor()

catboost_tfidf.load_model('D:/mashob/data/finally/models/catboost_tfidf.cbm')

<catboost.core.CatBoostRegressor at 0x1c8eacd2470>

In [180]:
rmse_catboost_tfidf, r2_catboost_tfidf = calc_metrics(catboost_tfidf, X_test_vector, y_test)

RMSE: 0.2376420
R^2: 0.1980659


## Catboost on embeddings: w2v + tfidf

In [182]:
w2v_model = Word2Vec(min_count=10, window=2, vector_size=300, negative=10, alpha=0.03, min_alpha=0.0007, sample=6e-5, sg=1)

In [183]:
with open("D:/mashob/data/finally/stopwords_ru.txt", "r") as file:
    stop_words = [line.strip() for line in file.readlines()] + ['тот', 'который', 'которая']

In [184]:
def get_vocab(text: pd.core.series.Series) -> list:
    words = re.findall(r'\w+', text.lower())
    filtered_words = [word for word in words if not re.match(r'^[a-zA-Z]+$', word) and word not in stop_words]
    return filtered_words

In [185]:
train_words = train_data.text_markdown.apply(get_vocab)
val_words = val_data.text_markdown.apply(get_vocab)
test_words = test_data.text_markdown.apply(get_vocab)

In [186]:
train_words

7        [аня, волос, нос, расти, петя, аня, жопа, расс...
11       [родственница, жаловаться, дурдом, трудиться, ...
15       [добрый, сутки, господин, дама, подсказывать, ...
23       [негр, ходить, больница, читать, рэп, едит, ин...
37       [ехать, довезти, девчонка, школа, оставаться, ...
                               ...                        
22982                          [неужели, плохо, дискотека]
22983    [ставить, копейка, обида, школьный, учитель, т...
22986    [слушать, история, клиент, магазин, модный, од...
22987    [интернет, проходить, конкурс, машинный, обуче...
22998    [слышать, мероприятие, название, одесса, прохо...
Name: text_markdown, Length: 55079, dtype: object

In [187]:
w2v_model.build_vocab(train_words)

In [188]:
w2v_model.train(train_words, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(116124293, 158063610)

In [189]:
w2v_model.wv.most_similar(positive=["политика"])

[('идеология', 0.4198024272918701),
 ('государство', 0.41625142097473145),
 ('фискальный', 0.4047982096672058),
 ('украина', 0.39585280418395996),
 ('еврейство', 0.3953686058521271),
 ('экономика', 0.3941681385040283),
 ('президент', 0.38690540194511414),
 ('антиконституционный', 0.38510265946388245),
 ('верховенство', 0.3723835051059723),
 ('федеративный', 0.3719206154346466)]

In [190]:
w2v = dict(zip(w2v_model.wv.index_to_key, w2v_model.wv))

In [191]:
class tfidf_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [192]:
train_w2v = tfidf_vectorizer(w2v).fit(train_words).transform(train_words)
val_w2v = tfidf_vectorizer(w2v).fit(val_words).transform(val_words)
test_w2v = tfidf_vectorizer(w2v).fit(test_words).transform(test_words)

In [193]:
train_w2v.shape[0] + val_w2v.shape[0] + test_w2v.shape[0] == len(df.text_markdown)

True

In [195]:
catboost_model = CatBoostRegressor(**config_for_embedding_data)

In [196]:
catboost_model.fit(train_w2v, y_train, eval_set=(val_w2v, y_val), verbose=1000)

0:	learn: 0.2665078	test: 0.2707271	best: 0.2707271 (0)	total: 112ms	remaining: 27m 53s
1000:	learn: 0.2315876	test: 0.2428986	best: 0.2428964 (999)	total: 1m 8s	remaining: 16m 3s
2000:	learn: 0.2234011	test: 0.2413181	best: 0.2413116 (1999)	total: 2m 31s	remaining: 16m 26s
3000:	learn: 0.2163662	test: 0.2403735	best: 0.2403735 (3000)	total: 3m 49s	remaining: 15m 18s
4000:	learn: 0.2100742	test: 0.2398238	best: 0.2398213 (3999)	total: 5m 5s	remaining: 13m 59s
5000:	learn: 0.2041504	test: 0.2394083	best: 0.2393978 (4990)	total: 6m 19s	remaining: 12m 38s
6000:	learn: 0.1986777	test: 0.2391656	best: 0.2391549 (5945)	total: 7m 30s	remaining: 11m 15s
7000:	learn: 0.1934250	test: 0.2390240	best: 0.2389988 (6960)	total: 8m 42s	remaining: 9m 56s
8000:	learn: 0.1884328	test: 0.2389584	best: 0.2389289 (7683)	total: 9m 52s	remaining: 8m 38s
9000:	learn: 0.1836621	test: 0.2389389	best: 0.2389233 (8437)	total: 11m 2s	remaining: 7m 21s
10000:	learn: 0.1791326	test: 0.2388998	best: 0.2388722 (9873)	t

<catboost.core.CatBoostRegressor at 0x1c8ead04b80>

In [199]:
catboost_model.save_model('D:/mashob/data/finally/models/catboost_w2v_tfidf.cbm')
catboost_w2v_tfidf = CatBoostRegressor()

catboost_w2v_tfidf.load_model('D:/mashob/data/finally/models/catboost_w2v_tfidf.cbm')

<catboost.core.CatBoostRegressor at 0x1c8ead040a0>

In [200]:
rmse_catboost_w2v_tfidf, r2_catboost_w2v_tfidf = calc_metrics(catboost_w2v_tfidf, test_w2v, y_test)

RMSE: 0.2342041
R^2: 0.2211007


# Catboost on embeddings: rubert-tiny

### We have datasets: rubert, tuned rubert and for each of them two more datasets with a title added to the text.

1. Split id on train, val and test from file

In [246]:
with open('D:/mashob/data/finally/indexes.json', 'r') as idx_file:   
    idx_data = json.load(idx_file) 

train_indexes = idx_data['train']
val_indexes = idx_data['val']
test_indexes = idx_data['test']

In [247]:
def create_matrix_embedding(df: pd.DataFrame):
    embedding = np.vstack(df['embedding'].values)
    return embedding

2. Create function for prepare data. Add to data the calculated rating by wilson score from original datasets. <br> Split data on train, val and test by id. Create matrix embedding and return ready data.

In [248]:
def prepare_data(name_file):

    df_rubert = pq.read_table(source=f'D:/mashob/data/finally/{name_file}.parquet').to_pandas()
    print(df_rubert.head(3))

    df_rubert = df_rubert.merge(df[['id', 'rating']], on='id', how='inner')

    train_data_rubert = df_rubert[df_rubert['id'].isin(train_indexes)]
    val_data_rubert = df_rubert[df_rubert['id'].isin(val_indexes)]
    test_data_rubert = df_rubert[df_rubert['id'].isin(test_indexes)]

    if len(train_data_rubert) + len(val_data_rubert) + len(test_data_rubert) == len(df_rubert):

        X_train = create_matrix_embedding(train_data_rubert)
        X_val = create_matrix_embedding(val_data_rubert)
        X_test = create_matrix_embedding(test_data_rubert)

        y_train = train_data_rubert.rating.values
        y_val = val_data_rubert.rating.values
        y_test = test_data_rubert.rating.values
        
    return X_train, X_val, X_test, y_train, y_val, y_test

3. Use function for prepare data

In [249]:
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data('texts')

        id                                          embedding
0  2936217  [0.3411005, -0.16877297, -0.3599054, 0.0115052...
1  6991412  [0.3696494, 0.06409113, -0.62138826, -0.890618...
2  6991359  [0.3278318, 0.08586374, -0.7452521, -0.2529353...


4. Train model

In [229]:
catboost_model = CatBoostRegressor(**config_for_rubert_data)
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=1000)

0:	learn: 0.2662915	test: 0.2705101	best: 0.2705101 (0)	total: 206ms	remaining: 44m 35s
1000:	learn: 0.2281858	test: 0.2385403	best: 0.2385403 (1000)	total: 1m 11s	remaining: 14m 11s
2000:	learn: 0.2200993	test: 0.2361675	best: 0.2361674 (1999)	total: 2m 17s	remaining: 12m 33s
3000:	learn: 0.2134371	test: 0.2350909	best: 0.2350909 (3000)	total: 3m 24s	remaining: 11m 22s
4000:	learn: 0.2073882	test: 0.2344464	best: 0.2344433 (3998)	total: 4m 29s	remaining: 10m 7s
5000:	learn: 0.2016515	test: 0.2339888	best: 0.2339789 (4978)	total: 5m 31s	remaining: 8m 50s
6000:	learn: 0.1963580	test: 0.2337940	best: 0.2337940 (6000)	total: 6m 32s	remaining: 7m 38s
7000:	learn: 0.1913657	test: 0.2336992	best: 0.2336713 (6917)	total: 7m 33s	remaining: 6m 28s
8000:	learn: 0.1865756	test: 0.2336528	best: 0.2336234 (7840)	total: 8m 35s	remaining: 5m 22s
9000:	learn: 0.1820150	test: 0.2335520	best: 0.2335441 (8746)	total: 9m 39s	remaining: 4m 17s
10000:	learn: 0.1776425	test: 0.2334987	best: 0.2334979 (9998)	

<catboost.core.CatBoostRegressor at 0x1c8eabe6410>

5. Save model

In [293]:
catboost_model.save_model('D:/mashob/data/finally/catboost_rubert.cbm')
catboost_rubert = CatBoostRegressor()

catboost_rubert.load_model('D:/mashob/data/finally/catboost_rubert.cbm')

<catboost.core.CatBoostRegressor at 0x1c8b6e53b20>

6. Calculate RMSE and $R^2$

In [234]:
rmse_catboost_rubert, r2_catboost_rubert = calc_metrics(catboost_rubert, X_test, y_test)

RMSE: 0.2296836
R^2: 0.2508781


7. Use model for tuned rubert with added title to text

In [277]:
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data('tuned_texts')

        id                                          embedding
0  2936217  [0.3378186, -0.19202456, -0.37366503, -0.01451...
1  6991412  [0.3538596, 0.044930458, -0.6553651, -0.905943...
2  6991359  [0.41649202, 0.07350595, -0.55431473, -0.29970...


In [256]:
catboost_model = CatBoostRegressor(**config_for_rubert_tun_data)
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=1000)

0:	learn: 0.2662134	test: 0.2704728	best: 0.2704728 (0)	total: 264ms	remaining: 1h 27m 53s
1000:	learn: 0.2279798	test: 0.2380441	best: 0.2380441 (1000)	total: 1m 6s	remaining: 21m 7s
2000:	learn: 0.2197308	test: 0.2355799	best: 0.2355766 (1999)	total: 2m 8s	remaining: 19m 15s
3000:	learn: 0.2129202	test: 0.2345093	best: 0.2345076 (2997)	total: 3m 18s	remaining: 18m 45s
4000:	learn: 0.2068642	test: 0.2337031	best: 0.2337014 (3999)	total: 4m 27s	remaining: 17m 50s
5000:	learn: 0.2012236	test: 0.2331414	best: 0.2331377 (4994)	total: 5m 38s	remaining: 16m 55s
6000:	learn: 0.1958673	test: 0.2327187	best: 0.2327161 (5997)	total: 6m 45s	remaining: 15m 44s
7000:	learn: 0.1908673	test: 0.2324538	best: 0.2324505 (6979)	total: 7m 50s	remaining: 14m 34s
8000:	learn: 0.1860441	test: 0.2322340	best: 0.2322295 (7986)	total: 8m 55s	remaining: 13m 23s
9000:	learn: 0.1813579	test: 0.2320645	best: 0.2320569 (8949)	total: 10m 4s	remaining: 12m 18s
10000:	learn: 0.1769201	test: 0.2319746	best: 0.2319488 (

<catboost.core.CatBoostRegressor at 0x1c8b6e534c0>

In [281]:
catboost_model.save_model('D:/mashob/data/finally/models/catboost_rubert_tuned_title.cbm')

In [285]:
catboost_rubert_tuned_title = CatBoostRegressor()

catboost_rubert_tuned_title.load_model('D:/mashob/data/finally/models/catboost_rubert_tuned_title.cbm')

<catboost.core.CatBoostRegressor at 0x1c8b6e538e0>

In [286]:
rmse_catboost_rubert_tuned_title, r2_catboost_rubert_tuned_title = calc_metrics(catboost_rubert_tuned_title, X_test, y_test)

RMSE: 0.2293415
R^2: 0.2531083


8. Use model for rubert with added title to text

In [262]:
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data('with_title_texts')

        id                                          embedding
0  2936217  [0.3378186, -0.19202456, -0.37366503, -0.01451...
1  6991412  [0.3538596, 0.044930458, -0.6553651, -0.905943...
2  6991359  [0.41649202, 0.07350595, -0.55431473, -0.29970...


In [263]:
catboost_model = CatBoostRegressor(**config_for_rubert_data)
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=1000)

0:	learn: 0.2662134	test: 0.2704728	best: 0.2704728 (0)	total: 215ms	remaining: 46m 29s
1000:	learn: 0.2279798	test: 0.2380441	best: 0.2380441 (1000)	total: 1m 12s	remaining: 14m 34s
2000:	learn: 0.2197308	test: 0.2355799	best: 0.2355766 (1999)	total: 2m 31s	remaining: 13m 50s
3000:	learn: 0.2129202	test: 0.2345093	best: 0.2345076 (2997)	total: 3m 40s	remaining: 12m 15s
4000:	learn: 0.2068642	test: 0.2337031	best: 0.2337014 (3999)	total: 4m 51s	remaining: 10m 55s
5000:	learn: 0.2012236	test: 0.2331414	best: 0.2331377 (4994)	total: 6m 4s	remaining: 9m 43s
6000:	learn: 0.1958673	test: 0.2327187	best: 0.2327161 (5997)	total: 7m 18s	remaining: 8m 31s
7000:	learn: 0.1908673	test: 0.2324538	best: 0.2324505 (6979)	total: 8m 31s	remaining: 7m 17s
8000:	learn: 0.1860441	test: 0.2322340	best: 0.2322295 (7986)	total: 9m 52s	remaining: 6m 10s
9000:	learn: 0.1813579	test: 0.2320645	best: 0.2320569 (8949)	total: 11m 12s	remaining: 4m 58s
10000:	learn: 0.1769201	test: 0.2319746	best: 0.2319488 (9969)

<catboost.core.CatBoostRegressor at 0x1c8b6e510c0>

In [264]:
catboost_model.save_model('D:/mashob/data/finally/models/catboost_rubert_title.cbm')

In [265]:
catboost_rubert_title = CatBoostRegressor()

catboost_rubert_title.load_model('D:/mashob/data/finally/models/catboost_rubert_title.cbm')

<catboost.core.CatBoostRegressor at 0x1c8b6e501c0>

In [266]:
rmse_catboost_rubert_title, r2_catboost_rubert_title = calc_metrics(catboost_rubert_title, X_test, y_test)

RMSE: 0.2292746
R^2: 0.2535439


9. Use model for tuned rubert

In [290]:
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data('no_title_tuned_texts')

        id                                          embedding
0  2936217  [0.3411005, -0.16877297, -0.3599054, 0.0115052...
1  6991412  [0.3696494, 0.06409113, -0.62138826, -0.890618...
2  6991359  [0.3278318, 0.08586374, -0.7452521, -0.2529353...


In [270]:
catboost_model = CatBoostRegressor(**config_for_rubert_tun_data)
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=2000)

0:	learn: 0.2662915	test: 0.2705101	best: 0.2705101 (0)	total: 176ms	remaining: 1h 27m 55s
2000:	learn: 0.2200993	test: 0.2361675	best: 0.2361674 (1999)	total: 2m 14s	remaining: 31m 15s
4000:	learn: 0.2073882	test: 0.2344464	best: 0.2344433 (3998)	total: 4m 35s	remaining: 29m 46s
6000:	learn: 0.1963580	test: 0.2337940	best: 0.2337940 (6000)	total: 7m 1s	remaining: 28m 4s
8000:	learn: 0.1865756	test: 0.2336528	best: 0.2336234 (7840)	total: 9m 16s	remaining: 25m 31s
10000:	learn: 0.1776425	test: 0.2334987	best: 0.2334979 (9998)	total: 11m 28s	remaining: 22m 55s
12000:	learn: 0.1694866	test: 0.2333518	best: 0.2333282 (11791)	total: 13m 38s	remaining: 20m 27s
14000:	learn: 0.1618630	test: 0.2333874	best: 0.2333220 (13664)	total: 15m 58s	remaining: 18m 15s
16000:	learn: 0.1546864	test: 0.2333165	best: 0.2332670 (15226)	total: 18m 5s	remaining: 15m 49s
18000:	learn: 0.1480618	test: 0.2333018	best: 0.2332487 (17262)	total: 20m 10s	remaining: 13m 26s
20000:	learn: 0.1418139	test: 0.2333714	bes

<catboost.core.CatBoostRegressor at 0x1c8ead049d0>

In [271]:
catboost_model.save_model('D:/mashob/data/finally/models/catboost_rubert_tuned.cbm')

In [291]:
catboost_rubert_tuned = CatBoostRegressor()

catboost_rubert_tuned.load_model('D:/mashob/data/finally/models/catboost_rubert_tuned.cbm')

<catboost.core.CatBoostRegressor at 0x1c8b6e51a80>

In [273]:
rmse_catboost_rubert_tuned, r2_catboost_rubert_tuned = calc_metrics(catboost_rubert_tuned, X_test, y_test)

RMSE: 0.2299721
R^2: 0.2489953


## $R^2$

In [287]:
index = ['text', 'tfidf', 'w2v + tfidf', 'rubert', 'rubert with title', 'tuned rubert', 'tuned rubert with title']
data_table = {'RMSE': [rmse_catboost_text, rmse_catboost_tfidf, rmse_catboost_w2v_tfidf, rmse_catboost_rubert, rmse_catboost_rubert_title, rmse_catboost_rubert_tuned, rmse_catboost_rubert_tuned_title],
              'R^2': [r2_catboost_text, r2_catboost_tfidf, r2_catboost_w2v_tfidf, r2_catboost_rubert, r2_catboost_rubert_title, r2_catboost_rubert_tuned, r2_catboost_rubert_tuned_title]}

df_metrics = pd.DataFrame(data_table, index=index)
print(df_metrics)

                             RMSE       R^2
text                     0.235302  0.213778
tfidf                    0.237642  0.198066
w2v + tfidf              0.229811  0.250045
rubert                   0.229684  0.250878
rubert with title        0.229275  0.253544
tuned rubert             0.229972  0.248995
tuned rubert with title  0.229341  0.253108
