In [2]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [3]:
# Directory holding the train/val/test parquet splits. The location is
# machine-specific, so it is kept in one place: edit this single constant to
# run the notebook somewhere else.
DATA_DIR = 'D:/mashob/data'

# Each split is read as an Arrow table and converted to a pandas DataFrame.
train_data = pq.read_table(source=f'{DATA_DIR}/train.parquet').to_pandas()
val_data = pq.read_table(source=f'{DATA_DIR}/val.parquet').to_pandas()
test_data = pq.read_table(source=f'{DATA_DIR}/test.parquet').to_pandas()

In [4]:
# Peek at the first two training rows to check the loaded columns.
train_data.head(n=2)

Unnamed: 0,text_markdown,embedding,wilson_score
0,"Знаете, когда тебе в лицо суют что-то с криком...","[0.56629515, 0.03759489, -0.25796056, -0.03476...",0.437376
1,Как на самом деле стало в Крыму с приходом Рос...,"[1.014961, -0.7582568, -0.18253209, 0.3129007,...",0.309303


In [5]:
def create_matrix_embedding(df: pd.DataFrame) -> np.ndarray:
    """Stack the per-row vectors of ``df.embedding`` into one 2-D float matrix.

    Rows are read positionally, so the frame's index labels do not matter:
    a filtered, shuffled or otherwise re-indexed frame works just as well as
    one carrying the default ``0..n-1`` index.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``embedding`` column whose cells are equally sized
        array-likes (NumPy arrays or plain sequences of numbers).

    Returns
    -------
    np.ndarray
        ``float64`` array of shape ``(len(df), embedding_size)``; an empty
        ``(0, 0)`` array when ``df`` has no rows.

    Raises
    ------
    ValueError
        If the per-row vectors do not all have the same length.
    """
    vectors = df["embedding"].to_numpy()
    if len(vectors) == 0:
        # Nothing to stack; np.vstack would raise on an empty sequence.
        return np.empty((0, 0))
    # Convert every cell to a (1, embedding_size) float64 row so that vstack
    # yields the same dtype regardless of how the vectors were stored.
    rows = [np.asarray(vec, dtype=np.float64).reshape(1, -1) for vec in vectors]
    return np.vstack(rows)

In [6]:
# Turn the embedding column of every split into a 2-D feature matrix.
X_train, X_val, X_test = map(
    create_matrix_embedding,
    (train_data, val_data, test_data),
)

In [7]:
# Regression target for every split: the `wilson_score` column as an array.
y_train = train_data["wilson_score"].values
y_val = val_data["wilson_score"].values
y_test = test_data["wilson_score"].values

## CatBoost

In [8]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error as MSE, r2_score 

In [9]:
# CatBoost regressor with a fixed seed so the run can be repeated.
catboost_model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.07,
    depth=6,
    random_state=42,
)

# Fit on the training split; the validation split is supplied as `eval_set`
# (shown as "test" in the training log) and progress is printed every 100
# iterations.
catboost_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    verbose=100,
)

0:	learn: 0.2473339	test: 0.2443510	best: 0.2443510 (0)	total: 246ms	remaining: 6m 8s
100:	learn: 0.2270892	test: 0.2297114	best: 0.2297114 (100)	total: 8.56s	remaining: 1m 58s
200:	learn: 0.2223822	test: 0.2285599	best: 0.2285551 (196)	total: 19.6s	remaining: 2m 6s
300:	learn: 0.2175624	test: 0.2280446	best: 0.2280446 (300)	total: 32.8s	remaining: 2m 10s
400:	learn: 0.2132408	test: 0.2276521	best: 0.2276521 (400)	total: 41.9s	remaining: 1m 54s
500:	learn: 0.2092716	test: 0.2273501	best: 0.2273161 (474)	total: 50.4s	remaining: 1m 40s
600:	learn: 0.2055920	test: 0.2272526	best: 0.2272167 (551)	total: 58.9s	remaining: 1m 28s
700:	learn: 0.2020240	test: 0.2271417	best: 0.2271202 (686)	total: 1m 7s	remaining: 1m 16s
800:	learn: 0.1986631	test: 0.2269732	best: 0.2269607 (783)	total: 1m 15s	remaining: 1m 6s
900:	learn: 0.1954262	test: 0.2269077	best: 0.2268569 (866)	total: 1m 24s	remaining: 56.2s
1000:	learn: 0.1924189	test: 0.2268091	best: 0.2267732 (944)	total: 1m 32s	remaining: 46.3s
1100

<catboost.core.CatBoostRegressor at 0x1f847032380>

In [10]:
# Score the fitted CatBoost model on the test split.
y_pred = catboost_model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse = MSE(y_test, y_pred) ** 0.5
r2_cat_boost_bert = r2_score(y_test, y_pred)

print("RMSE: %.2f" % rmse)
print("R^2: ", r2_cat_boost_bert)

RMSE: 0.23
R^2:  0.165266540701002


## GradientBoosting

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

In [13]:
# Append the validation rows to the training rows (features and targets
# alike) to form the data the gradient-boosting model is fitted on.
X = np.concatenate([X_train, X_val], axis=0)
y = np.concatenate([y_train, y_val], axis=0)

In [25]:
# Keyword arguments forwarded to GradientBoostingRegressor.
params = dict(
    n_estimators=500,
    max_depth=6,
    criterion='friedman_mse',
    learning_rate=0.03,
    min_samples_leaf=32,
    min_samples_split=32,
    random_state=42,
)

In [26]:
# Build the regressor from `params` and fit it on the merged train+val data.
gbr = GradientBoostingRegressor(**params)
gbr.fit(X=X, y=y)

In [27]:
# Score the fitted gradient-boosting model on the test split.
y_pred = gbr.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse = MSE(y_test, y_pred) ** 0.5
print("RMSE: %.2f" % rmse)

# R^2 is computed from the predictions already made above, exactly as in the
# CatBoost evaluation. `gbr.score(X_test, y_test)` reports the same metric but
# would run a second prediction pass over the test set.
r2_grad_boost_bert = r2_score(y_test, y_pred)
print("R^2: ", r2_grad_boost_bert)

RMSE: 0.23
R^2:  0.16385889747928917


# R^2

In [28]:
# One-row table placing the test R^2 of both models side by side.
index = ['with bert`s embeddings']
data_table = dict(
    GradientBoosting=[r2_grad_boost_bert],
    CatBoost=[r2_cat_boost_bert],
)

df_r2 = pd.DataFrame(data=data_table, index=index)
print(df_r2)

                        GradientBoosting  CatBoost
with bert`s embeddings          0.163859  0.165267
