### In this notebook, I try to predict the telegrams' citation rate (a regression task) based purely on the representations of entities extracted by an NER model, using a classical ML algorithm (LightGBM)


In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, ParameterGrid
from sklearn.metrics import (make_scorer, mean_absolute_error, mean_squared_error, 
                             balanced_accuracy_score, classification_report, confusion_matrix) 


In [2]:
import warnings

# Silence every warning category for the rest of the notebook.
warnings.simplefilter("ignore")

In [3]:
# Random seed passed as random_state to the train/test splits, the KFold
# splitter and the LightGBM baseline models below.
RS = 42

In [4]:
def get_report(y_true, y_pred, label="Test"):
    """Print the MSE and MAE of ``y_pred`` measured against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    label : str, default "Test"
        Prefix printed before each metric name (e.g. ``"Train"``), so the
        same helper can report on any data partition. The default keeps
        the original output unchanged.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print(f"{label} MSE: ", mean_squared_error(y_true, y_pred))
    print(f"{label} MAE: ", mean_absolute_error(y_true, y_pred))

### I will train a default model on each of the available encodings (Floret 128- and 256-dim vectors, count vectors). The best encoding will then be used for hyperparameter tuning

In [10]:
# Entity encodings: two Floret variants and the count vectors, each indexed
# by the first CSV column.
df_128, df_256, df_cv = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'ent_vectors/ent_floret_128.csv',
        'ent_vectors/ent_floret_256.csv',
        'ent_vectors/end_cnt_vec.csv',
    )
)

# Regression target: only the n_cited column of the labels file is kept.
labels_raw = pd.read_csv("../data/labels.csv", index_col=0)
df_labels = labels_raw[['n_cited']]

In [11]:
# 256
# Attach the 256-dim Floret entity vectors to the citation counts (index join).
df = df_labels.join(df_256)
# NOTE(review): the last 100 rows are dropped without explanation —
# presumably held out for a separate evaluation; confirm.
df = df.iloc[:-100]
# Column 0 is the target (n_cited); everything after it is a feature.
# The target stays 1-D: an (n, 1) column vector makes scikit-learn-style
# estimators emit a DataConversionWarning and flatten it anyway.
X, y = df.iloc[:, 1:].values, df['n_cited'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [12]:
# Baseline: LightGBM regressor with default hyperparameters and a fixed seed.
gbm = lgb.LGBMRegressor(random_state=RS)
gbm.fit(X=X_train, y=y_train)

In [13]:
# Score the baseline on both partitions to expose the train/test gap.
y_train_pred = gbm.predict(X_train)
y_pred = gbm.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print("Train MSE: ", train_mse)
print("Train MAE: ", train_mae)
get_report(y_test, y_pred)

Train MSE:  34.15252835107547
Train MAE:  2.774126678436372
Test MSE:  50.79092175514041
Test MAE:  3.211562467567163


In [23]:
# 128

# Attach the 128-dim Floret entity vectors to the citation counts (index join).
df = df_labels.join(df_128)
# NOTE(review): the last 100 rows are dropped without explanation —
# presumably held out for a separate evaluation; confirm.
df = df.iloc[:-100]
# Column 0 is the target (n_cited); everything after it is a feature.
# The target stays 1-D: an (n, 1) column vector makes scikit-learn-style
# estimators emit a DataConversionWarning and flatten it anyway.
X, y = df.iloc[:, 1:].values, df['n_cited'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [24]:
# Baseline: LightGBM regressor with default hyperparameters and a fixed seed.
gbm = lgb.LGBMRegressor(random_state=RS)
gbm.fit(X=X_train, y=y_train)

In [25]:
# Score the baseline on both partitions to expose the train/test gap.
y_train_pred = gbm.predict(X_train)
y_pred = gbm.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print("Train MSE: ", train_mse)
print("Train MAE: ", train_mae)
get_report(y_test, y_pred)

Train MSE:  34.853612502896176
Train MAE:  2.8069278769985213
Test MSE:  50.306867729260645
Test MAE:  3.215860080286636


In [20]:
# CV

# Attach the count-vector entity features to the citation counts (index join).
df = df_labels.join(df_cv)
# NOTE(review): the last 100 rows are dropped without explanation —
# presumably held out for a separate evaluation; confirm.
df = df.iloc[:-100]
# Column 0 is the target (n_cited); everything after it is a feature.
# The target stays 1-D: an (n, 1) column vector makes scikit-learn-style
# estimators emit a DataConversionWarning and flatten it anyway.
X, y = df.iloc[:, 1:].values, df['n_cited'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [21]:
# Baseline: LightGBM regressor with default hyperparameters and a fixed seed.
gbm = lgb.LGBMRegressor(random_state=RS)
gbm.fit(X=X_train, y=y_train)

In [22]:
# Score the baseline on both partitions to expose the train/test gap.
y_train_pred = gbm.predict(X_train)
y_pred = gbm.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print("Train MSE: ", train_mse)
print("Train MAE: ", train_mae)
get_report(y_test, y_pred)

Train MSE:  44.44503492625727
Train MAE:  3.1052135227659563
Test MSE:  49.943153998184854
Test MAE:  3.224575566537416


### I want to continue with 128-D Floret vectors

In [46]:
# 128

# Rebuild the 128-dim Floret dataset so the tuning section starts from the
# chosen encoding regardless of which baseline cell ran last.
df = df_labels.join(df_128)
# NOTE(review): the last 100 rows are dropped without explanation —
# presumably held out for a separate evaluation; confirm.
df = df.iloc[:-100]
# Column 0 is the target (n_cited); everything after it is a feature.
# The target stays 1-D: an (n, 1) column vector makes scikit-learn-style
# estimators emit a DataConversionWarning and flatten it anyway.
X, y = df.iloc[:, 1:].values, df['n_cited'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [47]:
# Seeded, shuffled K-fold splitter used for cross-validation in the search.
NFOLDS = 4
kf = KFold(random_state=RS, shuffle=True, n_splits=NFOLDS)

In [48]:
# MAE wrapped as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the search treats a lower MAE as the better result.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [49]:
# Base estimator for the hyperparameter search. It is seeded with RS, like the
# baseline regressors, so repeated runs of the search are reproducible.
model = lgb.LGBMRegressor(random_state=RS)

In [50]:
# Hyperparameter grid explored by the search (every value is a LightGBM
# regressor constructor argument).
parameters = dict(
    n_estimators=[100, 250, 500, 750, 1000],
    max_depth=[2, 4, 6, 60],
    reg_alpha=[0.1, 1],
    learning_rate=[0.001, 0.01, 0.1],
)

In [51]:
# The grid holds only 5 * 4 * 2 * 3 = 120 combinations, fewer than the former
# n_iter=333, so RandomizedSearchCV fell back to trying every combination
# (its warning about this is hidden by the global warnings filter).
# GridSearchCV states the exhaustive search explicitly and evaluates the
# candidates in a deterministic order. The RSCV name is kept because later
# cells refer to it.
RSCV = GridSearchCV(model, parameters, scoring=scorer, cv=kf, verbose=1)

In [52]:
# Run the cross-validated search on the training split only.
RSCV.fit(X=X_train, y=y_train)

Fitting 4 folds for each of 120 candidates, totalling 480 fits


In [53]:
# Estimator with the best cross-validated score found by the search.
best_estimator = RSCV.best_estimator_

In [54]:
# Score the tuned model on both partitions to expose the train/test gap.
y_train_pred = best_estimator.predict(X_train)
y_pred = best_estimator.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print("Train MSE: ", train_mse)
print("Train MAE: ", train_mae)
get_report(y_test, y_pred)

Train MSE:  34.6213229721939
Train MAE:  2.800484625656543
Test MSE:  50.042788186246256
Test MAE:  3.196695637212494


In [55]:
# Display the hyperparameter combination selected by the search.
RSCV.best_params_

{'reg_alpha': 1, 'n_estimators': 1000, 'max_depth': 60, 'learning_rate': 0.01}

In [56]:
# convert to class and see class metrics

# Citation-count thresholds: class 0 is x < 3, class 1 is 3 <= x < 14,
# class 2 is x >= 14.
CLASS_BINS = [3, 14]

cls_df = pd.DataFrame(y_test, columns=['true_val'])
# astype truncates the continuous predictions toward zero (unchanged behaviour).
cls_df['pred_val'] = y_pred.astype(np.int64)

# np.digitize returns the index of the bin each value falls into, which is
# exactly the class id; one vectorised call replaces the per-row lambdas.
cls_df['true_class'] = np.digitize(cls_df['true_val'], CLASS_BINS)
cls_df['pred_class'] = np.digitize(cls_df['pred_val'], CLASS_BINS)

true_cls = cls_df['true_class'].values
pred_cls = cls_df['pred_class'].values

# The value printed here is the balanced accuracy (mean per-class recall),
# not the plain accuracy shown in the classification report, so it is
# labelled accordingly.
print("Test balanced accuracy: ", balanced_accuracy_score(true_cls, pred_cls))
print("Report: ")
print(classification_report(true_cls, pred_cls))
print("Confusion matrix:")
print(confusion_matrix(true_cls, pred_cls))

Test accuracy:  0.4798538775735823
Report: 
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      5451
           1       0.34      0.42      0.37      1396
           2       0.45      0.19      0.27       380

    accuracy                           0.71      7227
   macro avg       0.54      0.48      0.49      7227
weighted avg       0.73      0.71      0.72      7227

Confusion matrix:
[[4500  910   41]
 [ 757  589   50]
 [  54  253   73]]


In [57]:
# and to binary

# Two classes: 0 for fewer than 3 citations, 1 for everything else.
# Written as "not (< 3)" so any value that fails the comparison lands in class 1.
cls_df['true_class'] = (~(cls_df['true_val'] < 3)).astype(np.int64)
cls_df['pred_class'] = (~(cls_df['pred_val'] < 3)).astype(np.int64)

In [58]:
true_cls = cls_df['true_class'].values
pred_cls = cls_df['pred_class'].values

# The value printed here is the balanced accuracy (mean per-class recall),
# not the plain accuracy shown in the classification report, so it is
# labelled accordingly.
print("Test balanced accuracy: ", balanced_accuracy_score(true_cls, pred_cls))
print("Report: ")
print(classification_report(true_cls, pred_cls))
print("Confusion matrix:")
print(confusion_matrix(true_cls, pred_cls))

Test accuracy:  0.6844462273225345
Report: 
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      5451
           1       0.50      0.54      0.52      1776

    accuracy                           0.76      7227
   macro avg       0.68      0.68      0.68      7227
weighted avg       0.76      0.76      0.76      7227

Confusion matrix:
[[4500  951]
 [ 811  965]]


In [59]:
# cap the citation counts at 20 (the upper 2.5% quantile)

In [60]:
# Re-create the split with the same seed and test fraction as before, so that
# y_train starts from the original (uncapped) targets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [61]:
# Cap the training targets at 20 in place; y_test keeps its original values.
above_cap = y_train > 20
y_train[above_cap] = 20

In [62]:
# Same grid as the first search, without the extreme max_depth=60.
parameters = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'max_depth': [2, 4, 6],
    'reg_alpha': [0.1, 1],
    'learning_rate': [0.001, 0.01, 0.1]
}

# The grid holds only 5 * 3 * 2 * 3 = 90 combinations, fewer than the former
# n_iter=333, so RandomizedSearchCV fell back to trying every combination
# (its warning about this is hidden by the global warnings filter).
# GridSearchCV states the exhaustive search explicitly and evaluates the
# candidates in a deterministic order. The RSCV name is kept because later
# cells refer to it.
RSCV = GridSearchCV(model, parameters, scoring=scorer, cv=kf, verbose=1)

In [63]:
# Run the cross-validated search on the training split with capped targets.
RSCV.fit(X=X_train, y=y_train)

Fitting 4 folds for each of 90 candidates, totalling 360 fits


In [64]:
# Evaluate the model selected by the second search.
# y_train was capped at 20 in an earlier cell while y_test was not, so the
# train and test errors below are measured against different target ranges.
best_estimator = RSCV.best_estimator_
y_train_pred = best_estimator.predict(X_train)
y_pred = best_estimator.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print("Train MSE: ", train_mse)
print("Train MAE: ", train_mae)
get_report(y_test, y_pred)

Train MSE:  11.127527135015377
Train MAE:  2.0433333591878053
Test MSE:  51.60500583652851
Test MAE:  2.9866454857445386


In [66]:
# Same evaluation, with predictions converted to integer citation counts.
# Note: astype(int) truncates toward zero; it does not round to nearest.
best_estimator = RSCV.best_estimator_
y_train_pred = best_estimator.predict(X_train).astype(int)
y_pred = best_estimator.predict(X_test).astype(int)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print("Train MSE: ", train_mse)
print("Train MAE: ", train_mae)
get_report(y_test, y_pred)

Train MSE:  11.389504786091033
Train MAE:  1.873290681773784
Test MSE:  52.47253355472534
Test MAE:  2.8209492182094924


In [67]:
# Display the hyperparameter combination selected by the second search.
RSCV.best_params_

{'reg_alpha': 1, 'n_estimators': 250, 'max_depth': 6, 'learning_rate': 0.1}

### Best results for NER-based regression predictions:

- Train MSE:  11.389504786091033
- Train MAE:  1.873290681773784
- Test MSE:  52.47253355472534
- Test MAE:  2.8209492182094924
