### In this notebook, I will try to predict the telegrams' citation rate (regression) based purely on telegram topics representations (Floret embeddings and CV) using classical ML algorithm (LightGBM)

### For the Neural Network approach see: 

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, ParameterGrid
from sklearn.metrics import (make_scorer, mean_absolute_error, mean_squared_error, 
                             balanced_accuracy_score, classification_report, confusion_matrix) 


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
RS = 42

In [4]:
def get_report(y_true, y_pred):
    print("Test MSE: ", mean_squared_error(y_true, y_pred))
    print("Test MAE: ", mean_absolute_error(y_true, y_pred))

### First compare the 128D Floret and CountVectorizer encodings perforamnce on a default model

In [7]:
df_128 = pd.read_csv('topics_vectors/topics_floret_128.csv', index_col=0)
df_cv = pd.read_csv('topics_vectors/topics_cnt_vec.csv', index_col=0)
df_labels = pd.read_csv("../data/labels.csv", index_col=0)[['n_cited']]

In [8]:
# 128 D

df = df_labels.join(df_128)
df = df.iloc[:-100]
X, y = df.iloc[:, 1:].values, df.n_cited.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [9]:
gbm = lgb.LGBMRegressor(random_state=RS)
gbm.fit(X_train,y_train)

In [10]:
y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X_train)
print("Train MSE: ", mean_squared_error(y_train, y_train_pred))
print("Train MAE: ", mean_absolute_error(y_train, y_train_pred))
get_report(y_test, y_pred)

Train MSE:  39.56370703274866
Train MAE:  2.974366656785518
Test MSE:  48.04687192410052
Test MAE:  3.206776573938018


In [11]:
# CV

df = df_labels.join(df_cv)
df = df.iloc[:-100]
X, y = df.iloc[:, 1:].values, df.n_cited.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [12]:
gbm = lgb.LGBMRegressor(random_state=RS)
gbm.fit(X_train,y_train)

In [13]:
y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X_train)
print("Train MSE: ", mean_squared_error(y_train, y_train_pred))
print("Train MAE: ", mean_absolute_error(y_train, y_train_pred))
get_report(y_test, y_pred)

Train MSE:  47.79941778604331
Train MAE:  3.2819050256816955
Test MSE:  51.035016775301976
Test MAE:  3.3700080867083617


### Floret vectors performs much better, will continue with them

In [42]:
# 128

df = df_labels.join(df_128)
df = df.iloc[:-100]
X, y = df.iloc[:, 1:].values, df.n_cited.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [15]:
NFOLDS = 4
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=RS)

In [16]:
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [18]:
model = lgb.LGBMRegressor()

In [19]:
parameters = {
    'n_estimators': [100, 250, 500, 1000],
    'max_depth': [2, 4, 6],
    'reg_alpha': [0.1, 1],
    'learning_rate': [0.001, 0.01, 0.1]
}

In [20]:
RSCV = RandomizedSearchCV(model, parameters, scoring=scorer, cv=kf, n_iter=333, verbose=1)

In [21]:
RSCV.fit(X_train, y_train)

Fitting 4 folds for each of 72 candidates, totalling 288 fits


In [22]:
best_estimator = RSCV.best_estimator_

In [23]:
y_pred = best_estimator.predict(X_test)  
y_train_pred = best_estimator.predict(X_train)
print("Train MSE: ", mean_squared_error(y_train, y_train_pred))
print("Train MAE: ", mean_absolute_error(y_train, y_train_pred))
get_report(y_test, y_pred)

Train MSE:  40.933300438174825
Train MAE:  3.01916993975341
Test MSE:  47.56850561325053
Test MAE:  3.2119297810468943


In [24]:
RSCV.best_params_

{'reg_alpha': 0.1, 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01}

In [25]:
lgbm = lgb.LGBMRegressor(**{'reg_alpha': 0.1, 'n_estimators': 3000, 'max_depth': 6, 'learning_rate': 0.01})

In [26]:
lgbm.fit(X_train,y_train)

In [27]:
y_pred = lgbm.predict(X_test)  
y_train_pred = lgbm.predict(X_train)
print("Train MSE: ", mean_squared_error(y_train, y_train_pred))
print("Train MAE: ", mean_absolute_error(y_train, y_train_pred))
get_report(y_test, y_pred)

Train MSE:  38.0032484085118
Train MAE:  2.8790254738711574
Test MSE:  48.9595446571688
Test MAE:  3.220120656669346


In [35]:
# limit the upper citations with 20 (upper 2.5 quantile)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RS)

In [44]:
y_train[y_train>20] = 20

In [45]:
RSCV = RandomizedSearchCV(model, parameters, scoring=scorer, cv=kf, n_iter=333, verbose=1)

In [46]:
RSCV.fit(X_train, y_train)

Fitting 4 folds for each of 72 candidates, totalling 288 fits


In [47]:
best_estimator = RSCV.best_estimator_
y_pred = best_estimator.predict(X_test)  
y_train_pred = best_estimator.predict(X_train)
print("Train MSE: ", mean_squared_error(y_train, y_train_pred))
print("Train MAE: ", mean_absolute_error(y_train, y_train_pred))
get_report(y_test, y_pred)

Train MSE:  13.960327971272303
Train MAE:  2.35528190192207
Test MSE:  50.299347371437094
Test MAE:  3.048260993188085


In [49]:
best_estimator = RSCV.best_estimator_
y_pred = best_estimator.predict(X_test).astype(int)  
y_train_pred = best_estimator.predict(X_train).astype(int) 
print("Train MSE: ", mean_squared_error(y_train, y_train_pred))
print("Train MAE: ", mean_absolute_error(y_train, y_train_pred))
get_report(y_test, y_pred)

Train MSE:  14.221796249267435
Train MAE:  2.19327505372143
Test MSE:  51.17005673170057
Test MAE:  2.887505188875052


In [48]:
RSCV.best_params_

{'reg_alpha': 0.1, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}

### Best results for topic-based regression predictions (limiting the upper ):

- Train MSE:  14.221796249267435
- Train MAE:  2.19327505372143
- Test MSE:  51.17005673170057
- Test MAE:  2.887505188875052
