### In this notebook, I try to predict the telegrams' citation class labels based purely on their topic representations, using a classical ML algorithm (LightGBM)

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Shared random seed: passed as `random_state` to the train/test splits,
# the stratified CV folds and the LightGBM models in the cells below.
RS = 42

In [4]:
def get_report(y_true, y_pred):
    """Print an evaluation summary for a set of predictions.

    Prints, in order: the balanced accuracy (labelled "Test accuracy"),
    the per-class classification report and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # The headline number is *balanced* accuracy, not the plain accuracy
    # that appears inside the classification report.
    print("Test accuracy: ", balanced_accuracy_score(y_true, y_pred))

    # Each remaining section is a heading followed by the metric's printed form.
    sections = (
        ("Report: ", classification_report),
        ("Confusion matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_true, y_pred))

### I will compare default-parameter performance on the 128-d vectors and the count vectors (CV). The option that performs best with default parameters will then be fine-tuned. I will use the `balanced` class weight instead of oversampling

In [5]:
# Labels: keep only the `citation_class` column (as a one-column DataFrame).
df_labels = pd.read_csv("../data/labels.csv", index_col=0).loc[:, ["citation_class"]]

# Two alternative topic representations; the first CSV column is used as the index in both.
df_cv = pd.read_csv("topics_vectors/topics_cnt_vec.csv", index_col=0)
df_128 = pd.read_csv("topics_vectors/topics_floret_128.csv", index_col=0)

In [22]:
# 128 first

# Attach the 128-d floret topic vectors to the labels (index-aligned join).
df = df_labels.join(df_128)
# Drop the last 100 rows.
# NOTE(review): the reason for excluding them is not visible here (presumably held out) — confirm.
df = df.iloc[:-100]
# Features are every column after the label column. The target stays 1-D: a
# column vector (reshape(-1, 1)) is only accepted by scikit-learn / LightGBM
# through an implicit conversion that raises a DataConversionWarning.
X, y = df.iloc[:, 1:].values, df.citation_class.values
# Stratified 85/15 split so every citation class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [23]:
# Baseline LightGBM with default hyper-parameters; `balanced` class weights
# are used to compensate for class imbalance instead of oversampling.
gbm = lgb.LGBMClassifier(
    random_state=RS,
    class_weight="balanced",
)
gbm.fit(X=X_train, y=y_train)

In [24]:
# Training-split score, printed first so it can be compared with the held-out score.
y_pred_train = gbm.predict(X_train)
print(f"Train accuracy: {balanced_accuracy_score(y_train, y_pred_train)}")

# Held-out predictions and the full report.
y_pred = gbm.predict(X_test)
get_report(y_test, y_pred)

Train accuracy: 0.662357100785251
Test accuracy:  0.5720383171746156
Report: 
              precision    recall  f1-score   support

           0       0.85      0.72      0.78      5500
           1       0.27      0.29      0.28      1353
           2       0.23      0.70      0.35       374

    accuracy                           0.64      7227
   macro avg       0.45      0.57      0.47      7227
weighted avg       0.71      0.64      0.67      7227

Confusion matrix:
[[3980  988  532]
 [ 624  395  334]
 [  58   54  262]]


In [9]:
# CV

# Attach the count-vector topic features to the labels (index-aligned join).
df = df_labels.join(df_cv)
# Drop the last 100 rows.
# NOTE(review): the reason for excluding them is not visible here (presumably held out) — confirm.
df = df.iloc[:-100]
# Features are every column after the label column. The target stays 1-D: a
# column vector (reshape(-1, 1)) is only accepted by scikit-learn / LightGBM
# through an implicit conversion that raises a DataConversionWarning.
X, y = df.iloc[:, 1:].values, df.citation_class.values
# Same stratified 85/15 split and seed as for the floret vectors.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [10]:
# Same default-parameter LightGBM baseline, now fitted on the count-vector features.
gbm = lgb.LGBMClassifier(
    random_state=RS,
    class_weight="balanced",
)
gbm.fit(X=X_train, y=y_train)

In [11]:
# Training-split score, printed first so it can be compared with the held-out score.
y_pred_train = gbm.predict(X_train)
print(f"Train accuracy: {balanced_accuracy_score(y_train, y_pred_train)}")

# Held-out predictions and the full report.
y_pred = gbm.predict(X_test)
get_report(y_test, y_pred)

Train accuracy: 0.5876217714100143
Test accuracy:  0.5566802023100444
Report: 
              precision    recall  f1-score   support

           0       0.85      0.69      0.76      5500
           1       0.28      0.29      0.28      1353
           2       0.19      0.69      0.30       374

    accuracy                           0.62      7227
   macro avg       0.44      0.56      0.45      7227
weighted avg       0.71      0.62      0.65      7227

Confusion matrix:
[[3822  957  721]
 [ 622  386  345]
 [  64   52  258]]


### Floret embeddings perform better than the count vectors (CV)

In [34]:
# Rebuild the floret-based dataset (the previous cells left `df` holding the count vectors).
df = df_labels.join(df_128)
# Drop the last 100 rows.
# NOTE(review): the reason for excluding them is not visible here (presumably held out) — confirm.
df = df.iloc[:-100]
# Features are every column after the label column. The target stays 1-D: a
# column vector (reshape(-1, 1)) is only accepted by scikit-learn / LightGBM
# through an implicit conversion that raises a DataConversionWarning.
X, y = df.iloc[:, 1:].values, df.citation_class.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [37]:
# Stratified, shuffled and seeded folds so every candidate is scored on the same splits.
NFOLDS = 3
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RS)
# Candidates are ranked by balanced accuracy (higher is better).
scorer = make_scorer(balanced_accuracy_score, greater_is_better=True)
model = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)
parameters = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'max_depth': [2, 4, 6, -1],
    'reg_alpha': [0.1, 1],
    'learning_rate': [0.01, 0.1]
}
# The grid holds only 5 * 4 * 2 * 2 = 80 combinations, so a randomized search
# asking for 333 iterations silently degenerates into an exhaustive search
# (with an unseeded candidate order). Run the exhaustive search explicitly.
# The variable keeps the name `RSCV` because the following cells refer to it.
RSCV = GridSearchCV(model, parameters, scoring=scorer, cv=kf, verbose=1)

In [38]:
# Run the cross-validated hyper-parameter search on the training split only.
RSCV.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


In [39]:
# Estimator that the search refit with its best-scoring parameter combination.
best_estimator = RSCV.best_estimator_

# Training-split score, printed first so it can be compared with the held-out score.
y_pred_train = best_estimator.predict(X_train)
print(f"Train accuracy: {balanced_accuracy_score(y_train, y_pred_train)}")

# Held-out predictions and the full report.
y_pred = best_estimator.predict(X_test)
get_report(y_test, y_pred)

Train accuracy: 0.6000464094361511
Test accuracy:  0.5842913496514645
Report: 
              precision    recall  f1-score   support

           0       0.86      0.72      0.78      5500
           1       0.27      0.25      0.26      1353
           2       0.22      0.78      0.34       374

    accuracy                           0.63      7227
   macro avg       0.45      0.58      0.46      7227
weighted avg       0.71      0.63      0.66      7227

Confusion matrix:
[[3958  879  663]
 [ 609  338  406]
 [  50   31  293]]


In [40]:
# Parameter combination with the best cross-validated balanced accuracy in the search above.
RSCV.best_params_

{'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.01}

In [43]:
# Collapse classes 1 and 2 into a single positive class to evaluate the 3-class model as a binary classifier

In [44]:
y_pred = best_estimator.predict(X_test)
y_pred_train = best_estimator.predict(X_train)

# Map every label above 1 to 1, so classes 1 and 2 become one positive class.
# The binary versions go into new arrays: overwriting y_train / y_test in place
# would destroy the 3-class labels and make the earlier multi-class evaluation
# cells report wrong numbers if they were re-run afterwards.
y_pred_bin = np.where(y_pred > 1, 1, y_pred)
y_pred_train_bin = np.where(y_pred_train > 1, 1, y_pred_train)
y_train_bin = np.where(y_train > 1, 1, y_train)
y_test_bin = np.where(y_test > 1, 1, y_test)

print(f"Train accuracy: {balanced_accuracy_score(y_train_bin, y_pred_train_bin)}")
get_report(y_test_bin, y_pred_bin)

Train accuracy: 0.6820398588256951
Test accuracy:  0.6690248986682108
Report: 
              precision    recall  f1-score   support

           0       0.86      0.72      0.78      5500
           1       0.41      0.62      0.49      1727

    accuracy                           0.70      7227
   macro avg       0.63      0.67      0.64      7227
weighted avg       0.75      0.70      0.71      7227

Confusion matrix:
[[3958 1542]
 [ 659 1068]]


### What if we trained a binary model from the start?

In [45]:
# Reuses `df` (labels joined with the floret vectors) built in an earlier cell.
# The target stays 1-D: a column vector (reshape(-1, 1)) is only accepted by
# scikit-learn / LightGBM through an implicit conversion that raises a
# DataConversionWarning.
X, y = df.iloc[:, 1:].values, df.citation_class.values
# Stratify on the original three classes with the same seed and test size as the multi-class split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

# Binarise the targets: every label above 1 becomes 1 (classes 1 and 2 merged).
y_train = np.where(y_train > 1, 1, y_train)
y_test = np.where(y_test > 1, 1, y_test)

In [46]:
# Stratified, shuffled and seeded folds so every candidate is scored on the same splits.
NFOLDS = 3
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RS)
# Candidates are ranked by balanced accuracy (higher is better).
scorer = make_scorer(balanced_accuracy_score, greater_is_better=True)
model = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)
parameters = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'max_depth': [2, 4, 6, -1],
    'reg_alpha': [0.1, 1],
    'learning_rate': [0.01, 0.1]
}
# The grid holds only 5 * 4 * 2 * 2 = 80 combinations, so a randomized search
# asking for 333 iterations silently degenerates into an exhaustive search
# (with an unseeded candidate order). Run the exhaustive search explicitly.
# The variable keeps the name `RSCV` because the following cells refer to it.
RSCV = GridSearchCV(model, parameters, scoring=scorer, cv=kf, verbose=1)

In [47]:
# Run the cross-validated hyper-parameter search on the binarised training split.
RSCV.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


In [48]:
# Estimator that the search refit with its best-scoring parameter combination.
best_estimator = RSCV.best_estimator_

# Training-split score, printed first so it can be compared with the held-out score.
y_pred_train = best_estimator.predict(X_train)
print(f"Train accuracy: {balanced_accuracy_score(y_train, y_pred_train)}")

# Held-out predictions and the full report.
y_pred = best_estimator.predict(X_test)
get_report(y_test, y_pred)

Train accuracy: 0.6896993513391907
Test accuracy:  0.67204342790967
Report: 
              precision    recall  f1-score   support

           0       0.85      0.79      0.82      5500
           1       0.46      0.55      0.50      1727

    accuracy                           0.74      7227
   macro avg       0.65      0.67      0.66      7227
weighted avg       0.76      0.74      0.74      7227

Confusion matrix:
[[4367 1133]
 [ 777  950]]


In [49]:
# Parameter combination with the best cross-validated balanced accuracy for the binary model.
RSCV.best_params_

{'reg_alpha': 0.1, 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01}

In [None]:
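# Training a binary model from the start is only slightly better than collapsing the 3-class predictions (test balanced accuracy 0.672 vs 0.669)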