### In this notebook, I will try to predict the telegrams' citation class labels based purely on NER model's extracted entities representations using classical ML algorithm (LightGBM)

The problem is unbalanced, will use oversampling and class weights

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
RS = 42

In [4]:
def get_report(y_true, y_pred):
    print("Test accuracy: ", balanced_accuracy_score(y_true, y_pred))
    print("Report: ")
    print(classification_report(y_true, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

In [5]:
def get_class_weights(target_labels, n_classes: int):
    total = len(target_labels)
    class_weights = {}
    for i in range(n_classes):
        class_i_cnt = len(np.where(target_labels==i)[0])
        class_weights[i] = (1 / class_i_cnt) * (total / n_classes)
    return class_weights

### Will compare the default params performance using 3 different sets of embeddinngs: 256-d, 128-d and CountVectors. The best performing by default option will be further fine-tuned. I will use `balanced` class weight instead of oversampling

In [8]:
df_128 = pd.read_csv('ent_vectors/ent_floret_128.csv', index_col=0)
df_256 = pd.read_csv('ent_vectors/ent_floret_256.csv', index_col=0)
df_cv = pd.read_csv('ent_vectors/end_cnt_vec.csv', index_col=0)
df_labels = pd.read_csv("../data/labels.csv", index_col=0)[['citation_class']]

#### floret 256 (the largest) goes first

In [37]:
df = df_labels.join(df_256)
df

Unnamed: 0_level_0,citation_class,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3_atel,0,1.980900,1.588900,0.344420,-0.675585,1.559450,-0.150880,0.616985,-1.793825,0.097455,...,2.005867,0.182510,2.103533,0.254155,-1.506143,-1.871467,0.290797,1.519500,-1.050553,-0.427197
2_atel,0,1.537100,1.558500,0.317990,0.580130,0.603100,0.244880,0.067371,-2.996200,-0.924190,...,2.557800,2.394600,1.911600,-2.379400,-1.092000,-1.462900,-1.492500,0.614020,-2.428900,-0.969490
4_atel,0,0.729690,0.246960,1.272286,-0.716588,1.791311,-0.711224,0.041107,-1.721376,0.197792,...,2.557800,2.394600,1.911600,-2.379400,-1.092000,-1.462900,-1.492500,0.614020,-2.428900,-0.969490
5_atel,0,-0.817397,0.153137,1.466150,0.205220,0.974807,-0.456092,-0.946557,-0.925483,0.317733,...,2.032410,1.774302,1.990075,-1.839227,-1.205250,-1.450075,-1.109337,0.754565,-2.122475,-0.889640
6_atel,0,1.134500,-1.603600,3.400700,1.729500,-1.968500,1.185200,-0.195770,-0.591150,0.033826,...,0.336810,1.823700,2.801400,-0.925720,-0.206510,-0.662900,-0.840800,1.044200,-1.794900,-0.734740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16033_atel,0,1.990000,0.172300,0.524815,-1.463795,2.136600,0.322680,-0.051050,-1.353675,-0.133300,...,2.342150,-1.071830,2.155625,0.627475,-1.984225,-1.590175,0.371883,0.776057,-1.462660,-0.567565
16034_atel,0,-0.287735,-0.276883,2.110924,2.424082,1.216592,-0.980903,0.410782,-2.174533,0.040912,...,1.044363,-1.104580,0.773450,0.167577,-1.264923,1.243037,0.628563,0.606427,-0.489600,-1.102963
16035_atel,0,1.990000,0.172300,0.524815,-1.463795,2.136600,0.322680,-0.051050,-1.353675,-0.133300,...,1.065000,-0.929450,0.788360,0.346870,-2.324300,-0.332140,2.337200,-0.525490,-1.845100,-0.456090
16036_atel,0,-0.798810,1.607800,1.103400,0.316810,2.407400,-1.261300,-0.771110,-0.524330,0.584350,...,2.005867,0.182510,2.103533,0.254155,-1.506143,-1.871467,0.290797,1.519500,-1.050553,-0.427197


In [38]:
df = df.iloc[:-100]

In [39]:
X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)

In [40]:
X.shape, y.shape

((48179, 512), (48179, 1))

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [42]:
gbm = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)
gbm.fit(X_train,y_train)

In [43]:
y_pred = gbm.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, gbm.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.7075949634145622
Test accuracy:  0.5635527730678377
Report: 
              precision    recall  f1-score   support

           0       0.86      0.74      0.80      5500
           1       0.36      0.32      0.34      1353
           2       0.18      0.63      0.28       374

    accuracy                           0.66      7227
   macro avg       0.47      0.56      0.47      7227
weighted avg       0.73      0.66      0.68      7227

Confusion matrix:
[[4076  679  745]
 [ 586  431  336]
 [  58   80  236]]


#### floret 128

In [44]:
df = df_labels.join(df_128)

df = df.iloc[:-100]

X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [45]:
gbm = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)
gbm.fit(X_train,y_train)

In [46]:
y_pred = gbm.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, gbm.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.7038162385919375
Test accuracy:  0.5602590322159906
Report: 
              precision    recall  f1-score   support

           0       0.86      0.74      0.79      5500
           1       0.36      0.32      0.34      1353
           2       0.17      0.63      0.27       374

    accuracy                           0.65      7227
   macro avg       0.47      0.56      0.47      7227
weighted avg       0.73      0.65      0.68      7227

Confusion matrix:
[[4047  675  778]
 [ 583  432  338]
 [  55   85  234]]


#### CV

In [99]:
df = df_labels.join(df_cv)

df = df.iloc[:-100]

X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [100]:
df_cv.shape

(48279, 2112)

In [48]:
gbm = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)
gbm.fit(X_train,y_train)

In [49]:
y_pred = gbm.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, gbm.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.6228089871055021
Test accuracy:  0.5767546628407461
Report: 
              precision    recall  f1-score   support

           0       0.87      0.71      0.78      5500
           1       0.38      0.27      0.32      1353
           2       0.16      0.75      0.26       374

    accuracy                           0.63      7227
   macro avg       0.47      0.58      0.45      7227
weighted avg       0.74      0.63      0.67      7227

Confusion matrix:
[[3911  556 1033]
 [ 552  366  435]
 [  46   48  280]]


#### the diff between 128 and 256 is insignificant in terms of accuracy, yet 128 is much faster to train on. I will try to tune and compare both 128D and CV vectors

In [74]:
# 128 first

df = df_labels.join(df_128)
df = df.iloc[:-100]
X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [75]:
NFOLDS = 4
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RS)

In [76]:
scorer = make_scorer(balanced_accuracy_score, greater_is_better=True)  

In [81]:
model = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)

In [82]:
parameters = {
    'n_estimators': [100, 250, 500, 1000],
    'max_depth': [2, 4, 6, -1],
    'reg_alpha': [0.1, 1],
    'learning_rate': [0.001, 0.01, 0.1]
}

In [83]:
RSCV = RandomizedSearchCV(model, parameters, scoring=scorer, cv=kf, n_iter=333, verbose=1)

In [84]:
RSCV.fit(X_train, y_train)

Fitting 4 folds for each of 96 candidates, totalling 384 fits


In [85]:
best_estimator = RSCV.best_estimator_

y_pred = best_estimator.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, best_estimator.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.6044032337801304
Test accuracy:  0.565757720678811
Report: 
              precision    recall  f1-score   support

           0       0.86      0.71      0.78      5500
           1       0.36      0.25      0.30      1353
           2       0.16      0.74      0.26       374

    accuracy                           0.62      7227
   macro avg       0.46      0.57      0.44      7227
weighted avg       0.73      0.62      0.66      7227

Confusion matrix:
[[3890  554 1056]
 [ 570  341  442]
 [  50   48  276]]


In [86]:
RSCV.best_params_

{'reg_alpha': 1, 'n_estimators': 250, 'max_depth': 2, 'learning_rate': 0.1}

In [87]:
# CV next

df = df_labels.join(df_cv)
df = df.iloc[:-100]
X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

In [88]:
model = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)

In [89]:
RSCV = RandomizedSearchCV(model, parameters, scoring=scorer, cv=kf, n_iter=333, verbose=3)

In [90]:
RSCV.fit(X_train, y_train)

Fitting 4 folds for each of 96 candidates, totalling 384 fits
[CV 1/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=0.1;, score=0.519 total time=   3.4s
[CV 2/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=0.1;, score=0.518 total time=   2.0s
[CV 3/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=0.1;, score=0.507 total time=   2.0s
[CV 4/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=0.1;, score=0.517 total time=   2.4s
[CV 1/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=1;, score=0.519 total time=   2.4s
[CV 2/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=1;, score=0.518 total time=   2.4s
[CV 3/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=1;, score=0.507 total time=   2.1s
[CV 4/4] END learning_rate=0.001, max_depth=2, n_estimators=100, reg_alpha=1;, score=0.517 total time=   2.1s
[CV 1/4] END learning_rate=0.001, max_depth=2, n_e

In [91]:
best_estimator = RSCV.best_estimator_

y_pred = best_estimator.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, best_estimator.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.6033436606949757
Test accuracy:  0.5746861440806922
Report: 
              precision    recall  f1-score   support

           0       0.87      0.70      0.78      5500
           1       0.38      0.25      0.31      1353
           2       0.15      0.76      0.26       374

    accuracy                           0.62      7227
   macro avg       0.47      0.57      0.45      7227
weighted avg       0.74      0.62      0.66      7227

Confusion matrix:
[[3874  511 1115]
 [ 552  345  456]
 [  45   43  286]]


In [92]:
RSCV.best_params_

{'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': -1, 'learning_rate': 0.01}

### to binary and train (train with CV, but the floret 128 will be used for the final training due to much smaller size)

In [93]:
X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS)

y_train[y_train>1] = 1
y_test[y_test>1] = 1

In [95]:
NFOLDS = 3
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RS)
scorer = make_scorer(balanced_accuracy_score, greater_is_better=True)  
model = lgb.LGBMClassifier(class_weight='balanced', random_state=RS)
parameters = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'max_depth': [2, 4, 6, -1],
    'reg_alpha': [0.1, 1],
    'learning_rate': [0.01, 0.1]
}
RSCV = RandomizedSearchCV(model, parameters, scoring=scorer, cv=kf, n_iter=333, verbose=1)

In [96]:
RSCV.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


In [97]:
best_estimator = RSCV.best_estimator_

y_pred = best_estimator.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, best_estimator.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.7230756677164287
Test accuracy:  0.6846844238563984
Report: 
              precision    recall  f1-score   support

           0       0.86      0.79      0.82      5500
           1       0.46      0.58      0.52      1727

    accuracy                           0.74      7227
   macro avg       0.66      0.68      0.67      7227
weighted avg       0.76      0.74      0.75      7227

Confusion matrix:
[[4350 1150]
 [ 728  999]]


In [98]:
RSCV.best_params_

{'reg_alpha': 1, 'n_estimators': 250, 'max_depth': -1, 'learning_rate': 0.1}