In [32]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, ParameterGrid
from sklearn.metrics import (make_scorer, mean_absolute_error, mean_squared_error, 
                             balanced_accuracy_score, classification_report, confusion_matrix) 


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def get_report(y_true, y_pred):
    print("Test accuracy: ", balanced_accuracy_score(y_true, y_pred))
    print("Report: ")
    print(classification_report(y_true, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

In [4]:
RS = 42  # random state

ros = RandomOverSampler(random_state=RS)

In [5]:
train_data = pd.read_csv("train_data.csv", index_col=0)
labels = pd.read_csv('../data/labels.csv', index_col=0)[['citation_class']]

In [6]:
df = labels.join(train_data)

In [7]:
df.head()

Unnamed: 0_level_0,citation_class,refs_count,telegram_len,month,month_citation_rate_to_global_ratio,year_citation_rate_to_global_ratio,author_all_time_citation_rate_to_global_ratio,author_year_citation_rate_to_global_ratio,author_month_citation_rate_to_global_ratio,author_activity_frac_year,...,ner_246,ner_247,ner_248,ner_249,ner_250,ner_251,ner_252,ner_253,ner_254,ner_255
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3_atel,0,0,1010,12,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.665767,2.6017,-1.637177,-1.03683,0.364245,1.445063,0.833967,2.841267,-2.536283,0.211777
2_atel,0,0,907,12,0.0,0.0,0.0,0.0,0.0,1.0,...,-4.171,0.96369,-0.1353,-1.6264,1.1316,0.92214,-0.83215,0.95361,-2.3395,4.4517
4_atel,0,0,3607,1,0.0,0.0,0.0,0.0,0.0,1.0,...,-4.171,0.96369,-0.1353,-1.6264,1.1316,0.92214,-0.83215,0.95361,-2.3395,4.4517
5_atel,0,0,1509,1,0.0,0.0,0.0,0.0,0.0,1.0,...,-3.61455,0.216543,-0.777,-1.012407,0.580325,0.903885,-0.527262,0.909035,-2.08205,2.95085
6_atel,0,0,427,1,0.0,0.0,0.0,0.0,0.0,1.0,...,-3.6839,0.73854,-1.7234,-0.4246,-0.22381,1.1202,-0.11795,2.5209,-2.4573,-0.34259


In [8]:
df = df.iloc[1000:-100]  # throw away some of the first and some of the last activities

In [9]:
X, y = df.iloc[:, 1:].values, df.citation_class.values.reshape(-1, 1)

### KNN

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=RS, shuffle=True)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [14]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [16]:
y_pred = knn.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, knn.predict(X_train))}")
get_report(y_test, y_pred)  

Train accuracy: 0.9469644865118615
Test accuracy:  0.4780361870226169
Report: 
              precision    recall  f1-score   support

           0       0.85      0.74      0.79      5388
           1       0.28      0.41      0.33      1317
           2       0.23      0.29      0.25       372

    accuracy                           0.65      7077
   macro avg       0.45      0.48      0.46      7077
weighted avg       0.71      0.65      0.68      7077

Confusion matrix:
[[3968 1220  200]
 [ 615  540  162]
 [ 105  160  107]]


### Tune

In [17]:
NFOLDS = 4
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RS)

scorer = make_scorer(balanced_accuracy_score, greater_is_better=True) 

model = KNeighborsClassifier()

parameters ={
    'n_neighbors': [2, 3, 5, 7, 10, 13],
}
RSCV = RandomizedSearchCV(model, parameters, scoring=scorer, cv=kf, n_iter=333, verbose=3)

In [18]:
RSCV.fit(X_train, y_train)

Fitting 4 folds for each of 6 candidates, totalling 24 fits
[CV 1/4] END .....................n_neighbors=2;, score=0.902 total time=  15.9s
[CV 2/4] END .....................n_neighbors=2;, score=0.902 total time=  13.9s
[CV 3/4] END .....................n_neighbors=2;, score=0.906 total time=  16.8s
[CV 4/4] END .....................n_neighbors=2;, score=0.900 total time=  15.5s
[CV 1/4] END .....................n_neighbors=3;, score=0.864 total time=  17.0s
[CV 2/4] END .....................n_neighbors=3;, score=0.864 total time=  16.2s
[CV 3/4] END .....................n_neighbors=3;, score=0.865 total time=  17.8s
[CV 4/4] END .....................n_neighbors=3;, score=0.859 total time=  18.0s
[CV 1/4] END .....................n_neighbors=5;, score=0.819 total time=  18.2s
[CV 2/4] END .....................n_neighbors=5;, score=0.818 total time=  16.1s
[CV 3/4] END .....................n_neighbors=5;, score=0.819 total time=  15.9s
[CV 4/4] END .....................n_neighbors=5;,

In [19]:
best_estimator = RSCV.best_estimator_

In [20]:
y_pred = best_estimator.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, best_estimator.predict(X_train))}")
get_report(y_test, y_pred)

Train accuracy: 0.9977510671513881
Test accuracy:  0.4522017268839061
Report: 
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      5388
           1       0.31      0.29      0.30      1317
           2       0.26      0.23      0.24       372

    accuracy                           0.71      7077
   macro avg       0.46      0.45      0.46      7077
weighted avg       0.70      0.71      0.70      7077

Confusion matrix:
[[4540  721  127]
 [ 824  376  117]
 [ 172  115   85]]


In [21]:
RSCV.best_params_ 

{'n_neighbors': 2}

In [28]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)

In [29]:
y_pred = knn.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, knn.predict(X_train))}")
get_report(y_test, y_pred)  

Train accuracy: 0.8207185667965807
Test accuracy:  0.5018927484575627
Report: 
              precision    recall  f1-score   support

           0       0.86      0.64      0.74      5388
           1       0.25      0.43      0.32      1317
           2       0.19      0.44      0.26       372

    accuracy                           0.59      7077
   macro avg       0.44      0.50      0.44      7077
weighted avg       0.72      0.59      0.63      7077

Confusion matrix:
[[3467 1501  420]
 [ 484  562  271]
 [  62  148  162]]


### SVC

In [33]:
clf = make_pipeline(StandardScaler(), SVC(gamma=2, C=1))
clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)
print(f"Train accuracy: {balanced_accuracy_score(y_train, clf.predict(X_train))}")
get_report(y_test, y_pred)  

Train accuracy: 0.9999890828502495
Test accuracy:  0.3333333333333333
Report: 
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      5388
           1       0.00      0.00      0.00      1317
           2       0.00      0.00      0.00       372

    accuracy                           0.76      7077
   macro avg       0.25      0.33      0.29      7077
weighted avg       0.58      0.76      0.66      7077

Confusion matrix:
[[5388    0    0]
 [1317    0    0]
 [ 372    0    0]]
