In [1]:
import multiprocessing
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score, balanced_accuracy_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

In [2]:
multiprocessing.cpu_count()

40

# Read features

In [3]:
from read_features import *
from threshold_search import *

In [4]:
STATE = 42
model_dir = "./ruBert-base-ru-cola_32_0.0001_lr_0.1_decay_balanced/"
data_dir = "./data/ru-cola/"

In [5]:
file_type = ".csv" # .csv or .tsv
train_set_name, valid_set_name, test_set_name = ("train", "dev","test") 
data_args = dict(((k, eval(k)) for k in ("data_dir", "file_type")))
(sents_train, y_train), (sents_valid, y_valid), (sents_test, y_test) = list(map(lambda x_: read_labels(x_, **data_args), 
                                                [x_ for x_ in (train_set_name, valid_set_name, test_set_name)]))

In [6]:
topological_thr = 6
features_dir = model_dir + "/features/"

In [7]:
kwargs = dict(((k, eval(k)) for k in ("features_dir", "model_dir", "topological_thr")))
X_train, X_valid, X_test = list(map(lambda x_: load_features(x_, **kwargs), [x_ for x_ in (train_set_name, valid_set_name, test_set_name)]))

Loading train features...: 100%|██████████| 432/432 [00:16<00:00, 25.58it/s]
Loading dev features...: 100%|██████████| 432/432 [00:03<00:00, 133.46it/s]
Loading test features...: 100%|██████████| 432/432 [00:04<00:00, 90.69it/s] 


In [8]:
# # Exclude weakly connected components equal to b0 Betti number
# X_train = X_train.iloc[:, ~X_train.columns.str.startswith('w')]
# X_valid = X_valid.iloc[:, ~X_valid.columns.str.startswith('w')]
# X_test = X_test.iloc[:, ~X_test.columns.str.startswith("w")]

# Removing constant and quasi-constant features
var_thr = VarianceThreshold(threshold = 0.000001)
var_thr.fit(X_train)
not_constant_f = var_thr.get_support()
X_train = X_train.loc[:, not_constant_f]
X_valid = X_valid.loc[:, not_constant_f]
X_test = X_test.loc[:, not_constant_f]

# Feature selection

In [9]:
train_features = dict()
values_1 = np.argwhere(y_train == 1) # correct sents
values_0 = np.argwhere(y_train == 0) # incorrect sents
for f in X_train.columns:
    top_values_0 = X_train.loc[:, f].values[values_0]
    top_values_1 = X_train.loc[:, f].values[values_1]
    _, pval = mannwhitneyu(top_values_0, top_values_1,  alternative = 'two-sided')
    if pval:
        train_features[f] = pval

In [10]:
# Print summary statistics of the results
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [12]:
C = np.corrcoef(X_train, rowvar=False)
np.fill_diagonal(C, 0)
C[np.isnan(C)] = 0
corrs = np.abs(C[:,-1])
feature_corr_train = X_train.columns[corrs!=0]
corr_weights = np.argsort(corrs[corrs!= 0])[::-1]

In [13]:
params = {'random_state': STATE,'tol': 1e-6, 'max_iter': 10000, "C": 0.1, 
          "penalty":"l1","solver": "liblinear", "class_weight": 'balanced'}

log_reg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(**params))])
log_reg_pipeline.fit(X_train, y_train)
log_reg_coefs = np.abs(log_reg_pipeline.named_steps["clf"].coef_).reshape(-1)
feature_logreg_train = X_train.columns[log_reg_coefs!=0]
log_reg_weights = np.argsort(log_reg_coefs[log_reg_coefs!= 0])[::-1]

In [14]:
svm_params = {'random_state': STATE,'tol': 1e-6, 'max_iter': 100000, 
          "class_weight": 'balanced', "kernel":"linear"}
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', svm.SVC(**svm_params))])
svm_pipeline.fit(X_train, y_train)
svc_coefs = np.abs(svm_pipeline.named_steps["clf"].coef_).reshape(-1)
feature_svc_train = X_train.columns[svc_coefs!=0]
svc_weights = np.argsort(feature_svc_train[feature_svc_train!= 0])[::-1]

In [15]:
# Feature rankers
mwu_rank = [i[0] for i in sorted(train_features.items(), key=itemgetter(1))]
corr_rank = feature_corr_train[corr_weights]
log_reg_rank = feature_logreg_train[log_reg_weights]
svc_coefs = feature_svc_train[svc_weights]

In [16]:
# parameters grid
params = {'random_state': STATE}
N_FEATURES_OPTIONS =  np.arange(10,110,10)
C_OPTIONS = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
CLASS_WEIGHT = [None, 'balanced']
params_grid = {'clf__C': C_OPTIONS, 'clf__class_weight': CLASS_WEIGHT, 
               'clf__penalty' :["l1", "l2"], 
               "clf__solver":["liblinear"]}

In [18]:
for rank_type, range_f in zip(["corr", "mwu", "logreg", "svm" ],
                              [corr_rank,mwu_rank, log_reg_rank, svc_coefs]):
    for n_features in [10,100,500,1000,-1]:
        X_train_selected = X_train.loc[:, range_f[:n_features]]
        X_valid_selected = X_valid.loc[:, range_f[:n_features]]
        X_test_selected = X_test.loc[:, range_f[:n_features]]
        print(f"Ranker:{rank_type} #features: {X_train_selected.shape[1]}")
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', LogisticRegression(**params))])
        # Stratified sampling from training dataset
        kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=STATE)
        clf_cv = GridSearchCV(pipeline, cv=kfold, verbose=4,  param_grid=params_grid, 
                               n_jobs=15,scoring="accuracy") # ,pre_dispatch = 2
        start = time()
        clf_cv.fit(X_train_selected.values, y_train)
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
              % (time() - start, len(clf_cv.cv_results_['params'])))
        report(clf_cv.cv_results_, n_top=1)
        scl = clf_cv.best_estimator_.named_steps['scaler']
        model = clf_cv.best_estimator_.named_steps['clf']
        X_scaled_train = scl.transform(X_train_selected.values)
        model.fit(X_train_selected, y_train)
        yhat_train = clf_cv.predict_proba(X_train_selected)[:, 1]
        thr_mcc, _, _ = print_thresholds(y_train, yhat_train)
        yhat_valid = clf_cv.predict_proba(X_valid_selected)[:, 1]
        y_pred_thr_mcc_valid = np.where(yhat_valid >= thr_mcc, 1, 0)
        print_scores(y_valid, y_pred_thr_mcc_valid)
        yhat_test = clf_cv.predict_proba(X_test_selected)[:, 1]
        y_pred_thr_mcc = np.where(yhat_test >= thr_mcc, 1, 0)
        print_scores(y_test, y_pred_thr_mcc)
#         with open(f'{model_dir}/{ranker_type}_{n_features}.npy', 'wb') as f:
#             np.save(f,y_pred_thr_mcc)

Ranker:corr #features: 10
Fitting 3 folds for each of 40 candidates, totalling 120 fits
GridSearchCV took 1.86 seconds for 40 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.913 (std: 0.004)
Parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}

Search for the best threshold by maximizing "matthews_corrcoef":
Threshold =	0.987274728615001
Bal. Acc. =	0.9298574420020889
Accuracy =	0.9096454441479223
MCC score =	0.7964487622022239	

Search for the best threshold by maximizing "balanced_accuracy":
Threshold =	0.987274728615001
Bal. Acc. =	0.9298574420020889
Accuracy =	0.9096454441479223
MCC score =	0.7964487622022239	

Bal. Acc. =	0.705
Accuracy =	0.789
MCC score =	0.426	

Bal. Acc. =	0.589
Accuracy =	0.572
MCC score =	0.171	

Ranker:corr #features: 100
Fitting 3 folds for each of 40 candidates, totalling 120 fits
GridSearchCV took 2.16 seconds for 40 candidate parameter settings.
Model with rank: 1
Mean validati