In [1]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score, balanced_accuracy_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

# Read features

In [2]:
from src.read_features import *
from src.threshold_search import *

In [3]:
STATE=42
model_dir = "./bert-base-cased-en-cola_32_3e-05_lr_0.01_decay_balanced/"
data_dir = "./data/en-cola/"
heads = 16 if "roberta" in model_dir.lower() else 12
layers = 24 if "roberta" in model_dir.lower() else 12

In [4]:
file_type = ".csv" # .csv or .tsv
train_set_name, valid_set_name, test_set_name = ("train", "dev","test") 
data_args = dict(((k, eval(k)) for k in ("data_dir", "file_type")))
(sents_train, y_train), (sents_valid, y_valid), (sents_test, y_test) = list(map(lambda x_: read_labels(x_, **data_args), 
                                                [x_ for x_ in (train_set_name, valid_set_name, test_set_name)]))

In [5]:
topological_thr = 6
features_dir = model_dir + "/features/"

In [6]:
kwargs = dict(((k, eval(k)) for k in ("features_dir", "model_dir", "topological_thr")))
kwargs["heads"] = heads
kwargs["layers"] =layers
X_train, X_valid, X_test = list(map(lambda x_: load_features(x_, **kwargs), [x_ for x_ in (train_set_name, valid_set_name, test_set_name)]))

Loading train features...: 100%|██████████| 432/432 [00:18<00:00, 23.10it/s]
Loading dev features...: 100%|██████████| 432/432 [00:02<00:00, 176.11it/s]
Loading test features...: 100%|██████████| 432/432 [00:02<00:00, 176.37it/s]


In [7]:
# # Exclude weakly connected components equal to b0 Betti number
X_train = X_train.iloc[:, ~X_train.columns.str.startswith('w')]
X_valid = X_valid.loc[:, X_train.columns]
X_test = X_test.loc[:, X_train.columns]

# # # # Removing constant and quasi-constant features
var_thr = VarianceThreshold(threshold = 0.00001)
var_thr.fit(X_valid)
not_constant_f = var_thr.get_support()
X_train = X_train.loc[:, not_constant_f]
X_valid = X_valid.loc[:, not_constant_f]
X_test = X_test.loc[:, not_constant_f]

In [8]:
X_valid.shape

(527, 8800)

In [9]:
sents_train.values

array(["Our friends won't buy this analysis, let alone the next one we propose.",
       "One more pseudo generalization and I'm giving up.",
       "One more pseudo generalization or I'm giving up.", ...,
       'It is easy to slay the Gorgon.',
       'I had the strangest feeling that I knew you.',
       'What all did you get for Christmas?'], dtype=object)

# Acceptability judgements with PCA

In [10]:
def score_mcc(y_true, y_pred):
    return matthews_corrcoef(y_true, y_pred)
score_mcc_ = make_scorer(score_mcc, greater_is_better=True)
# Print summary statistics of the results
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [11]:
X_train.shape, y_train.shape

((8551, 8800), (8551,))

In [12]:
# Parameters grid
params = {'tol': 1e-6, 'random_state': STATE, 'solver': 'liblinear', "penalty": 'l1'}
N_FEATURES_OPTIONS =  np.arange(100,300,50)
C_OPTIONS = [1e-3, 0.01, 0.1]
CLASS_WEIGHT = [None]
#'reduce_dim__n_components': N_FEATURES_OPTIONS,
from sklearn.preprocessing import *
all_preprocessors = [
    StandardScaler(),QuantileTransformer(n_quantiles=100), MinMaxScaler(), RobustScaler()
]
# all_preprocessors = [QuantileTransformer(n_quantiles=100)]
max_iter_range = [25,100]
#'clf__max_iter':max_iter_range, 
params_grid = {'reduce_dim__n_components': N_FEATURES_OPTIONS,
               'clf__max_iter':max_iter_range, 'clf__C': C_OPTIONS,
               'clf__class_weight': CLASS_WEIGHT, 
               }# 'scaler':all_preprocessors 'scaler':all_preprocessors
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA(whiten=True, random_state=STATE)),
    ('clf', LogisticRegression(**params))])
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=STATE)
ids=(np.arange(0, X_train.shape[0]), np.arange(X_train.shape[0], X_train.shape[0]+X_valid.shape[0]))
X=np.concatenate((X_train, X_valid), axis=0)
y=np.concatenate((y_train, y_valid), axis=0)
clf_ = GridSearchCV(pipeline, cv=kfold, verbose=4, param_grid=params_grid, scoring="accuracy", n_jobs=30)
start = time()
clf_.fit(X_train, y_train)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(clf_.cv_results_['params'])))
report(clf_.cv_results_, n_top=5)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
GridSearchCV took 88.50 seconds for 24 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.971 (std: 0.002)
Parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__max_iter': 25, 'reduce_dim__n_components': 250}

Model with rank: 1
Mean validation score: 0.971 (std: 0.002)
Parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__max_iter': 100, 'reduce_dim__n_components': 250}

Model with rank: 3
Mean validation score: 0.971 (std: 0.003)
Parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__max_iter': 25, 'reduce_dim__n_components': 200}

Model with rank: 3
Mean validation score: 0.971 (std: 0.003)
Parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__max_iter': 100, 'reduce_dim__n_components': 200}

Model with rank: 5
Mean validation score: 0.970 (std: 0.002)
Parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__max_iter': 25, 'reduce_dim__n_components': 100}

Model with ra

In [13]:
print_scores(clf_.best_estimator_.predict(X_valid), y_valid)
print_scores(clf_.best_estimator_.predict(X_test), y_test)

Bal. Acc. =	0.847
Accuracy =	0.856
MCC score =	0.649	

Bal. Acc. =	0.79
Accuracy =	0.806
MCC score =	0.527	



(0.8062015503875969, 0.5270938469686087)

In [14]:
print_scores(clf_.best_estimator_.predict(X_valid), y_valid)
print_scores(clf_.best_estimator_.predict(X_test), y_test)

Bal. Acc. =	0.847
Accuracy =	0.856
MCC score =	0.649	

Bal. Acc. =	0.79
Accuracy =	0.806
MCC score =	0.527	



(0.8062015503875969, 0.5270938469686087)

In [15]:
yhat_train = clf_.best_estimator_.predict_proba(X_train)[:, 1]
thr_mcc, _, _ = print_thresholds(y_train, yhat_train)
yhat_valid = clf_.best_estimator_.predict_proba(X_valid)[:, 1]
y_pred_thr_mcc_valid = np.where(yhat_valid >= thr_mcc, 1, 0)
print_scores(y_valid, y_pred_thr_mcc_valid)
yhat_test = clf_.best_estimator_.predict_proba(X_test)[:, 1]
y_pred_thr_mcc = np.where(yhat_test >= thr_mcc, 1, 0)
print_scores(y_test, y_pred_thr_mcc)

Search for the best threshold by maximizing "balanced_accuracy":
Threshold =	0.8153923144290629
Bal. Acc. =	0.9759274245665875
Accuracy =	0.973687288036487
MCC score =	0.9383722483301667	

Search for the best threshold by maximizing "matthews_corrcoef":
Threshold =	0.5575001697909184
Bal. Acc. =	0.9724230901796279
Accuracy =	0.976026195766577
MCC score =	0.942590731097539	

Bal. Acc. =	0.803
Accuracy =	0.856
MCC score =	0.649	

Bal. Acc. =	0.749
Accuracy =	0.812
MCC score =	0.543	



(0.812015503875969, 0.5427800325174282)

In [16]:
yhat_train = clf_.best_estimator_.predict_proba(X_valid)[:, 1]
thr_mcc, _, _ = print_thresholds(y_valid, yhat_train)
yhat_test = clf_.best_estimator_.predict_proba(X_test)[:, 1]
y_pred_thr_mcc = np.where(yhat_test >= thr_mcc, 1, 0)
print_scores(y_test, y_pred_thr_mcc)

Search for the best threshold by maximizing "balanced_accuracy":
Threshold =	0.8483109363718219
Bal. Acc. =	0.8423135464231355
Accuracy =	0.8671726755218216
MCC score =	0.6870130672514878	

Search for the best threshold by maximizing "matthews_corrcoef":
Threshold =	0.8483109363718219
Bal. Acc. =	0.8423135464231355
Accuracy =	0.8671726755218216
MCC score =	0.6870130672514878	

Bal. Acc. =	0.773
Accuracy =	0.81
MCC score =	0.553	



(0.810077519379845, 0.5534967168498809)

In [17]:
# import joblib
# joblib.dump(best_pipe, 'grid_search_bert_trained_en_tda_all.pkl')