## Binary structure classification used in tree building: Step 2. Feature-rich approach

Train models, save the best one.

Output:
 - ``models/structure_predictor_baseline/*``

In [1]:
# Reload imported modules automatically before each cell runs (autoreload mode 2),
# so edits to imported project modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import pandas as pd
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation
from utils.prepare_sequence import _prepare_sequence
from tqdm import tqdm_notebook as tqdm

# Seed handed to the estimators created in this notebook (via their
# `random_state` argument) so that training runs are repeatable.
random_state = 45

### Make a directory

In [3]:
import os

# Directory that receives the trained model, the fitted scaler and the list of
# dropped columns written by this notebook.
model_path = 'models/structure_predictor_baseline'
# os.makedirs also creates the missing parent `models/` directory, and
# exist_ok=True keeps the cell safe to re-run. The previous call,
# os.path.mkdir, does not exist and raised AttributeError whenever the
# directory was absent.
os.makedirs(model_path, exist_ok=True)

### Prepare train/dev/test sets

In [4]:
# Folder holding the pickled sample frames read below.
IN_PATH = 'data_structure'

# Load the three splits in one pass; the order of the file names matches the
# order of the targets on the left-hand side.
train_samples, dev_samples, test_samples = (
    pd.read_pickle(os.path.join(IN_PATH, filename))
    for filename in ('train_samples.pkl', 'dev_samples.pkl', 'test_samples.pkl')
)

In [8]:
# Columns removed from every split before training (the `relation` target is
# removed separately by the helper below).
drop_columns = ['snippet_x', 'snippet_y', 'category_id',
                'snippet_x_tmp', 'snippet_y_tmp',
                'filename', 'order', 'postags_x', 'postags_y',
                'is_broken', 'tokens_x', 'tokens_y']


def _split_target_and_features(samples, target='relation'):
    """Split one samples frame into its target and its feature matrix.

    Args:
        samples: DataFrame containing the ``target`` column and every column
            listed in ``drop_columns``.
        target: name of the label column.

    Returns:
        ``(y, X)`` where ``y`` is a one-column DataFrame holding the target and
        ``X`` is ``samples`` without the target and without ``drop_columns``.

    Raises:
        KeyError: if the target or any of ``drop_columns`` is missing.
    """
    y = samples[target].to_frame()
    # `category_id` is already part of `drop_columns`, so it is not appended a
    # second time as the original copy-pasted statements did.
    X = samples.drop(columns=[target] + drop_columns)
    return y, X


y_train, X_train = _split_target_and_features(train_samples)
y_dev, X_dev = _split_target_and_features(dev_samples)
y_test, X_test = _split_target_and_features(test_samples)

In [9]:
# Names of the features that take exactly one distinct value on the training
# split. `nunique` is vectorised (the previous `len(set(column))` walked every
# value in Python) and, with dropna=False, counts NaN as a single value, so an
# all-NaN column is also reported as constant.
_distinct_per_column = X_train.nunique(dropna=False)
constants = _distinct_per_column[_distinct_per_column == 1].index.tolist()

In [10]:
# Remove the constant training features from all three splits so that they
# keep identical column sets.
X_train, X_dev, X_test = (
    features.drop(columns=constants)
    for features in (X_train, X_dev, X_test)
)

### Training classifiers

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# NOTE: despite its name, `std_scaler` is a MinMaxScaler. It is fitted on the
# training features only and then applied unchanged to every split.
std_scaler = MinMaxScaler()
std_scaler.fit(X_train.values)


def _rescale(features):
    """Return `features` transformed by `std_scaler`, keeping index and column labels."""
    scaled_values = std_scaler.transform(features.values)
    return pd.DataFrame(scaled_values, index=features.index, columns=features.columns)


X_train = _rescale(X_train)
X_dev = _rescale(X_dev)
X_test = _rescale(X_test)

In [12]:
# Display (rows, columns) of the training feature frame after the constant
# columns were dropped and the values were rescaled.
X_train.shape

(47372, 2029)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Baseline classifier: L2-penalised logistic regression with a very small C
# (C is the inverse regularisation strength) and class weights balanced by
# label frequency.
model = LogisticRegression(solver='lbfgs', C=0.0005, n_jobs=4, class_weight='balanced', random_state=random_state)
# `y_train` is a one-column DataFrame; flatten it to a 1-d array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
model.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.0005, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=4, penalty='l2',
                   random_state=45, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
from sklearn import metrics


# Score the current `model` on the test split: precision, recall and F1 first,
# then the full per-class report.
predicted = model.predict(X_test)
for label, scorer in (('pr:', metrics.precision_score),
                      ('re:', metrics.recall_score),
                      ('f1:', metrics.f1_score)):
    print(label, scorer(y_test, predicted))
print()
print(metrics.classification_report(y_test, predicted))

pr: 0.6042488619119879
re: 0.709045584045584
f1: 0.6524660003277077

              precision    recall  f1-score   support

           0       0.84      0.77      0.81      5744
           1       0.60      0.71      0.65      2808

    accuracy                           0.75      8552
   macro avg       0.72      0.74      0.73      8552
weighted avg       0.77      0.75      0.76      8552



In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# LinearSVC is the estimator searched below. It was previously imported only in
# a later cell, so this cell raised NameError under Restart & Run All.
from sklearn.svm import LinearSVC

# Candidate values for the SVM regularisation parameter C.
params = {
    'C': [0.01, 0.1, 0.5, 1.],
}
# 10-fold cross-validated search on the training split, ranked by macro F1.
model = GridSearchCV(estimator=LinearSVC(random_state=random_state, class_weight='balanced'),
                     param_grid=params,
                     scoring='f1_macro',
                     n_jobs=-1, cv=10, verbose=2)
# `y_train` is a one-column DataFrame; flatten it to a 1-d array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
model.fit(X_train, y_train.values.ravel())
# Show the winning parameter set together with its mean cross-validated score.
model.best_params_, model.best_score_

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:  7.9min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  9.7min finished
  y = column_or_1d(y, warn=True)


({'C': 0.1}, 0.7692939929204073)

In [15]:
from sklearn.svm import LinearSVC

# Linear SVM with C=0.1, the value the grid-search cell reports as best, and
# class weights balanced by label frequency.
model = LinearSVC(random_state=random_state, C=0.1, class_weight='balanced')
# `y_train` is a one-column DataFrame; flatten it to a 1-d array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
model.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


LinearSVC(C=0.1, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=45, tol=0.0001,
          verbose=0)

In [17]:
from sklearn import metrics

# Score the current `model` on the dev split; the headline metrics are printed
# as percentages, followed by the per-class report.
predicted = model.predict(X_dev)
for template, score_fn in (('f1: %.2f', metrics.f1_score),
                           ('pr: %.2f', metrics.precision_score),
                           ('re: %.2f', metrics.recall_score)):
    print(template % (score_fn(y_dev, predicted) * 100.))
print()
print(metrics.classification_report(y_dev, predicted, digits=4))

f1: 72.03
pr: 66.91
re: 77.99

              precision    recall  f1-score   support

           0     0.8582    0.7754    0.8147      6875
           1     0.6691    0.7799    0.7203      4003

    accuracy                         0.7771     10878
   macro avg     0.7636    0.7777    0.7675     10878
weighted avg     0.7886    0.7771    0.7800     10878



In [18]:
# Score the current `model` on the held-out test split; the headline metrics
# are printed as percentages, followed by the per-class report.
predicted = model.predict(X_test)
test_f1 = metrics.f1_score(y_test, predicted)
test_precision = metrics.precision_score(y_test, predicted)
test_recall = metrics.recall_score(y_test, predicted)
print('f1: %.2f' % (test_f1 * 100.))
print('pr: %.2f' % (test_precision * 100.))
print('re: %.2f' % (test_recall * 100.))
print()
print(metrics.classification_report(y_test, predicted, digits=4))

f1: 72.11
pr: 66.10
re: 79.31

              precision    recall  f1-score   support

           0     0.8744    0.7798    0.8244      5187
           1     0.6610    0.7931    0.7211      2808

    accuracy                         0.7845      7995
   macro avg     0.7677    0.7865    0.7727      7995
weighted avg     0.7995    0.7845    0.7881      7995



In [20]:
# Attach the class label names to the estimator so they are pickled with it.
# NOTE(review): presumably read by the code that loads model.pkl; confirm.
model.labels = ["0", "1"]

# Everything needed to reproduce the preprocessing at inference time:
# the classifier, the fitted scaler, and every non-target column that was
# removed from the samples (constant features + explicitly dropped columns).
artifacts = {
    'model.pkl': model,
    'scaler.pkl': std_scaler,
    'drop_columns.pkl': constants + drop_columns,
}
for artifact_name, artifact in artifacts.items():
    # `with` closes (and flushes) each file deterministically; the previous
    # bare open() calls left the handles to the garbage collector.
    with open(os.path.join(model_path, artifact_name), 'wb') as fout:
        pickle.dump(artifact, fout)