## Binary structure classification used in tree building: Step 2. Feature-rich approach

Train models, save the best one.

Output:
 - ``models/structure_predictor_baseline/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import pickle

# Seed handed to the estimators trained later in this notebook so runs are repeatable.
random_state = 45

### Make a directory

In [None]:
import os

# Directory that receives the pickled model, scaler and dropped-column list.
model_path = 'models/structure_predictor_baseline'
# makedirs (unlike mkdir) also creates the missing parent ``models/`` directory,
# and exist_ok=True makes the cell a no-op when it is re-run.
os.makedirs(model_path, exist_ok=True)

### Prepare train/dev/test sets

In [None]:
IN_PATH = 'data_structure'


def _load_split(file_name):
    """Unpickle one sample split stored under ``IN_PATH``."""
    return pd.read_pickle(os.path.join(IN_PATH, file_name))


# The train, dev and test splits live in separate pickle files.
train_samples = _load_split('train_samples.pkl')
dev_samples = _load_split('dev_samples.pkl')
test_samples = _load_split('test_samples.pkl')

In [None]:
# Columns removed before training. Judging by their names they hold raw text,
# tokens, POS tags and bookkeeping ids rather than numeric features (TODO confirm).
drop_columns = ['snippet_x', 'snippet_y', 'category_id', 
                'snippet_x_tmp', 'snippet_y_tmp', 
                'filename', 'order', 'postags_x', 'postags_y',
                'is_broken', 'tokens_x', 'tokens_y']


def split_features_target(samples, target='relation', excluded=drop_columns):
    """Split a samples frame into ``(y, X)``.

    Parameters
    ----------
    samples : pandas.DataFrame
        Frame containing the ``target`` column and every column in ``excluded``.
    target : str
        Name of the label column.
    excluded : list of str
        Columns to leave out of the feature matrix.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The label as a single-column frame, and the feature frame without the
        label and without the excluded columns.

    Raises
    ------
    KeyError
        If ``target`` or any excluded column is missing from ``samples``.
    """
    y = samples[target].to_frame()
    # One drop call is enough: 'category_id' is already part of drop_columns,
    # so it does not need to be appended a second time.
    X = samples.drop(columns=[target] + list(excluded))
    return y, X


y_train, X_train = split_features_target(train_samples)
y_dev, X_dev = split_features_target(dev_samples)
y_test, X_test = split_features_target(test_samples)

In [None]:
# Columns with a single distinct value in the training split carry no signal.
# nunique(dropna=False) counts distinct values inside pandas (no per-cell Python
# set) and treats missing values as one value, so an all-NaN column is also
# recognised as constant.
constants = [c for c in X_train.columns if X_train[c].nunique(dropna=False) == 1]

In [None]:
# Remove the constant columns (found on the training split) from every split.
X_train, X_dev, X_test = [
    split.drop(columns=constants) for split in (X_train, X_dev, X_test)
]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Temporary scaling to [-1, 1], used only for the OLS-based feature screening;
# the result goes to _X_train and X_train itself is left untouched here.
std_scaler = MinMaxScaler(feature_range=(-1,1))
std_scaler.fit(X_train.values)
_X_train = pd.DataFrame(
    data=std_scaler.transform(X_train.values),
    index=X_train.index,
    columns=X_train.columns,
)

In [None]:
import statsmodels.api as sm

# Fit an ordinary least squares model of the label on the scaled features
# (plus an intercept column named 'const'); its p-values are inspected afterwards.
_X_train = sm.add_constant(_X_train)
model = sm.OLS(endog=y_train, exog=_X_train)
results = model.fit()

In [None]:
# Display the p-value of every term in the fitted OLS model.
results.pvalues

In [None]:
# print(results.summary())

In [None]:
# Features whose OLS p-value exceeds 0.2 are treated as noise and removed from
# every split; the intercept term 'const' is not a feature and is skipped.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
noisy_feats = [feature for feature, value in results.pvalues.items() if value > 0.2 and feature != 'const']

X_train = X_train.drop(columns=noisy_feats)
X_dev = X_dev.drop(columns=noisy_feats)
X_test = X_test.drop(columns=noisy_feats)

### Classifier training

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Final scaler: fitted on the training split only (after feature selection)
# and then applied unchanged to all three splits.
std_scaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train.values)


def _rescale(frame):
    """Return ``frame`` transformed by ``std_scaler``, keeping its index and columns."""
    scaled = std_scaler.transform(frame.values)
    return pd.DataFrame(scaled, index=frame.index, columns=frame.columns)


X_train = _rescale(X_train)
X_dev = _rescale(X_dev)
X_test = _rescale(X_test)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier

n_estimators = 10
# Bagging ensemble of linear SVMs: every member is fitted on a random
# 1/n_estimators share of the rows and 90% of the features.
model = BaggingClassifier(LinearSVC(random_state=random_state, C=0.01, class_weight='balanced'),
                          max_samples=1. / n_estimators, max_features=0.9,
                          n_estimators=n_estimators, n_jobs=-1,
                          # Seed the row/feature sampling too; without it each run
                          # trains a different ensemble despite the seeded SVMs.
                          random_state=random_state)
# Pass the label as a 1-D array: a single-column DataFrame makes scikit-learn
# emit a DataConversionWarning before flattening it itself.
model.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn import metrics

# Score the classifier on the development split: F1, precision and recall as
# percentages, followed by the full per-class report.
predicted = model.predict(X_dev)
_dev_scorers = (('f1: %.2f', metrics.f1_score),
                ('pr: %.2f', metrics.precision_score),
                ('re: %.2f', metrics.recall_score))
for _template, _scorer in _dev_scorers:
    print(_template % (_scorer(y_dev, predicted) * 100.))
print()
print(metrics.classification_report(y_dev, predicted, digits=4))

In [None]:
# Same report as for the development split, now on the held-out test split.
predicted = model.predict(X_test)
_test_scorers = (('f1: %.2f', metrics.f1_score),
                 ('pr: %.2f', metrics.precision_score),
                 ('re: %.2f', metrics.recall_score))
for _template, _scorer in _test_scorers:
    print(_template % (_scorer(y_test, predicted) * 100.))
print()
print(metrics.classification_report(y_test, predicted, digits=4))

In [None]:
# Attach the class labels to the model object so they are pickled with it.
model.labels = ["0", "1"]

# Persist the classifier, the fitted scaler and the full list of columns that
# were removed from the raw samples (constant + non-feature + noisy columns).
_artifacts = [
    ('model.pkl', model),
    ('scaler.pkl', std_scaler),
    ('drop_columns.pkl', constants+drop_columns+noisy_feats),
]
for _file_name, _artifact in _artifacts:
    # The context manager flushes and closes each file even if pickling fails;
    # the bare open(...) calls used before never closed their handles.
    with open(os.path.join(model_path, _file_name), 'wb') as _handle:
        pickle.dump(_artifact, _handle)