## Rhetorical relations classification used in tree building: Step 2. Feature-rich approach

Train models, save the best one.

Output:
 - ``models/relation_predictor_baseline/*``

In [None]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from catboost import Pool
from matplotlib import rcParams
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import column_or_1d

# Matplotlib defaults for exported figures.
# NOTE(review): 42 presumably selects TrueType font embedding in PDF output — confirm against the matplotlib rcParams docs.
rcParams['pdf.fonttype'] = 42
rcParams['font.sans-serif'] = 'Arial'
import seaborn as sns

sns.set_style("whitegrid")
%matplotlib inline

# Seed handed to the CatBoost and logistic-regression estimators defined further down.
random_state = 45

In [None]:
# Create the directory that receives every artefact pickled below.
# Unlike a bare `mkdir`, this is safe to re-run and also creates the parent
# `models/` directory when it does not exist yet.
os.makedirs('models/relation_predictor_baseline', exist_ok=True)

### Record the data fields to drop (constant and non-feature columns)

In [None]:
IN_PATH = 'data_labeling'

train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

# Constant columns are detected over all three splits together, with missing
# values replaced by 0 first.
all_samples = pd.concat([train_samples, dev_samples, test_samples]).fillna(0.)

# tokens_x / tokens_y are excluded from the constancy check; they are dropped
# unconditionally through `to_drop` below.
constants = [c for c in all_samples.columns
             if c not in ('tokens_x', 'tokens_y') and len(set(all_samples[c])) == 1]
to_drop = ['snippet_x', 'snippet_y', 'snippet_x_tmp', 'snippet_y_tmp', 'filename', 'order', 'postags_x', 'postags_y',
           'tokens_x', 'tokens_y']
del all_samples  # the concatenated copy is only needed for the check above

# Persist the list of columns to drop; the context manager guarantees the
# file is flushed and closed.
with open('models/relation_predictor_baseline/drop_columns.pkl', 'wb') as drop_columns_file:
    pickle.dump(constants + to_drop, drop_columns_file)

### Prepare label encoder 

In [None]:
class MyLabelEncoder(LabelEncoder):
    """LabelEncoder variant whose ``fit`` keeps the given labels as they are.

    NOTE(review): this class is not instantiated anywhere in the visible
    notebook (a plain ``LabelEncoder`` is used instead) — confirm it is
    still needed.
    """

    def fit(self, y):
        """Store the flattened labels in ``classes_`` and return ``self``.

        The labels are neither deduplicated nor sorted here; ``classes_``
        becomes a pandas Series holding exactly the values passed in.
        """
        flat_labels = column_or_1d(y, warn=True)
        self.classes_ = pd.Series(flat_labels)
        return self

### Prepare train/test sets

In [None]:
IN_PATH = 'data_labeling'

# Read the train / dev / test sample tables from IN_PATH, in that order.
train_samples, dev_samples, test_samples = (
    pd.read_pickle(os.path.join(IN_PATH, pickle_name))
    for pickle_name in ('train_samples.pkl', 'dev_samples.pkl', 'test_samples.pkl')
)

In [None]:
# Absolute number of training samples per relation, as returned by value_counts().
relation_frequencies = train_samples['relation'].value_counts(normalize=False)
counts = relation_frequencies.values
print(counts)

In [None]:
# Same per-relation counts, shown together with the relation names.
train_samples['relation'].value_counts()

In [None]:
# Read back the list of columns to drop; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open('models/relation_predictor_baseline/drop_columns.pkl', 'rb') as drop_columns_file:
    drop_columns = pickle.load(drop_columns_file)

In [None]:
def _split_target_and_features(samples):
    """Return the ``relation`` column as a one-column frame plus the feature frame.

    The feature frame is ``samples`` without ``relation``, without every
    column listed in ``drop_columns`` and without ``category_id``.
    """
    target = samples['relation'].to_frame()
    features = samples.drop('relation', axis=1).drop(columns=drop_columns + ['category_id'])
    return target, features


y_train, _X_train = _split_target_and_features(train_samples)
y_dev, _X_dev = _split_target_and_features(dev_samples)
y_test, _X_test = _split_target_and_features(test_samples)

In [None]:
# The scaler is fitted on the training features only and then applied to all
# three splits.
scaler = StandardScaler().fit(_X_train)

# The scaled frames keep the original row index but NOT the column names:
# their columns are positional integers. Use the `_X_*` frames when feature
# names are needed.
X_train = pd.DataFrame(scaler.transform(_X_train), index=_X_train.index)
X_dev = pd.DataFrame(scaler.transform(_X_dev), index=_X_dev.index)
X_test = pd.DataFrame(scaler.transform(_X_test), index=_X_test.index)

# Persist the fitted scaler; the context manager guarantees the file is
# flushed and closed.
with open('models/relation_predictor_baseline/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
lab_encoder = LabelEncoder()

# Encode straight from the source column rather than from `y_train` itself:
# the cell can then be re-run without fitting the encoder on already-encoded
# integers, and the encoder receives 1-D input instead of a one-column frame.
y_train = lab_encoder.fit_transform(train_samples['relation'])

# y_dev / y_test are left as raw relation labels; the evaluation cells compare
# them with predictions mapped back through `lab_encoder.inverse_transform`.

with open('models/relation_predictor_baseline/label_encoder.pkl', 'wb') as label_encoder_file:
    pickle.dump(lab_encoder, label_encoder_file)

In [None]:
# NOTE(review): this pool is not passed to any estimator in this cell — confirm
# whether it is still needed.
eval_dataset = Pool(data=X_dev,
                    label=y_dev)

# Per-class weights indexed by encoded class id. `value_counts()` returns the
# counts in frequency order, which does not match the label-encoder order that
# the class-weight list is applied in, so the counts are taken from the
# encoded training labels instead. Each weight is the class count divided by
# the smallest class count, as before.
# NOTE(review): these weights grow with class frequency; if the aim is to
# counter class imbalance the inverse ratio is probably wanted — confirm.
class_counts = np.bincount(y_train)
class_weights = class_counts / class_counts.min()

catboost = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    custom_loss=['F1'],
    random_seed=random_state,
    verbose=2,
    loss_function='MultiClass',
    class_weights=class_weights,
    eval_metric='TotalF1'
)

# L1-penalised logistic regression selects the features that CatBoost is then
# trained on. The `saga` solver is stochastic, so it is seeded like the other
# estimators to keep the selected feature set reproducible.
fs_catboost = Pipeline([
  ('feature_selection', SelectFromModel(LogisticRegression(penalty='l1', solver='saga', C=1., n_jobs=-1,
                                                           random_state=random_state))),
  ('classification', catboost)
])

logreg = LogisticRegression(random_state=random_state,
                            solver='lbfgs',
                            n_jobs=-1,
                            C=0.002,
                            multi_class='multinomial',
                            class_weight='balanced')

# Soft voting combines the two members' predicted class probabilities.
fs_catboost_plus_logreg = VotingClassifier(
    [('fs_catboost', fs_catboost), ('logreg', logreg)], voting='soft', n_jobs=-1)

In [None]:
# Train the voting ensemble on the scaled training features and the encoded relation labels.
fs_catboost_plus_logreg.fit(X_train, y_train)

In [None]:
# Persist the fitted ensemble; the context manager guarantees the (potentially
# large) pickle is fully flushed and the handle closed.
with open('models/relation_predictor_baseline/model.pkl', 'wb') as model_file:
    pickle.dump(fs_catboost_plus_logreg, model_file)

### Load & predict 

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards.

    Only use this on artefacts written by this notebook: unpickling files
    from an untrusted source can execute arbitrary code.
    """
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


fs_catboost_plus_logreg = _load_pickle('models/relation_predictor_baseline/model.pkl')
lab_encoder = _load_pickle('models/relation_predictor_baseline/label_encoder.pkl')
scaler = _load_pickle('models/relation_predictor_baseline/scaler.pkl')

#### 1. (Optional) Explore the feature importances in case we could clean up some vocabularies

The first step of the pipeline, an L1-regularised logistic regression used for feature selection, shows which features are never used by the pipeline.

In [None]:
# Feature-selection step (first step of the first ensemble member).
lg1 = fs_catboost_plus_logreg.estimators_[0].steps[0][1]

# Names of the features the selector rejected. The names come from `_X_dev`
# because the scaled `X_dev` frame only carries positional integer columns;
# the selector's boolean support mask is read once instead of recomputing the
# selected names for every feature.
lg1_filtered = [feature for feature, is_selected in zip(_X_dev.keys(), lg1.get_support())
                if not is_selected]

In [None]:
# Show how many features the selection step left out, followed by the list itself.
len(lg1_filtered), lg1_filtered  # Look at them, clean up the feature extractor's vocabulary

The second step of the pipeline is CatBoost; below are its feature importances.

In [None]:
# Pair every feature that survived selection with the importance CatBoost
# (second step of the first ensemble member) assigned to it.
selected_feature_names = lg1.get_feature_names_out(input_features=_X_dev.keys())
fitted_catboost = fs_catboost_plus_logreg.estimators_[0].steps[1][1]

fil = pd.DataFrame({
    'feature': selected_feature_names,
    'f': fitted_catboost.feature_importances_,
})

# Five most important features.
fil.sort_values('f', ascending=False).head(5)

In [None]:
# Selected features to which CatBoost assigned zero importance.
fil.loc[fil['f'] == 0, 'feature'].values

#### Prediction

In [None]:
# Rebuild the scaled dev features with the scaler currently in memory.
# Column names are not carried over; only the row index is kept.
X_scaled_np = scaler.transform(_X_dev)
X_dev = pd.DataFrame(data=X_scaled_np, index=_X_dev.index)

In [None]:
# Predict encoded class ids on the dev split and map them back to relation labels.
dev_class_ids = fs_catboost_plus_logreg.predict(X_dev)
predicted = lab_encoder.inverse_transform(dev_class_ids)

dev_truth = y_dev.values
for caption, averaging in (('weighted f1: ', 'weighted'), ('macro f1: ', 'macro')):
    print(caption, metrics.f1_score(dev_truth, predicted, average=averaging))
print('accuracy: ', metrics.accuracy_score(dev_truth, predicted))
print()
print(metrics.classification_report(y_dev, predicted, digits=4))

In [None]:
# Rebuild the scaled test features with the scaler currently in memory.
# Column names are not carried over; only the row index is kept.
X_scaled_np = scaler.transform(_X_test)
X_test = pd.DataFrame(data=X_scaled_np, index=_X_test.index)

In [None]:
# Predict encoded class ids on the test split and map them back to relation labels.
test_class_ids = fs_catboost_plus_logreg.predict(X_test)
predicted = lab_encoder.inverse_transform(test_class_ids)

test_truth = y_test.values
for caption, averaging in (('weighted f1: ', 'weighted'), ('macro f1: ', 'macro')):
    print(caption, metrics.f1_score(test_truth, predicted, average=averaging))
print('accuracy: ', metrics.accuracy_score(test_truth, predicted))
print()
print(metrics.classification_report(y_test, predicted, digits=4))

In [None]:
test_metrics = metrics.classification_report(y_test, predicted, digits=4, output_dict=True)

# Collect the F1 score of every dict-valued entry of the report (scalar
# entries are skipped) and express the scores as percentages.
f1_scores = [entry.get('f1-score') for entry in test_metrics.values() if isinstance(entry, dict)]
test_f1 = np.array(f1_scores) * 100

test_f1