In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the local helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Experiments with the label classification method presented in "CLASSIFICATION MODELS FOR RST DISCOURSE PARSING OF TEXTS IN RUSSIAN"
http://www.dialog-21.ru/media/4595/chistovaevplusetal-076.pdf

### 1. Get code for feature extraction

In [None]:
%%bash
# Download the feature-extraction code into ./rurst2019 and patch it in place
# with sed so that it matches this repository's layout.

# Stop at the first failing command (failed download, missing file, ...) instead
# of silently patching whatever happens to be in the current directory.
set -e

# -f / -p: a fresh checkout has no rurst2019 directory yet, and re-runs must not fail either.
rm -rf rurst2019
mkdir -p rurst2019
cd rurst2019
wget -q http://nlp.isa.ru/paper_dialog2019/utils/meaningfulwords_v3.py
wget -q http://nlp.isa.ru/paper_dialog2019/utils/language_features.py
wget -q http://nlp.isa.ru/paper_dialog2019/utils/features_processor.py

# some external modules are structurally the same but have other paths
sed -i "s|utils/tf_idf_pipeline.save|models/tf_idf/pipeline.pkl|g" features_processor.py  # tf-idf pipeline
sed -i "s|models_w2v/model2_tokenized|models/w2v/segmentator/model2_tokenized|g" features_processor.py  # w2v model

# also some fixes of the feature extractor:
# the listed column names are replaced by a newline (i.e. removed from the code),
# and the division by len(row) gets a small epsilon so that an empty row cannot divide by zero
sed -i "s|'common_root_fpos',|\n|g" features_processor.py
sed -i "s|'common_root_att',|\n|g" features_processor.py
sed -i "s|'common_root'|\n|g" features_processor.py
sed -i "s|/ len(row))|/ (len(row) + 1e-8))|g" features_processor.py
sed -i "s|'tokens_x', 'tokens_y',|\n|g" features_processor.py
# disabled patch (kept for reference; note that it has no target file argument):
#sed -i "s|return [self.annotations['tokens'][i].text for i in range(begin, end)]|result = [self.annotations['tokens'][i].text for i in range(begin, end)]\n        if result:\n            return result\n        return ['_']|g"

### 2. Extract features 
The features are extracted in the same way as in ``1_data_extraction.ipynb``, but through a different interface.

In [None]:
# FeaturesProcessor comes from features_processor.py, which the bash cell above
# downloads into ./rurst2019 and patches.
from rurst2019.features_processor import FeaturesProcessor

# Single shared extractor instance; it is called as a function on
# (table, annotation) pairs in the extraction loop below.
# verbose=False presumably silences progress output -- TODO confirm in features_processor.py.
features_processor = FeaturesProcessor(verbose=False)

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm
from utils.file_reading import read_gold, read_annotation


IN_PATH = 'data/'
JSON_SUFFIX = '.json'

# For every *.json file in IN_PATH: read the gold table and the annotation stored
# under the same base name, compute the features and pickle them next to the
# input as '<base>.gold.pkl.oldf'.
for file in tqdm(glob.glob("%s*.json" % IN_PATH)):
    # Strip only the trailing extension. str.replace('.json', '') would also
    # rewrite any other '.json' occurring earlier in the path.
    base_path = file[:-len(JSON_SUFFIX)]

    table = read_gold(base_path)
    # Keep only the rows in which both snippet_x and snippet_y are non-empty.
    table = table[table.snippet_x.map(len) > 0]
    table = table[table.snippet_y.map(len) > 0]

    annot = read_annotation(base_path)
    features = features_processor(table, annot)
    features.to_pickle(base_path + '.gold.pkl.oldf')

### 3. Classification model 

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from matplotlib import rcParams
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#from imblearn.over_sampling import SMOTE
#from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from keras.wrappers.scikit_learn import KerasClassifier
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical
from keras.layers import Input, Dense, Dropout, BatchNormalization, Activation
from keras.models import Model
from keras.optimizers import Adam
import tensorflow as tf
from catboost import CatBoostClassifier
from time import time
import pickle

# Matplotlib / seaborn configuration for the figures of this notebook.
# pdf.fonttype 42 embeds fonts as TrueType in exported PDFs.
rcParams['pdf.fonttype'] = 42
rcParams['font.sans-serif'] = 'Arial'
import seaborn as sns

sns.set_style("whitegrid")
%matplotlib inline

# Seed passed to the estimators below.
# NOTE(review): this value is reassigned to 41 in the data-loading cell further
# down, so everything executed after that cell uses 41 -- confirm which is intended.
random_state = 42

In [None]:
from utils.train_test_split import split_train_dev_test

# Three collections of file names, one per split. The loading cell below
# substitutes '.edus' in every entry, so the entries are presumably paths to
# '.edus' files under ./data -- TODO confirm in utils/train_test_split.py.
train, dev, test = split_train_dev_test('./data')

In [None]:
import pandas as pd

random_state = 41


def _read_shuffled(files):
    """Read the pickled feature table of every file, stack them and shuffle the rows.

    Each entry of ``files`` is mapped to its feature pickle by replacing
    '.edus' with '.gold.pkl.oldf'. The rows of the concatenated frame are
    shuffled with the notebook-level ``random_state`` and re-indexed from 0.
    """
    frames = [pd.read_pickle(path.replace('.edus', '.gold.pkl.oldf')) for path in files]
    stacked = pd.concat(frames)
    return stacked.sample(frac=1, random_state=random_state).reset_index(drop=True)


train_samples = _read_shuffled(train)
dev_samples = _read_shuffled(dev)
test_samples = _read_shuffled(test)

In [None]:
TARGET = 'category_id'
MAX_LEN = 100

# (labels to merge, label they are merged into); applied in this order to the
# raw category ids.
CATEGORY_MERGES = [
    (['antithesis_r'], 'contrast_m'),
    (['cause_r', 'effect_r'], 'cause-effect_r'),
    (['conclusion_r'], 'restatement_m'),
    (['evaluation_r'], 'interpretation-evaluation_r'),
    (['motivation_r'], 'condition_r'),
]

# Same structure, applied in this order to the combined '<category>_<order>' labels.
# NOTE(review): 'evaluation_r' is renamed to 'interpretation-evaluation_r' above,
# so 'evaluation_SN' can only match if some other category id yields the
# 'evaluation_' prefix -- confirm that this entry is not dead.
RELATION_MERGES = [
    (['restatement_SN', 'restatement_NS'], 'restatement_NN'),
    (['contrast_SN', 'contrast_NS'], 'contrast_NN'),
    (['solutionhood_NS', 'preparation_NS'], 'elaboration_NS'),
    (['concession_SN', 'evaluation_SN', 'elaboration_SN', 'evidence_SN'], 'preparation_SN'),
]


def _prepare_relation_labels(samples, category_column, max_len):
    """Normalise the labels of one split and drop over-long pairs.

    Steps (identical for train, dev and test):
      1. merge rare category ids according to CATEGORY_MERGES;
      2. build the 'relation' column: the category id without its last
         character (e.g. 'contrast_m' -> 'contrast_') followed by the value
         of the 'order' column;
      3. merge relation labels according to RELATION_MERGES;
      4. keep only the rows where both tokens_x and tokens_y are shorter
         than ``max_len``.

    The label columns are written to ``samples`` itself; the returned frame
    is the filtered selection of it.
    """
    for merged, into in CATEGORY_MERGES:
        samples[category_column] = samples[category_column].replace(merged, into)

    samples['relation'] = samples[category_column].map(lambda label: label[:-1]) + samples['order']
    for merged, into in RELATION_MERGES:
        samples['relation'] = samples['relation'].replace(merged, into)

    samples = samples[samples.tokens_x.map(len) < max_len]
    samples = samples[samples.tokens_y.map(len) < max_len]
    return samples


train_samples = _prepare_relation_labels(train_samples, TARGET, MAX_LEN)
dev_samples = _prepare_relation_labels(dev_samples, TARGET, MAX_LEN)
test_samples = _prepare_relation_labels(test_samples, TARGET, MAX_LEN)

# From here on the classification target is the merged relation label.
TARGET = 'relation'

In [None]:
# Number of training rows per target label (TARGET is 'relation' at this point),
# most frequent label first.
train_samples[TARGET].value_counts()

In [None]:
import numpy as np

# Per-class sample counts of the training split as a float array, most
# frequent class first (value_counts sorts in descending order).
# Computed from the data instead of being pasted from a previous output, so the
# array always has exactly one entry per class present in the training split.
# NOTE(review): the order is by frequency, not by encoded class id; the model
# cell below uses counts / counts[-1] as class weights -- confirm that this
# alignment is intended.
counts = train_samples[TARGET].value_counts().values.astype(float)

In [None]:
# Show the first training row as a raw array to inspect the feature values.
train_samples.head(1).values

In [None]:
# Columns that are not model features (texts, tokens, bookkeeping).
drop_columns = ['snippet_x', 'snippet_y', 'order', 'filename', 'tokens_x', 'tokens_y']


def _split_target_features(samples):
    """Return (labels, features) for one split.

    ``labels`` is a one-column frame holding TARGET; ``features`` is the
    frame without TARGET, without ``drop_columns`` and without 'category_id'.
    """
    labels = samples[TARGET].to_frame()
    features = samples.drop(TARGET, axis=1).drop(columns=drop_columns + ['category_id'])
    return labels, features


y_train, X_train = _split_target_features(train_samples)
y_dev, X_dev = _split_target_features(dev_samples)
y_test, X_test = _split_target_features(test_samples)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training split only and
# then applied to all three splits. The scaled frames keep the row index of
# their source but get default integer column labels.
scaler = StandardScaler().fit(X_train)

scaled_parts = []
for part in (X_train, X_dev, X_test):
    X_scaled_np = scaler.transform(part)
    scaled_parts.append(pd.DataFrame(X_scaled_np, index=part.index))

X_train, X_dev, X_test = scaled_parts

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Map the string relation labels of the training split to integer class ids.
# The fitted encoder is reused later to decode the model's predictions.
lab_encoder = LabelEncoder()
# y_train is a one-column DataFrame; flatten it to the 1-D array LabelEncoder
# expects instead of relying on its implicit column-vector conversion.
y_train = lab_encoder.fit_transform(y_train.values.ravel())

In [None]:
from catboost import Pool

# Pool built from the dev split.
# NOTE(review): it is not passed to any estimator in the visible cells, and its
# labels are the raw (un-encoded) y_dev -- confirm whether it is still needed.
eval_dataset = Pool(data=X_dev,
                    label=y_dev)

# Gradient-boosting member of the ensemble.
# NOTE(review): counts holds class frequencies in descending order, so
# counts / counts[-1] gives the largest multipliers to the most frequent
# classes, and position i need not correspond to encoded class id i --
# confirm that this weighting is intended.
catboost = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    custom_loss=['F1'],
    random_seed=random_state,
    verbose=0,
    loss_function='MultiClass',
    #task_type='GPU',
    class_weights=counts / counts[-1]
)

# CatBoost trained only on the features selected by an L1-regularised logistic
# regression. The selector is seeded as well, because the 'saga' solver is
# stochastic and would otherwise select features non-reproducibly.
fs_catboost = Pipeline([
  ('feature_selection', SelectFromModel(LogisticRegression(solver='saga', penalty='l1', C=1., n_jobs=-1,
                                                           random_state=random_state))),
  ('classification', catboost)
])

# Linear member of the ensemble: class-balanced multinomial logistic regression
# (C=0.002). This is the only definition of logreg; an earlier duplicate that was
# overwritten before use has been removed.
logreg = LogisticRegression(random_state=random_state,
                            solver='lbfgs',
                            n_jobs=-1,
                            C=0.002,
                            multi_class='multinomial',
                            class_weight='balanced')

# Soft voting: the predicted class probabilities of both members are averaged.
fs_catboost_plus_logreg = VotingClassifier([('fs_catboost', fs_catboost), ('logreg', logreg)], voting='soft')

In [None]:
# Train the voting ensemble on the scaled training features and the
# integer-encoded training labels.
fs_catboost_plus_logreg.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Evaluation on the dev split: decode the predicted class ids back to relation
# labels and compare them with the gold labels.
predicted = lab_encoder.inverse_transform(fs_catboost_plus_logreg.predict(X_dev))

dev_gold = y_dev.values
for caption, averaging in (('weighted f1: ', 'weighted'), ('macro f1: ', 'macro')):
    print(caption, metrics.f1_score(dev_gold, predicted, average=averaging))
print('accuracy: ', metrics.accuracy_score(dev_gold, predicted))
print()
print(metrics.classification_report(y_dev, predicted, digits=4))

# Macro-averaged precision / recall, reported as percentages.
dev_macro_precision = metrics.precision_score(y_dev, predicted, average='macro')
print('macro precision: %.2f' % (dev_macro_precision * 100.))
dev_macro_recall = metrics.recall_score(y_dev, predicted, average='macro')
print('macro recall: %.2f' % (dev_macro_recall * 100.))

In [None]:
from sklearn import metrics

# Evaluation on the test split: same report as for the dev split.
test_class_ids = fs_catboost_plus_logreg.predict(X_test)
predicted = lab_encoder.inverse_transform(test_class_ids)

test_gold = y_test.values
weighted_f1 = metrics.f1_score(test_gold, predicted, average='weighted')
print('weighted f1: ', weighted_f1)
macro_f1 = metrics.f1_score(test_gold, predicted, average='macro')
print('macro f1: ', macro_f1)
accuracy = metrics.accuracy_score(test_gold, predicted)
print('accuracy: ', accuracy)
print()

report = metrics.classification_report(y_test, predicted, digits=4)
print(report)

# Macro-averaged precision / recall, reported as percentages.
test_macro_precision = metrics.precision_score(y_test, predicted, average='macro')
print('macro precision: %.2f' % (test_macro_precision * 100.))
test_macro_recall = metrics.recall_score(y_test, predicted, average='macro')
print('macro recall: %.2f' % (test_macro_recall * 100.))

In [None]:
# Persist the fitted ensemble. The context manager guarantees that the file is
# flushed and closed even if pickling fails.
with open('models/dialog_model.pkl', 'wb') as model_file:
    pickle.dump(fs_catboost_plus_logreg, model_file)