## Binary structure classification used in tree building

1. Create train and test sets; save the negative samples generated for each file ``filename.rs3`` as ``filename.json.neg``.
2. Train models, save the best one.

Output:
 - ``data/*.neg``
 - ``models/structure_predictor/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import pickle

In [None]:
# Text normalisation table: every key is replaced by its value, in insertion order.
# The keys are applied two ways in this notebook: literally via ``str.replace``
# (``read_edus``, ``read_annotation``) and as regular expressions via
# ``Series.replace(..., regex=True)`` (``read_gold``). Presumably that is why the
# caret appears both escaped (``\^``) and bare (``^``) — TODO confirm.
text_html_map = {
    r'\n': r' ',
    r'&gt;': r'>',
    r'&lt;': r'<',
    r'&amp;': r'&',
    r'&quot;': r'"',
    r'&ndash;': r'–',
    r'##### ': r'',
    r'\\\\\\\\': r'\\',
    r'  ': r' ',
    # The doubled dash must be handled before the single one below.
    r'——': r'-',
    r'—': r'-',
    r'/': r'',
    r'\^': r'',
    r'^': r'',
    r'±': r'+',
    # NOTE(review): the next two look like Latin letters mapped to their
    # Cyrillic look-alikes — confirm the intended code points.
    r'y': r'у',
    r'x': r'х'
}

def read_edus(filename):
    """Load the EDUs stored one per line in ``<filename>.edus``.

    Every line is stripped and then run through each literal substitution
    of ``text_html_map``, in the map's insertion order.

    Args:
        filename: Document path without the ``.edus`` extension.

    Returns:
        list: The normalised EDU strings, in file order.
    """
    def normalise(raw_line):
        text = str(raw_line.strip())
        for pattern in text_html_map:
            text = text.replace(pattern, text_html_map[pattern])
        return text

    with open(filename + '.edus', 'r') as edus_file:
        return [normalise(raw_line) for raw_line in edus_file]

def read_gold(filename):
    """Load the gold pairs from ``<filename>.json`` and normalise their snippets.

    Every key of ``text_html_map`` is applied as a regular expression to the
    ``snippet_x`` and ``snippet_y`` columns, in the map's insertion order.

    Args:
        filename: Document path without the ``.json`` extension.

    Returns:
        pandas.DataFrame: The loaded frame with both snippet columns normalised.
    """
    df = pd.read_json(filename + '.json')
    for column in ('snippet_x', 'snippet_y'):
        for pattern, replacement in text_html_map.items():
            # Assign the result back instead of calling ``replace(inplace=True)``
            # on ``df[column]``: that is chained assignment, which no longer
            # updates ``df`` once pandas copy-on-write is active.
            df[column] = df[column].replace(pattern, replacement, regex=True)

    return df

def read_negative(filename):
    """Load the negative samples stored as JSON in ``<filename>.json.neg``.

    Args:
        filename: Document path without the ``.json.neg`` extension.

    Returns:
        pandas.DataFrame: The frame parsed from the file.
    """
    negatives_path = filename + '.json.neg'
    return pd.read_json(negatives_path)

def read_annotation(filename):
    """Load the pickled annotation ``<filename>.annot.pkl`` and normalise its text.

    Each literal substitution of ``text_html_map`` is applied, in insertion
    order, to ``annot['text']`` and to the ``text`` attribute of every item
    in ``annot['tokens']``.

    Args:
        filename: Document path without the ``.annot.pkl`` extension.

    Returns:
        The unpickled annotation object, modified in place.
    """
    annot = pd.read_pickle(filename + '.annot.pkl')
    for pattern, replacement in text_html_map.items():
        annot['text'] = annot['text'].replace(pattern, replacement)
        for tok in annot['tokens']:
            tok.text = tok.text.replace(pattern, replacement)

    return annot

In [None]:
def to_merge_gr(scores):
    """Return the position of the left node of the first mergeable pair.

    Args:
        scores: One flag per adjacent node pair.

    Returns:
        int: Index of the first entry equal to ``True``.

    Raises:
        ValueError: If no entry equals ``True``.
    """
    return scores.index(True)


def create_training_set_greedy(edus, gold):
    """Simulate greedy bottom-up merging and collect every pair that was scored.

    Starting from the EDUs, the leftmost adjacent pair found in ``gold`` is
    merged into one node (texts joined with a single space). The new node is
    then scored against its neighbours, and this repeats until no adjacent
    pair is in ``gold``. Every scored pair is recorded with its label.

    Args:
        edus: List of EDU strings in document order.
        gold: DataFrame with ``snippet_x`` and ``snippet_y`` columns.

    Returns:
        list: Unique ``(left_text, right_text, is_in_gold)`` tuples, in no
        particular order.
    """
    def in_gold(pair):
        left, right = pair[0].strip(), pair[1].strip()
        matches = gold[(gold['snippet_x'] == left) & (gold['snippet_y'] == right)]
        return len(matches) > 0

    nodes = list(edus)
    pairs = [(nodes[i], nodes[i + 1]) for i in range(len(nodes) - 1)]
    scores = [in_gold(pair) for pair in pairs]
    training_set = [(left, right, score) for (left, right), score in zip(pairs, scores)]

    # Each iteration removes one node, so the loop always terminates.
    while True in scores:
        j = to_merge_gr(scores)  # position of the left node
        nodes = nodes[:j] + [nodes[j] + ' ' + nodes[j + 1]] + nodes[j + 2:]

        # Re-score the merged node against whichever neighbours it has.
        # Checking both sides independently also covers merging the last
        # two nodes, where there is no neighbour at all.
        new_scores = []
        if j > 0:
            left_score = in_gold((nodes[j - 1], nodes[j]))
            training_set.append((nodes[j - 1], nodes[j], left_score))
            new_scores.append(left_score)
        if j + 1 < len(nodes):
            right_score = in_gold((nodes[j], nodes[j + 1]))
            training_set.append((nodes[j], nodes[j + 1], right_score))
            new_scores.append(right_score)

        # Drop the scores that touched the two merged nodes and splice in the new ones.
        scores = scores[:max(j - 1, 0)] + new_scores + scores[j + 2:]

    return list(set(training_set))

In [None]:
def extract_snippet_ids(snippet, edus):
    """Return the positions of all EDUs that occur as substrings of ``snippet``.

    Args:
        snippet: Text span to inspect.
        edus: Sequence of EDU strings.

    Returns:
        list: Indices into ``edus``, in ascending order.
    """
    matched_positions = []
    for position, edu_text in enumerate(edus):
        if edu_text in snippet:
            matched_positions.append(position)
    return matched_positions


def check_snippet_pair_in_dataset(dataset, snippet_left, snippet_right):
    """Tell whether the two snippets form a row of ``dataset`` in either order.

    Args:
        dataset: DataFrame with ``snippet_x`` and ``snippet_y`` columns.
        snippet_left: First snippet text.
        snippet_right: Second snippet text.

    Returns:
        A truthy value when a row holds the pair as (left, right) or as
        (right, left), a falsy value otherwise.
    """
    same_order = (dataset.snippet_x == snippet_left) & (dataset.snippet_y == snippet_right)
    found_same_order = same_order.sum(axis=0) != 0
    if found_same_order:
        return found_same_order

    # Only look for the swapped pair when the direct one is absent.
    swapped_order = (dataset.snippet_y == snippet_left) & (dataset.snippet_x == snippet_right)
    return swapped_order.sum(axis=0) != 0


def extract_negative_samples_for_snippet(gold, edus, snippet):
    """Pair ``snippet`` with the EDUs next to it that it is not linked to in ``gold``.

    The snippet's extent is taken from the first and last EDU index that
    ``extract_snippet_ids`` reports for it.

    Args:
        gold: DataFrame with ``snippet_x`` and ``snippet_y`` columns.
        edus: List of EDU strings in document order.
        snippet: Text span to build negatives for.

    Returns:
        list: Up to two ``(left_text, right_text, False)`` tuples; empty when
        no EDU is found inside the snippet.
    """
    covered_ids = extract_snippet_ids(snippet, edus)
    if not covered_ids:
        return []

    first_id, last_id = covered_ids[0], covered_ids[-1]
    negatives = []

    # Neighbour on the left, when the snippet does not start the document.
    if first_id > 0:
        previous_edu = edus[first_id - 1]
        if not check_snippet_pair_in_dataset(gold, snippet, previous_edu):
            negatives.append((previous_edu, snippet, False))

    # Neighbour on the right, when the snippet does not end the document.
    if last_id < len(edus) - 1:
        next_edu = edus[last_id + 1]
        if not check_snippet_pair_in_dataset(gold, snippet, next_edu):
            negatives.append((snippet, next_edu, False))

    return negatives


def create_training_set(edus, gold):
    """Build the negative (label ``False``) text pairs for one document.

    Negatives come from three sources, each filtered against ``gold``:
    consecutive EDUs, every gold snippet paired with its neighbouring EDUs,
    and pairs of gold snippets whose EDU ranges are adjacent. Positive pairs
    are not added here.

    Args:
        edus: List of EDU strings in document order.
        gold: DataFrame with ``snippet_x`` and ``snippet_y`` columns.

    Returns:
        list: Unique ``(left_text, right_text, False)`` tuples, in no
        particular order.
    """
    negatives = []

    # Gold snippets with their EDU indices, kept only for rows where both sides were located.
    located = []
    for row_id in gold.index:
        text_x = gold.loc[row_id, 'snippet_x']
        ids_x = extract_snippet_ids(text_x, edus)

        text_y = gold.loc[row_id, 'snippet_y']
        ids_y = extract_snippet_ids(text_y, edus)

        if ids_x and ids_y:
            located.extend([(ids_x, text_x), (ids_y, text_y)])

    # 1. Consecutive EDUs that gold does not link.
    for left_edu, right_edu in zip(edus, edus[1:]):
        if not check_snippet_pair_in_dataset(gold, left_edu, right_edu):
            negatives.append((left_edu, right_edu, False))

    # 2. Each gold snippet against the EDUs right before and after it.
    for row_id in gold.index:
        for column in ('snippet_x', 'snippet_y'):
            negatives.extend(
                extract_negative_samples_for_snippet(gold, edus, gold.loc[row_id, column]))

    # 3. Located snippets whose EDU ranges touch, tried in both orders.
    for start, (ids_a, text_a) in enumerate(located):
        for ids_b, text_b in located[start:]:
            if ids_a[-1] + 1 == ids_b[0] and not check_snippet_pair_in_dataset(gold, text_a, text_b):
                negatives.append((text_a, text_b, False))

            if ids_b[-1] + 1 == ids_a[0] and not check_snippet_pair_in_dataset(gold, text_b, text_a):
                negatives.append((text_b, text_a, False))

    return list(set(negatives))

### Make negative samples, save them

In [None]:
import re


def document_number(path):
    """Return the whole document number that starts at position 5 of the base name.

    The previous sort key read only the single character at that position,
    so e.g. ``news_16`` was ordered as 1.

    Raises:
        ValueError: If no digit is found at position 5 of the base name.
    """
    name = os.path.basename(path)
    match = re.match(r'\d+', name[5:])
    if match is None:
        raise ValueError('no document number at position 5 of %r' % name)
    return int(match.group())


# Order by the full number and break ties by path, so the split does not
# depend on the order in which glob lists the files.
files = sorted(glob.glob('./data/*.json'), key=lambda s: (document_number(s), s))

# Every fifth file goes to the test set; the rest is the train set.
test = files[::5]
held_out = set(test)
train = [file for file in files if file not in held_out]

In [None]:
from tqdm import tqdm_notebook as tqdm

# For every gold file: generate the negative pairs, locate both snippets in the
# annotated text and store the table next to the source as ``<name>.json.neg``.
for json_path in tqdm(glob.glob('./data/*.json')):
    filename = json_path.replace('.json', '')
    df = read_gold(filename)
    edus = read_edus(filename)

    new_set = create_training_set(edus, df)
    result = [(filename, left, right, label) for left, right, label in new_set]
    tmp = pd.DataFrame(result, columns=['filename', 'snippet_x', 'snippet_y', 'relation'])

    annot = read_annotation(filename)

    def place_locations(row):
        # ``loc_x`` is the offset of the left snippet in the full text;
        # ``loc_y`` is searched in the text starting from ``loc_x``.
        start_x = annot['text'].find(row.snippet_x)
        row['loc_x'] = start_x
        row['loc_y'] = annot['text'][start_x:].find(row.snippet_y)
        return row

    tmp = tmp.apply(place_locations, axis=1)

    tmp.to_json(filename + '.json.neg')

### Make a directory

In [None]:
import os

model_path = 'models/structure_predictor'
# Create the output directory together with any missing parents; unlike the
# shell ``mkdir`` this does not fail when the directory already exists.
os.makedirs(model_path, exist_ok=True)

# Column names stored next to the models (presumably the columns to drop
# before prediction — confirm against the code that loads this file).
drop_columns = ['snippet_x', 'snippet_y', 'snippet_x_tmp', 'snippet_y_tmp', 'postags_x', 'postags_y']
with open(os.path.join(model_path, 'drop_columns.pkl'), 'wb') as f:
    pickle.dump(drop_columns, f)

### Extract features

In [None]:
%%time
from utils.features_processor_default import FeaturesProcessor

# Build the project's feature extractor once; the cells below call it on each table of snippet pairs.
features_processor = FeaturesProcessor(model_dir_path='models', verbose=False)

#### Try on the sample

In [None]:
import json

# Smoke test on one document: load its EDUs, gold pairs, annotation and the
# generated negatives, then time feature extraction for the negatives only.
filename = 'data/news_16'
edus = read_edus(filename)
gold = read_gold(filename)
annot = read_annotation(filename)
negatives = read_negative(filename)

%time result = features_processor(negatives, \
                            annot['text'],\
                            annot['tokens'],\
                            annot['sentences'],\
                            annot['lemma'],\
                            annot['morph'],\
                            annot['postag'],\
                            annot['syntax_dep_tree'])

In [None]:
result.shape

#### Extract features from negative examples

In [None]:
# Annotation layers handed to the feature extractor, in positional order.
annotation_keys = ('text', 'tokens', 'sentences', 'lemma', 'morph', 'postag', 'syntax_dep_tree')

# NOTE(review): negatives are written under ./data/ earlier in this notebook but
# read from rst_pairs/ here — confirm both names point at the same directory.
for neg_path in glob.glob("rst_pairs/*.json.neg"):
    filename = neg_path.replace('.json.neg', '')

    df = read_negative(filename)
    # Keep only the rows where both snippets are non-empty.
    for column in ('snippet_x', 'snippet_y'):
        df = df[df[column].str.len() > 0]
    annotation = read_annotation(filename)

    try:
        result = features_processor(df, *[annotation[key] for key in annotation_keys])
        result.to_pickle(filename + '.neg.features')
    except IndexError:
        # The document is reported and skipped; no feature file is written for it.
        print('INDEX ERROR ::: FILENAME :::', filename)

#### As well as from gold examples 

In [None]:
# Same extraction for the gold pairs: one ``<name>.gold.features`` pickle per
# ``rst_pairs/*.json`` file, with each call timed by the ``%time`` magic.
for filename in glob.glob("rst_pairs/*.json"):
    filename = filename.replace('.json', '')
    
    df = read_gold(filename)
    # Keep only the rows where both snippets are non-empty.
    df = df[df.snippet_x.str.len() > 0]
    df = df[df.snippet_y.str.len() > 0]
    annotation = read_annotation(filename)
        
    try:
        %time result = features_processor(df, \
                                   annotation['text'],\
                                   annotation['tokens'],\
                                   annotation['sentences'],\
                                   annotation['lemma'],\
                                   annotation['morph'],\
                                   annotation['postag'],\
                                   annotation['syntax_dep_tree'])

        result.to_pickle(filename + '.gold.features')
    except IndexError:
        # The document is reported and skipped; no feature file is written for it.
        print('INDEX ERROR ::: FILENAME :::', filename)
        continue

In [None]:
! ls rst_pairs/*.gold.features | wc -l

In [None]:
! ls rst_pairs/*.neg.features | wc -l

## Obtain data for training 

In [None]:
import glob
import os

import re


def document_number(path):
    """Return the whole document number that starts at position 5 of the base name.

    The previous sort key read only the single character at that position,
    so e.g. ``news_16`` was ordered as 1.

    Raises:
        ValueError: If no digit is found at position 5 of the base name.
    """
    name = os.path.basename(path)
    match = re.match(r'\d+', name[5:])
    if match is None:
        raise ValueError('no document number at position 5 of %r' % name)
    return int(match.group())


# Order by the full number and break ties by path, so the split does not
# depend on the order in which glob lists the files.
files = sorted(glob.glob('./rst_pairs/*.edus'), key=lambda s: (document_number(s), s))

# Every fifth file goes to the test set; the rest is the train set.
test = files[::5]
held_out = set(test)
train = [file for file in files if file not in held_out]

In [None]:
# Share of train files whose path mentions each genre.
train_total = len(train)
news_in_train = sum('news' in file for file in train)
print('news in train:', news_in_train / train_total)
ling_in_train = sum('ling' in file for file in train)
print('ling in train:', ling_in_train / train_total)
comp_in_train = sum('comp' in file for file in train)
print('comp in train:', comp_in_train / train_total)

In [None]:
# Share of test files whose path mentions each genre.
test_total = len(test)
news_in_test = sum('news' in file for file in test)
print('news in test:', news_in_test / test_total)
ling_in_test = sum('ling' in file for file in test)
print('ling in test:', ling_in_test / test_total)
comp_in_test = sum('comp' in file for file in test)
print('comp in test:', comp_in_test / test_total)

In [None]:
random_state = 41

#### Make train set 

In [None]:
import pandas as pd

# Gather the gold and negative feature tables of every train document.
# A missing negatives file is reported and skipped; a missing gold file raises.
train_frames = []

for edus_path in train:
    train_frames.append(pd.read_pickle(edus_path.replace('.edus', '.gold.features')))
    try:
        negative_features = pd.read_pickle(edus_path.replace('.edus', '.neg.features'))
    except FileNotFoundError as error:
        print(error)
    else:
        train_frames.append(negative_features)

# Concatenate, shuffle reproducibly and renumber the rows.
train_samples = pd.concat(train_frames)
train_samples = train_samples.sample(frac=1, random_state=random_state)
train_samples = train_samples.reset_index(drop=True)

# Rows without a ``relation`` value are labelled True.
train_samples.relation = train_samples.relation.fillna(True)
# Genre is whatever precedes the first underscore of the ``filename`` value.
train_samples['genre'] = train_samples.filename.map(lambda name: name.split('_')[0])

In [None]:
train_samples.keys()

In [None]:
train_samples.keys()

In [None]:
train_samples.relation.value_counts()

In [None]:
train_samples.shape

In [None]:
TARGET = 'relation'
# Columns removed from the feature matrix in addition to the target.
train_non_feature_columns = ['category_id', 'snippet_x', 'snippet_y', 'snippet_x_tmp', 'snippet_y_tmp',
                             'postags_x', 'postags_y', 'filename', 'order']
# The target is kept as a one-column DataFrame.
y_train = train_samples[TARGET].to_frame()
X_train = train_samples.drop(TARGET, axis=1).drop(columns=train_non_feature_columns)

In [None]:
# NOTE(review): `categorical_cols` is first assigned in a later cell, so this line
# raises NameError on a fresh top-to-bottom run. Run after that cell, it removes
# the very columns that cell encodes — confirm whether this cell should be kept.
X_train = X_train.drop(columns=categorical_cols)

In [None]:
X_train.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Every object-dtype column is treated as categorical.
categorical_feature_mask = X_train.dtypes==object
categorical_cols = X_train.columns[categorical_feature_mask].tolist()

# NOTE(review): one LabelEncoder is re-fitted for each column in turn, so after
# this line `le` only holds the classes of the last column it processed.
le = LabelEncoder()
X_train[categorical_cols] = X_train[categorical_cols].apply(lambda col: le.fit_transform(col))

# NOTE(review): `sparse=` and `get_feature_names` are spelled differently in newer
# scikit-learn releases (`sparse_output=`, `get_feature_names_out`) — confirm the pinned version.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_ohe = ohe.fit_transform(X_train[categorical_cols].values)
X_ohe = pd.DataFrame(X_ohe, X_train.index, columns=ohe.get_feature_names(categorical_cols))

# Append the one-hot columns with a `cat_` prefix and drop the label-encoded originals.
X_train = X_train.join(
   pd.DataFrame(X_ohe, X_train.index).add_prefix('cat_'), how='right'
).drop(columns=categorical_cols)

In [None]:
X_train.shape

In [None]:
# Persist the categorical column list and both encoders under `model_path`.
# Each file is written inside a `with` block so its handle is closed (and the
# data flushed) right away instead of whenever the object is garbage-collected.
for artifact_name, artifact in (('categorical_cols.pkl', categorical_cols),
                                ('label_encoder.pkl', le),
                                ('one_hot_encoder.pkl', ohe)):
    with open(os.path.join(model_path, artifact_name), 'wb') as f:
        pickle.dump(artifact, f)

#### Make test set

In [None]:
import pandas as pd

In [None]:
random_state = 41

In [None]:
# Gather the gold and negative feature tables of every test document.
# A missing negatives file is reported and skipped; a missing gold file raises.
test_frames = []

for edus_path in test:
    test_frames.append(pd.read_pickle(edus_path.replace('.edus', '.gold.features')))
    try:
        negative_features = pd.read_pickle(edus_path.replace('.edus', '.neg.features'))
    except FileNotFoundError as error:
        print(error)
    else:
        test_frames.append(negative_features)

# Concatenate, shuffle reproducibly and renumber the rows.
test_samples = pd.concat(test_frames)
test_samples = test_samples.sample(frac=1, random_state=random_state)
test_samples = test_samples.reset_index(drop=True)

# Rows without a ``relation`` value are labelled True.
test_samples.relation = test_samples.relation.fillna(True)
# Genre is whatever precedes the first underscore of the ``filename`` value.
test_samples['genre'] = test_samples.filename.map(lambda name: name.split('_')[0])

In [None]:
TARGET = 'relation'
# Columns removed from the feature matrix in addition to the target.
test_non_feature_columns = ['category_id', 'snippet_x', 'snippet_y', 'snippet_x_tmp', 'snippet_y_tmp',
                            'postags_x', 'postags_y', 'filename', 'order']
# The target is kept as a one-column DataFrame.
y_test = test_samples[TARGET].to_frame()
X_test = test_samples.drop(TARGET, axis=1).drop(columns=test_non_feature_columns)

In [None]:
# NOTE(review): the next cell label-encodes `X_test[categorical_cols]`, which
# needs the columns removed here — confirm whether this cell should be run at all.
X_test = X_test.drop(columns=categorical_cols)

In [None]:
# Reload the artifacts from `model_path`, the directory the training cells saved
# them to, and close each file as soon as it has been read.
with open(os.path.join(model_path, 'categorical_cols.pkl'), 'rb') as f:
    categorical_cols = pickle.load(f)
with open(os.path.join(model_path, 'label_encoder.pkl'), 'rb') as f:
    le = pickle.load(f)
with open(os.path.join(model_path, 'one_hot_encoder.pkl'), 'rb') as f:
    ohe = pickle.load(f)

# NOTE(review): the label encoder is re-fitted on each test column here, so the
# integer codes need not match the ones produced on the train set — confirm
# this is acceptable before trusting the one-hot columns below.
X_test[categorical_cols] = X_test[categorical_cols].apply(lambda col: le.fit_transform(col))

X_ohe = ohe.transform(X_test[categorical_cols].values)
X_ohe = pd.DataFrame(X_ohe, X_test.index, columns=ohe.get_feature_names(categorical_cols))

# Append the one-hot columns with a `cat_` prefix and drop the label-encoded originals.
X_test = X_test.join(
   pd.DataFrame(X_ohe, X_test.index).add_prefix('cat_'), how='right'
).drop(columns=categorical_cols)

In [None]:
X_train.shape, X_test.shape

### Classifiers training 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Despite its name, `std_scaler` is a MinMaxScaler. It is fitted on the train
# features only and then applied to both train and test.
std_scaler = MinMaxScaler().fit(X_train.values)

X_train = pd.DataFrame(std_scaler.transform(X_train.values), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(std_scaler.transform(X_test.values), index=X_test.index, columns=X_test.columns)

# NOTE(review): this writes under 'binary_classifier_models/', not under `model_path`;
# the directory is not created in this notebook — confirm it exists and is intended.
scaler_path = 'binary_classifier_models/scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(std_scaler, f)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Baseline: class-balanced logistic regression with a small C (0.0005).
# NOTE(review): `y_train` is a one-column DataFrame here; scikit-learn may warn
# that a 1-d target was expected — confirm.
model = LogisticRegression(solver='lbfgs', class_weight='balanced', C=0.0005, n_jobs=4)
model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Score the current `model` on the test set: per-class report, then confusion matrix.
predicted = model.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

In [None]:
# Same predictions again, summarised as weighted and macro F1 before the full report.
predicted = model.predict(X_test)  
print('weighted f1: ', metrics.f1_score(y_test, predicted, average='weighted'))
print('macro f1: ', metrics.f1_score(y_test, predicted, average='macro'))
print()
print(metrics.classification_report(y_test, predicted))

In [None]:
# Save the fitted model and the scaler under `model_path`.
# `os.path.join` replaces the non-existent `os.path_join`, which raised
# AttributeError before the scaler could be written.
with open(os.path.join(model_path, 'model.pkl'), 'wb') as f:
    pickle.dump(model, f)
with open(os.path.join(model_path, 'scaler.pkl'), 'wb') as f:
    pickle.dump(std_scaler, f)

In [None]:
# `lgb` is otherwise imported only in a later cell; importing it here keeps
# this cell runnable in a top-to-bottom execution.
import lightgbm.sklearn as lgb

# Gradient-boosted trees (DART) for the binary relation-presence target.
# NOTE(review): several keys look like aliases of one another
# (`num_iterations`/`n_estimators`, `feature_fraction`/`colsample_bytree`) — confirm.
lgbm_param_bin =  {
    'tree_learner': 'feature',
    'task': 'train',
    'random_state': random_state,
    'metric': 'binary_logloss',
    'feature_fraction': 0.8,
    'boosting_type': 'dart',
    'application': 'binary',
    'num_iterations': 300,
    'max_depth' : 5,
    'is_unbalance' : True,
    'n_estimators' : 300,
    'colsample_bytree' : 0.8
}
model = lgb.LGBMClassifier(**lgbm_param_bin)
model.fit(X_train, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
# `lgb` is otherwise imported only in a later cell; importing it here keeps
# this cell runnable in a top-to-bottom execution.
import lightgbm.sklearn as lgb

# Same LightGBM settings as the previous cell, this time behind an L1
# logistic-regression feature selector.
lgbm_param_bin =  {
    'tree_learner': 'feature',
    'task': 'train',
    'random_state': random_state,
    'metric': 'binary_logloss',
    'feature_fraction': 0.8,
    'boosting_type': 'dart',
    'application': 'binary',
    'num_iterations': 300,
    'max_depth' : 5,
    'is_unbalance' : True,
    'n_estimators' : 300,
    'colsample_bytree' : 0.8
}
classifier = lgb.LGBMClassifier(**lgbm_param_bin)
feature_selector = SelectFromModel(LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1'))
model = Pipeline([('feature_selector', feature_selector),
                  ('classifier', classifier)])
model.fit(X_train, y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
import lightgbm.sklearn as lgb

# Larger LightGBM configuration (600 iterations, depth 6) wrapped in a
# feature-selection pipeline and a 3-member bagging ensemble.
random_state = 41
lgbm_param_bin =  {
    'tree_learner': 'feature',
    'task': 'train',
    'random_state': random_state,
    'metric': 'binary_logloss',
    'feature_fraction': 0.8,
    'boosting_type': 'dart',
    'application': 'binary',
    'num_iterations': 600,
    'max_depth' : 6,
    'is_unbalance' : True,
    'n_estimators' : 600,
    'colsample_bytree' : 0.8
}
classifier = lgb.LGBMClassifier(**lgbm_param_bin)
feature_selector = SelectFromModel(LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1'))
model_single = Pipeline([('feature_selector', feature_selector), 
                   ('classifier', classifier)])

# NOTE(review): newer scikit-learn releases name this argument `estimator`
# instead of `base_estimator` — confirm against the pinned version.
model = BaggingClassifier(base_estimator=model_single, 
                          n_estimators=3, 
                          max_samples=1.0, 
                          max_features=0.8, 
                          bootstrap=True, 
                          random_state=random_state)


# The ensemble is only defined here; its fit call is disabled.
#model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Fit and score the bare LightGBM `classifier` (not the pipeline or the bagging `model`).
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))  # here

In [None]:
import numpy as np

# Feature importances of the fitted classifier; `sorted_idx` orders them ascending.
fi = np.array(classifier.feature_importances_)
sorted_idx = np.argsort(fi)
# Number of features with a non-zero importance.
print(np.count_nonzero(fi))

In [None]:
pd.set_option('display.max_rows', 150)
# Most important feature first. Both columns are indexed with the SAME
# ordering; previously the names stayed in ascending order while only the
# importances were reversed, so each name was paired with the wrong value.
descending_idx = sorted_idx[::-1]
dd = pd.DataFrame({'Feature': np.array(X_test.keys())[descending_idx], 'Importance': fi[descending_idx]})
# Keep only the features with a positive importance.
dd = dd[dd['Importance'] > 0]

In [None]:
dd[dd.Feature.str[-2:] != '_y']

In [None]:
from sklearn import metrics

# Repeats the earlier fit/evaluate cell: `classifier` is re-fitted from scratch and scored again.
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

In [None]:
# Pickle the fitted LightGBM classifier.
# NOTE(review): the path is relative to the working directory, not under `model_path` — confirm.
file_path = 'predictor_relation_presence_classifier.pkl'
with open(file_path, 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
y_train.relation.value_counts()

In [None]:
from catboost import CatBoostClassifier, Pool

# Shallow CatBoost model (depth 3, 200 iterations) with explicit class weights.
model = CatBoostClassifier(one_hot_max_size=5,
                           learning_rate=0.5,
                           iterations=200,
                           class_weights=[0.35, 1.],
                           depth=3,
                           #task_type="GPU"
                          )

# Targets are cast to float. NOTE(review): the test set doubles as the
# evaluation set here, so it is not an untouched hold-out for this model — confirm.
model.fit(X_train,
          y_train.astype(float),
          eval_set=Pool(X_test, y_test.astype(float)),
          verbose=False,
          plot=True)

In [None]:
from sklearn import metrics

# Score the CatBoost `model` on the test set: per-class report, then confusion matrix.
predicted = model.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))