# Extract features

In [None]:
from utils.file_reading import read_edus, read_annotation

In [None]:
from glob import glob
import pandas as pd


def extract_triplets(edus, annot):
    """Build one feature row per token for EDU-start classification.

    Every token that belongs to a sentence yields a 12-tuple::

        (left_lemma, left_postag, left_link,
         is_title, lemma, postag, link,
         right_lemma, right_postag, right_link,
         start_of_sentence, marker)

    The neighbours cross sentence boundaries: the left neighbour of a
    sentence-initial token is the last token of the previous sentence, and
    the right neighbour of a sentence-final token is the first token of the
    next sentence.  A missing neighbour is encoded as ``('', '', '')``.

    ``marker`` is the class label.  It is 1 when the next not-yet-matched
    EDU in ``edus`` starts with the token's surface text; the EDU is then
    consumed.  Matching is greedy and looks at the current token only.

    Args:
        edus: sequence of EDU strings in document order (may be empty, in
            which case every ``marker`` is 0).
        annot: mapping with the keys ``'text'``, ``'tokens'`` (items with
            ``begin``/``end``/``text``), ``'sentences'`` (items with
            ``begin``/``end`` token indices), ``'lemma'``, ``'postag'`` and
            ``'syntax_dep_tree'`` (per-sentence lists; tree items expose
            ``link_name``).

    Returns:
        List of 12-tuples, one per token, in document order.
    """
    text = annot['text']
    tokens = annot['tokens']
    sentences = annot['sentences']
    lemmas = annot['lemma']
    postags = annot['postag']
    trees = annot['syntax_dep_tree']
    no_neighbour = ('', '', '')

    def describe(sent_idx, pos):
        # (lemma, POS tag, dependency link) of one token inside a sentence
        return (lemmas[sent_idx][pos],
                postags[sent_idx][pos],
                trees[sent_idx][pos].link_name)

    triplets = []
    cursor = 0  # index of the next EDU whose start has not been found yet

    for sent_idx, sentence in enumerate(sentences):
        first, stop = sentence.begin, sentence.end
        for token in range(first, stop):
            offset = token - first  # position of the token inside its sentence
            start_of_sentence = int(token == first)

            if not start_of_sentence:
                left_neighbour = describe(sent_idx, offset - 1)
            elif token > 0 and sent_idx > 0:
                left_neighbour = describe(sent_idx - 1, -1)
            else:
                # Without the sent_idx check, index -1 would silently wrap
                # around to the last sentence of the document.
                left_neighbour = no_neighbour

            token_itself = (int(tokens[token].text.istitle()),) + describe(sent_idx, offset)

            if token != stop - 1:
                right_neighbour = describe(sent_idx, offset + 1)
            elif token + 1 < len(tokens) and sent_idx + 1 < len(sentences):
                right_neighbour = describe(sent_idx + 1, 0)
            else:
                right_neighbour = no_neighbour

            # NOTE(review): only the current token's surface form is compared
            # with the EDU; a short token can therefore match too early.
            original_text = text[tokens[token].begin:tokens[token].end]
            marker = 0  # class label, 1 is for 'start of edu'
            if cursor < len(edus) and edus[cursor].startswith(original_text):
                marker = 1
                cursor += 1

            triplets.append(left_neighbour + token_itself + right_neighbour
                            + (start_of_sentence, marker))

    return triplets

### Split the dataset the same way as the other notebooks in this directory

In [None]:
import glob
import os
from utils.train_test_split import split_data

# Split the corpus under 'data/' into train and test parts with a fixed seed.
# NOTE(review): 0.2 is presumably the test fraction and the results are
# presumably lists of '*.edus' paths (the loops below strip that suffix) —
# confirm in utils.train_test_split.
train, test = split_data('data/', 0.2, seed=45)

In [None]:
triplets_bank, filenames = [], []

for file in train:
    # Cut the path at its last '.edus' to get the name shared by both readers.
    filename = file[:file.rfind('.edus')]
    edus, annot = read_edus(filename), read_annotation(filename)
    triplets = extract_triplets(edus, annot)
    triplets_bank.extend(triplets)
    # Remember, row by row, which file each token came from.
    filenames.extend([file] * len(triplets))

train = pd.DataFrame(
    triplets_bank,
    columns=['left_token', 'left_pos', 'left_link',
             'is_title', 'token', 'pos', 'link',
             'right_token', 'right_pos', 'right_link',
             'start_sentence', 'class'],
)
# Keep the token itself as a feature unless it is a noun, a verb or untagged;
# multiplying a string by a boolean yields either the string or ''.
train['non_noun_tok'] = ~train['pos'].isin(['NOUN', 'VERB', '']) * train['token']
train['filename'] = filenames

In [None]:
triplets_bank, filenames = [], []

for file in test:
    # Cut the path at its last '.edus' to get the name shared by both readers.
    filename = file[:file.rfind('.edus')]
    edus, annot = read_edus(filename), read_annotation(filename)
    triplets = extract_triplets(edus, annot)
    triplets_bank.extend(triplets)
    # Remember, row by row, which file each token came from.
    filenames.extend([file] * len(triplets))

test = pd.DataFrame(
    triplets_bank,
    columns=['left_token', 'left_pos', 'left_link',
             'is_title', 'token', 'pos', 'link',
             'right_token', 'right_pos', 'right_link',
             'start_sentence', 'class'],
)
# Keep the token itself as a feature unless it is a noun, a verb or untagged;
# multiplying a string by a boolean yields either the string or ''.
test['non_noun_tok'] = ~test['pos'].isin(['NOUN', 'VERB', '']) * test['token']
test['filename'] = filenames

In [None]:
train.shape, test.shape

In [11]:
train[train['class'] == 1].head()

Unnamed: 0,left_token,left_pos,left_link,is_title,token,pos,link,right_token,right_pos,right_link,start_sentence,class,non_noun_tok,filename
0,,,,0,брюссель,NOUN,nsubj,–,,punct,1,1,,data/news1_23.edus
2,–,,punct,1,в,ADP,case,этот,PRON,det,0,1,в,data/news1_23.edus
10,",",,punct,0,однако,CONJ,cc,всемирный,ADJ,amod,0,1,однако,data/news1_23.edus
28,.,,punct,1,этот,PRON,det,встреча,NOUN,nsubj,1,1,этот,data/news1_23.edus
31,",",,punct,0,который,PRON,nsubj,быть,VERB,aux,0,1,который,data/news1_23.edus


In [12]:
train['class'].value_counts()

0    366511
1     26775
Name: class, dtype: int64

### Embed 

In [None]:
from gensim.models import Word2Vec, KeyedVectors

# Load the saved gensim Word2Vec model used to embed the left/centre/right
# tokens of every row.
embed_model_path='models/w2v/segmentator/model2_tokenized'
word2vec_model = Word2Vec.load(embed_model_path)

In [None]:
import numpy as np

def get_embeddings(embedder, word):
    """Return the embedding of ``word`` (lower-cased), or zeros if unknown.

    Args:
        embedder: either an object that is indexable by word and has a
            ``vector_size`` attribute (e.g. gensim ``KeyedVectors``), or a
            full model that exposes such an object as ``.wv`` (e.g. gensim
            ``Word2Vec``).
        word: the string to look up.

    Returns:
        The stored vector, or ``np.zeros(vector_size)`` when the lower-cased
        word is not in the vocabulary.
    """
    # gensim >= 4 removed direct indexing of Word2Vec models; the vectors are
    # reachable through ``.wv`` in both old and new versions.
    vectors = getattr(embedder, 'wv', embedder)
    try:
        return vectors[word.lower()]
    except KeyError:
        return np.zeros(vectors.vector_size)

In [None]:
# When True, vocabulary keys are looked up as '<token>_<pos>' instead of the
# bare token.
tag_for_embeddings = False

# (embedding column to create, token column, POS column) per window position
embedding_columns = (('e_left', 'left_token', 'left_pos'),
                     ('e_token', 'token', 'pos'),
                     ('e_right', 'right_token', 'right_pos'))

# Fill all three embedding columns of both frames in either mode, so that the
# later np.stack cells always find them.
for frame in (train, test):
    for target, token_col, pos_col in embedding_columns:
        if tag_for_embeddings:
            frame[target] = frame.apply(
                lambda row: get_embeddings(word2vec_model,
                                           '_'.join([row[token_col], row[pos_col]])),
                axis=1)
        else:
            frame[target] = frame[token_col].map(
                lambda token: get_embeddings(word2vec_model, token))

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
import os
import pickle

# Columns that are one-hot encoded by the DictVectorizer below.
categ_feats = ['left_pos', 'left_link',
               'pos', 'link',
               'right_pos', 'right_link',
               'non_noun_tok'
              ]

print('Category features:', categ_feats)

# Fit on the training rows only; the same fitted vectorizer is reused for
# every other frame so that the column layout stays identical.
train_records = train[categ_feats].to_dict(orient='records')
vectorizer = DictVectorizer(sparse=False)
vectorizer.fit(train_records)
one_hot_feats = vectorizer.transform(train_records)
print('shape of one hot transformed features:', one_hot_feats.shape)

main_model_path = 'models/segmentator/'
# Creates missing parent directories and is a no-op when the directory
# already exists (a plain shell `mkdir` fails in both situations).
os.makedirs(main_model_path, exist_ok=True)

with open(main_model_path + 'vectorizer.pckl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
# Store the list of categorical column names next to the fitted vectorizer.
with open(main_model_path + 'category_features.pckl', 'wb') as f:
    pickle.dump(categ_feats, f)

In [None]:
one_hot_feats.shape

In [None]:
# Columns that are not fed to the DictVectorizer: the raw token columns, the
# class label and the three embedding columns.
not_categ = ['left_token', 'token', 'right_token', 'class', 'e_left', 'e_token', 'e_right']

In [None]:
# Encode the test rows with the already fitted vectorizer (transform only,
# no refit).
one_hot_test = vectorizer.transform(test[categ_feats].to_dict(orient='records'))

### Construct some models

In [None]:
import keras
import tensorflow as tf
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers import Activation
from keras.layers import BatchNormalization
from keras.layers import Dropout
from keras.layers import Input, Dense, concatenate, Conv1D, Conv2D, BatchNormalization, Activation, MaxPooling1D, \
    MaxPooling2D, Dropout, GlobalMaxPool2D, Flatten, Bidirectional, Conv1D, GlobalMaxPool1D, GlobalMaxPooling1D, \
    GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.layers import LSTM
from keras.layers import Lambda
from keras.layers import Permute
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import merge
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Model
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.regularizers import l2, l1
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score

In [None]:
# Use only one GPU
# NOTE(review): keras and tensorflow are already imported by the previous
# cell; presumably the variable still takes effect because no device has been
# initialised yet — confirm.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
# Extend the import path with parent directories and sibling project folders.
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../isanlp/src/')
sys.path.append('../../src/isanlp_srl_framebank/')
sys.path.append('../../libs/')
sys.path.append('../../libs/pylingtools/')

# Suppress tensorflow memory appetites
# NOTE(review): nothing in this cell configures TensorFlow's memory usage;
# only the version is printed.

import tensorflow as tf
print(tf.__version__)

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

In [None]:
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    True positives are the rounded, [0, 1]-clipped products of target and
    prediction; they are divided by the number of rounded positive targets
    plus ``K.epsilon()`` to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    True positives are the rounded, [0, 1]-clipped products of target and
    prediction; they are divided by the number of rounded positive
    predictions plus ``K.epsilon()`` to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
def construct_simple_model(input_length, plain_length, output_length):
    """Build and compile the feed-forward EDU-start classifier.

    The network has four inputs, in this order: centre-token embedding,
    left-neighbour embedding, right-neighbour embedding (each of width
    ``input_length``) and the plain feature vector (width ``plain_length``).
    The plain features pass through their own dense layer; everything is
    then concatenated and fed to a second dense layer and a softmax head.

    Args:
        input_length: width of each of the three embedding inputs.
        plain_length: width of the plain (one-hot) feature input.
        output_length: number of softmax output units (classes).

    Returns:
        A compiled Keras ``Model``.
    """
    inner_size = 80
    activation = 'tanh'
    dropout = .4

    # Layers are created in the same order as before so that automatically
    # generated layer names stay stable.
    input_token = Input(shape=(input_length,))
    norm_token = BatchNormalization()(input_token)
    input_left = Input(shape=(input_length,))
    norm_left = BatchNormalization()(input_left)
    input_right = Input(shape=(input_length,))
    norm_right = BatchNormalization()(input_right)
    input_plain = Input(shape=(plain_length,))
    plain = BatchNormalization()(input_plain)

    plain = Dense(int(inner_size * 10))(plain)
    plain = BatchNormalization()(plain)
    plain = Activation(activation)(plain)
    plain = Dropout(dropout)(plain)

    x = concatenate([norm_token, norm_left, norm_right, plain], axis=-1)
    x = Dense(120)(x)
    x = BatchNormalization()(x)
    x = Activation(activation)(x)
    x = Dropout(dropout)(x)

    # Honour output_length instead of a hard-coded 2.
    outputs = Dense(output_length, activation='softmax')(x)

    model = Model(inputs=[input_token, input_left, input_right, input_plain], outputs=outputs)
    # Categorical cross-entropy is the loss that matches a softmax head with
    # one-hot targets; for two classes it gives the same value as the
    # element-wise binary cross-entropy used previously.
    model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Turn each column of per-row vectors into one 2-D array (rows x vector size).
e_left_train, e_token_train, e_right_train = (
    np.stack(train[column].values) for column in ('e_left', 'e_token', 'e_right'))

e_left_test, e_token_test, e_right_test = (
    np.stack(test[column].values) for column in ('e_left', 'e_token', 'e_right'))

In [None]:
from keras.utils import to_categorical

# One-hot encode the 0/1 class labels to match the softmax output of the model.
y_train = to_categorical(train['class'].values)
y_test = to_categorical(test['class'].values)

In [None]:
# Build the network; embedding width and one-hot width are read from the
# training matrices.
model = construct_simple_model(input_length=e_left_train.shape[1],
                               plain_length=one_hot_feats.shape[1],
                               output_length=2)
model.summary()

# Stop after 10 epochs without improvement of the validation loss and roll
# back to the best weights.
# NOTE(review): the validation data below is the test split, so early stopping
# selects the model on the same data that is later used for evaluation.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, 
                               mode='auto', restore_best_weights=True)

# Input order must match the model definition: token, left, right, plain.
# class_weight up-weights class 1 by a factor of 14 relative to class 0.
history = model.fit(x=[e_token_train, e_left_train, e_right_train, one_hot_feats], 
                    y=y_train, epochs=200, batch_size=512, 
                    validation_data=([e_token_test, e_left_test, e_right_test, one_hot_test], y_test),
                    shuffle=True, callbacks = [early_stopping,],
                    class_weight={0:1, 1:14})

In [None]:
# Class probabilities for the test rows; input order is token, left, right, plain.
predicted = model.predict([e_token_test, e_left_test, e_right_test, one_hot_test])  

In [None]:
# precision_score / recall_score are not part of the notebook's import cell.
from sklearn.metrics import precision_score, recall_score

# The predicted class is the index of the larger softmax output.
pr_classes = np.argmax(predicted, axis=1)

print('pr:', precision_score(test['class'].values, pr_classes))
print('re:', recall_score(test['class'].values, pr_classes))
print('f1:', f1_score(test['class'].values, pr_classes))
print()
# Compare against the same gold labels as the three scores above.
print(classification_report(test['class'].values, pr_classes))

### Add SMOTE oversampling

In [None]:
! pip install -U imblearn

In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
# One flat feature row per token, laid out as
# [left embedding | token embedding | right embedding | one-hot features].
X = np.hstack([e_left_train, e_token_train, e_right_train, one_hot_feats])
# Resample on 1-D class indices rather than on the one-hot matrix, so that
# y_res comes back as plain labels that can be one-hot encoded afterwards.
X_res, y_res = sm.fit_resample(X, np.argmax(y_train, axis=1))

In [None]:
# Cut the resampled rows back into the four model inputs.  The row layout is
# [left | token | right | one-hot]; the embedding width is read from the
# training matrix instead of assuming 100.
X_res = np.asarray(X_res)
emb_dim = e_left_train.shape[1]
e_left_over = X_res[:, :emb_dim]
e_token_over = X_res[:, emb_dim:2 * emb_dim]
e_right_over = X_res[:, 2 * emb_dim:3 * emb_dim]
plain_over = X_res[:, 3 * emb_dim:]

# y_res may be 1-D labels or an indicator matrix; reduce it to class indices
# first so that it is never one-hot encoded twice.
y_res_labels = np.asarray(y_res)
if y_res_labels.ndim > 1 and y_res_labels.shape[1] > 1:
    y_res_labels = np.argmax(y_res_labels, axis=1)
else:
    y_res_labels = np.ravel(y_res_labels)
y_over = to_categorical(y_res_labels)

In [None]:
# Fresh model for the oversampled data; no class weights are used here.
model = construct_simple_model(input_length=e_left_train.shape[1],
                               plain_length=one_hot_test.shape[1],
                              output_length=2)
model.summary()

# With patience=20 and only 25 epochs below, early stopping can trigger only
# in the last few epochs; restore_best_weights applies only if it does.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=0, 
                               mode='auto', restore_best_weights=True)

# Input order must match the model definition: token, left, right, plain.
# NOTE(review): validation uses the test split, as in the weighted run.
history = model.fit(x=[e_token_over, e_left_over, e_right_over, plain_over], 
                    y=y_over, epochs=25, batch_size=512,# validation_split=0.1, 
                    validation_data=([e_token_test, e_left_test, e_right_test, one_hot_test], y_test),
                    shuffle=True, callbacks = [early_stopping])

In [None]:
from keras.utils import plot_model
# Write a diagram of the network (with tensor shapes, without layer names)
# next to the saved model files.
plot_model(model, to_file='models/segmentator/model.png', show_shapes=True, show_layer_names=False)

In [518]:
# Save the current `model` object to an HDF5 file.
model.save('models/segmentator/neural_model.h5')

In [3]:
# NOTE(review): the model was built and saved with the standalone `keras`
# package but is loaded here through `tensorflow.python.keras`; confirm that
# the two are compatible for this file. This import also rebinds the
# `load_model` name imported from keras.models earlier.
from tensorflow.python.keras.models import load_model
import os

model = load_model(os.path.join('models', 'segmentator', 'neural_model.h5'))

In [36]:
def triplets_to_text(data):
    """Join the ``token`` column into one string, one EDU per line.

    A ``'\n'`` item is inserted before every token whose ``class`` value is
    truthy; all items (tokens and line breaks) are joined with single spaces.
    """
    pieces = []
    for _, row in data.iterrows():
        pieces.extend(['\n', row.token] if row['class'] else [row.token])
    return ' '.join(pieces)

In [37]:
# Work on a copy: predictions are written into result['class'] afterwards,
# and a plain alias would overwrite the gold labels stored in `test`.
result = test.copy()

In [39]:
# Predict on the test inputs (order: token, left, right, plain), take the
# index of the larger softmax output as the class and store it in `result`.
predicted = model.predict([e_token_test, e_left_test, e_right_test, one_hot_test])  
pr_classes = np.argmax(predicted, axis=1)
result['class'] = pr_classes

In [40]:
plain = triplets_to_text(result)

In [42]:
with open('temp.temp', 'w') as f:
    f.write(plain)

In [47]:
# One list item per predicted unit: split at the inserted line breaks and
# trim the spaces left over from the ' '.join.
temp = [unit.strip() for unit in plain.split('\n')]

In [51]:
print('\n>>> '.join(temp[15:30]))  # predicted

по их мнение ,
>>> чувство ненависть
>>> и являться основной орудие вербовка .
>>> при это отправитель письмо напрямую связывать удар американский беспилотник с серия теракт в париж ,
>>> который происходить 13 ноябрь 2015 год .
>>> " мы не мочь спокойно сидеть
>>> и наблюдать за такой трагедия , как атака в париж ,
>>> знать ,
>>> какой разрушительный последствие за рубеж
>>> и дома иметь программа использование бпло " ,
>>> - говориться в открытый письмо экс - военный .
>>> американский программа опосредовать борьба с терроризм в страна африка и ближний восток с самый начало вызывать критика мировой сообщество в связь с многочисленный нарушение международный норма и неотъемлемый право человек .
>>> по официально неподтвержденный данные , до 90 % человек ,
>>> убивать
>>> в результат атака беспилотник ,


### Parse plain text 

In [None]:
# Raw text of the annotation left over from the last file processed above.
# NOTE(review): `sample` is rebound to a DataFrame in the next cell.
sample = annot['text']

In [None]:
# Gold EDUs are not needed for inference, so an empty list is passed and every
# 'class' value starts as 0 (it is overwritten with predictions later).
# extract_triplets yields 12 fields per row, hence 'start_sentence' is listed.
sample = pd.DataFrame(extract_triplets([], annot),
                      columns=['left_token', 'left_pos', 'left_link',
                               'is_title', 'token', 'pos', 'link',
                               'right_token', 'right_pos', 'right_link',
                               'start_sentence', 'class'])
# Derived exactly as for the train/test frames; the vectorizer's categorical
# feature list includes this column.
sample['non_noun_tok'] = ((sample['pos'] != 'NOUN') & (sample['pos'] != 'VERB') & (sample['pos'] != '')) * sample['token']

In [None]:
sample

In [None]:
categ_feats

In [None]:
# Encode the sample rows with the vectorizer fitted on the training data.
one_hot_sample = vectorizer.transform(sample[categ_feats].to_dict(orient='records'))
print(one_hot_sample.shape)

In [None]:
# Embeddings are looked up by the bare token here; the flag is set but not
# read in this cell.
tag_for_embeddings = False

sample['e_left'] = sample.left_token.map(lambda row: get_embeddings(word2vec_model, row))
sample['e_token'] = sample.token.map(lambda row: get_embeddings(word2vec_model, row))
sample['e_right'] = sample.right_token.map(lambda row: get_embeddings(word2vec_model, row))

In [None]:
# Stack the per-row vectors into 2-D arrays; `embed_lemma` holds the
# embeddings of the centre token column ('e_token').
embed_left = np.stack(sample['e_left'].values)
embed_lemma = np.stack(sample['e_token'].values)
embed_right = np.stack(sample['e_right'].values)

In [None]:
# Put every non-categorical column of `sample` side by side, one column each.
not_categ_columns = np.concatenate(tuple(sample.loc[:, e].values.reshape(-1, 1) for e in not_categ), axis =1)
# Combine with the one-hot matrix of the *sample* rows; the training matrix
# has a different number of rows and cannot be concatenated along axis 1.
plain_features = np.concatenate((one_hot_sample, not_categ_columns), axis = 1)
plain_features.shape

In [None]:
one_hot_sample[:10]

In [None]:
# numpy arrays have no `to_array` method; stack the per-row vectors into one
# 2-D array for display instead.
np.stack(sample['e_left'].values)

In [None]:
one_hot_sample.shape

In [None]:
# The model's inputs are ordered token, left, right, plain (as in training),
# so the centre-token embeddings must come first.
predicted = model.predict([embed_lemma, embed_left, embed_right, one_hot_sample], batch_size=120)

In [None]:
sample

In [None]:
# `predicted` has one column per class; keep the probability of class 1
# ('start of EDU'), which is thresholded in a later cell.
sample['class'] = predicted[:, 1]

In [None]:
sample['class'].describe()

In [None]:
# Binarise: True where the stored score exceeds 0.7.
sample['class'] = sample['class'] > 0.7

In [None]:
sample['class']

In [None]:
def triplets_to_text(data):
    """Join the ``token`` column into one string, one EDU per line.

    A truthy ``class`` marks the first token of an EDU, so the ``'\n'`` item
    is inserted *before* that token, not after it.  All items (tokens and
    line breaks) are joined with single spaces.
    """
    text = []
    for _, row in data.iterrows():
        if row['class']:
            text.append('\n')
        text.append(row.token)
    return ' '.join(text)

In [None]:
plain = triplets_to_text(sample)

In [None]:
print(plain)