# Extract features

In [1]:
text_html_map = {
    r'\n': r' ',
    r'&gt;': r'>',
    r'&lt;': r'<',
    r'&amp;': r'&',
    r'&quot;': r'"',
    r'&ndash;': r'–',
    r'##### ': r'',
    r'\\\\\\\\': r'\\',
    r'  ': r' ',
    r'——': r'-',
    r'—': r'-',
    r'/': r'',
    r'\^': r'',
    r'^': r'',
    r'±': r'+'
}

def read_edus(filename):
    edus = []
    with open(filename + '.edus', 'r') as f:
        for line in f.readlines():
            edu = str(line.strip())
            for key, value in text_html_map.items():
                edu = edu.replace(key, value)
            edus.append(edu)
    return edus

def read_annotation(filename):
    annot = pd.read_pickle(filename + '.nlp')
    for key, value in text_html_map.items():
        annot['text'] = annot['text'].replace(key, value)
    return annot

In [19]:
annot.keys()

dict_keys(['text', 'tokens', 'sentences', 'postag', 'lemma', 'syntax_dep_tree', 'morph'])

In [20]:
from glob import glob
import pandas as pd


def annot2feat(annot):
    features = []

    for sentence in range(len(annot['sentences'])):
        sentence_features = []
        
        for number_inside_sentence, token in enumerate(range(annot['sentences'][sentence].begin, 
                                                             annot['sentences'][sentence].end)):
            token_text = annot['tokens'][token].text            
            new_token = {
                'lemma': annot['lemma'][sentence][number_inside_sentence],
                'token[-2:]': token_text[-2:].lower(),
                'token.isupper()': token_text.isupper(),
                'token.istitle()': token_text.istitle(),
                'token.isdigit()': token_text.isdigit(),
                'postag': annot['postag'][sentence][number_inside_sentence],
                'synttag': annot['syntax_dep_tree'][sentence][number_inside_sentence].link_name
            }
            
            if token > 0:
                new_token.update({
                    '-1:token[-2:]': annot['tokens'][token-1].text[-2:].lower(),
                    '-1:token.isupper()': annot['tokens'][token-1].text.isupper(),
                    '-1:token.istitle()': annot['tokens'][token-1].text.istitle(),
                    '-1:token.isdigit()': annot['tokens'][token-1].text.isdigit()
                })
                if number_inside_sentence > 0:
                    new_token.update({
                        '-1:lemma': annot['lemma'][sentence][number_inside_sentence-1],
                        '-1:postag': annot['postag'][sentence][number_inside_sentence-1],
                        '-1:synttag': annot['syntax_dep_tree'][sentence][number_inside_sentence-1].link_name
                    })
                else:
                    new_token['BOS'] = True
                    
            if token < annot['sentences'][sentence].end-1:
                new_token.update({
                    #'+1:token.lower()': annot['tokens'][token+1].text.lower(),
                    '+1:lemma': annot['lemma'][sentence][number_inside_sentence+1],
                    '+1:token[-2:]': annot['tokens'][token+1].text[-2:].lower(),
                    '+1:token.isupper()': annot['tokens'][token+1].text.isupper(),
                    '+1:token.istitle()': annot['tokens'][token+1].text.istitle(),
                    '+1:token.isdigit()': annot['tokens'][token+1].text.isdigit(),
                    '+1:postag': annot['postag'][sentence][number_inside_sentence+1],
                    '+1:synttag': annot['syntax_dep_tree'][sentence][number_inside_sentence+1].link_name
                })
            else:
                new_token['EOS'] = True                
                
            sentence_features.append(new_token)
        
        features.append(sentence_features)

    return features


# В rursttreebank элементарная дискурсивная единица может состоять из нескольких предложений
def annot2tags(annot, edus):
    tags = []
    cursor = 0
    
    for sentence in range(len(annot['sentences'])):
        sentence_tags = []
        
        for number_inside_sentence, token_number in enumerate(range(annot['sentences'][sentence].begin, 
                                                                    annot['sentences'][sentence].end)):
            # сравнивать последний токен в ЭДЕ с токеном, над которым указатель, 
            # и если они совпали, то проверять предыдущий текст ЭДЕ с соответствующим оригинальным текстом
            token = annot['tokens'][token_number]
            if len(edus[cursor]) < token.end-token.begin:
                is_last_token = False

            else:
                is_last_token = edus[cursor][token.begin-token.end:] == token.text
                if is_last_token:
                    if edus[cursor][token.begin-token.end-10:] == annot['text'][token.begin-10:token.end]:
                        cursor += 1
                        
            tag = 'EOU' if is_last_token else 'IU'
            sentence_tags.append(tag)
        
        tags.append(sentence_tags)
                    
    return tags

## Split data 

In [21]:
import glob
import os

files = sorted(glob.glob('rst_pairs/*.edus'), key=lambda s: int(os.path.basename(s)[5]))
test = files[::5]
train = [file for file in files if not file in test]

In [22]:
x_train = []
y_train = []

for file in train:
    filename = file[:file.rfind('.edus')]
    edus = read_edus(filename)
    annot = read_annotation(filename)
    features = annot2feat(annot)
    x_train += features
    tags = annot2tags(annot, edus)
    y_train += tags


In [23]:
x_test = []
y_test = []

for file in test:
    filename = file[:file.rfind('.edus')]
    edus = read_edus(filename)
    annot = read_annotation(filename)
    features = annot2feat(annot)
    x_test += features
    tags = annot2tags(annot, edus)
    y_test += tags


## Train models 

EOU - end of unit

IU - inside unit

In [24]:
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
import eli5

crf = CRF(algorithm='lbfgs',
          c1=0.7001583821135307,
          c2=0.15925817780060236,
          max_iterations=100,
          all_possible_transitions=True)

In [25]:
len(x_train[0]), len(y_train[0])

(11, 11)

In [26]:
crf1 = crf.fit(x_train, y_train)

In [27]:
from sklearn_crfsuite.metrics import flat_classification_report

report = flat_classification_report(y_pred=crf1.predict(x_test), y_true=y_test)
print(report)

              precision    recall  f1-score   support

         EOU       0.34      0.02      0.04      1565
          IU       0.97      1.00      0.98     49342

   micro avg       0.97      0.97      0.97     50907
   macro avg       0.65      0.51      0.51     50907
weighted avg       0.95      0.97      0.96     50907



In [30]:
from sklearn_crfsuite.metrics import flat_classification_report

report = flat_classification_report(y_pred=crf1.predict(x_train), y_true=y_train)
print(report)

              precision    recall  f1-score   support

         EOU       0.78      0.09      0.16      5281
          IU       0.98      1.00      0.99    235363

   micro avg       0.98      0.98      0.98    240644
   macro avg       0.88      0.54      0.57    240644
weighted avg       0.98      0.98      0.97    240644



In [29]:
eli5.show_weights(crf1, top=200)

From \ To,EOU,IU
EOU,-3.067,-0.587
IU,-0.34,1.694

Weight?,Feature
Weight?,Feature
+4.781,lemma:минченков
+3.773,lemma:маслова
+3.565,lemma:рыбина
+3.213,lemma:матвейкин
+2.993,lemma:гринько
+2.554,lemma:съемка
+2.505,lemma:кузнецов
+2.457,-1:lemma:омер
+2.457,lemma:бичер
+2.217,-1:lemma:сепаратист

Weight?,Feature
+4.781,lemma:минченков
+3.773,lemma:маслова
+3.565,lemma:рыбина
+3.213,lemma:матвейкин
+2.993,lemma:гринько
+2.554,lemma:съемка
+2.505,lemma:кузнецов
+2.457,-1:lemma:омер
+2.457,lemma:бичер
+2.217,-1:lemma:сепаратист

Weight?,Feature
+2.165,-1:lemma:речь
+2.065,token[-2:]:«
+1.943,token[-2:]:[
+1.834,token[-2:]:(
+1.827,-1:token[-2:]:ол
+1.723,-1:lemma:термин
+1.684,token[-2:]:/
+1.639,-1:lemma:контекст
+1.560,synttag:det
+1.560,-1:token[-2:]:ч


In [387]:
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report, make_scorer
import scipy.stats
import eli5
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats

In [388]:
X = x_test + x_train
y = y_test + y_train

In [None]:
# define fixed parameters and parameters to search
crf3 = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='macro', labels=['IU', 'EOU'])

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X, y)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


In [390]:
print('Best parameters:', rs.best_params_)
print('Best CV score:', rs.best_score_)
print('Model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

Best parameters: {'c1': 0.7001583821135307, 'c2': 0.15925817780060236}
Best CV score: 0.49418972728084615
Model size: 0.01M


In [392]:
# We sort the tags a bit so that they appear in an orderly fashion in the classification report
sorted_labels = sorted(
    ['IU', 'EOU'],
    key=lambda name: (name[1:], name[0]))

In [393]:
#Now we create the model again using the best estimators

crf3 = rs.best_estimator_
y_pred = crf3.predict(X)

print(metrics.flat_classification_report(
    y, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

         EOU      0.667     0.000     0.001      6846
          IU      0.977     1.000     0.988    284705

   micro avg      0.977     0.977     0.977    291551
   macro avg      0.822     0.500     0.494    291551
weighted avg      0.969     0.977     0.965    291551



In [394]:
eli5.show_weights(crf3, top=100)

From \ To,EOU,IU
EOU,-2.881,-0.334
IU,-0.318,1.481

Weight?,Feature
Weight?,Feature
+0.803,synttag:punct
+0.716,synttag:cc
+0.474,token.isdigit()
+0.416,synttag:fixed
+0.326,-1:synttag:obl
+0.297,synttag:flat
+0.278,+1:token.isupper()
+0.244,postag:ADP
+0.225,-1:postag:VERB
+0.201,-1:synttag:obj

Weight?,Feature
0.803,synttag:punct
0.716,synttag:cc
0.474,token.isdigit()
0.416,synttag:fixed
0.326,-1:synttag:obl
0.297,synttag:flat
0.278,+1:token.isupper()
0.244,postag:ADP
0.225,-1:postag:VERB
0.201,-1:synttag:obj

Weight?,Feature
2.492,synttag:det
1.785,postag:NUM
1.389,BOS
1.247,postag:PART
1.099,postag:ADJ
1.098,synttag:aux:pass
0.977,postag:ADV
0.741,synttag:acl
0.733,postag:VERB
0.729,postag:PRON


In [401]:
eli5.explain_weights_df(crf3)

  .format(attr, ', '.join(other_attrs)))


Unnamed: 0,from,to,coef
0,EOU,EOU,-3.171644
1,EOU,IU,-0.505887
2,IU,EOU,-0.365159
3,IU,IU,1.65457
