In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

Scripts for converting isanlp.DiscourseUnit RST representation to ``dummy_format_data`` with the following keys:
- InputDocs : list of lists with plain tokens of each document
- EduBreak_TokenLevel : list of lists with the token positions of right EDU ends of each document
- SentBreak : self-describing
- Docs_structure : list of lists with the descriptions of binary relations in the document tree in a format such as ``(2:Satellite=Cause:2,3:Nucleus=Cause)  # EDU on left, EDU on right
(1:Nucleus=span:1,2:Satellite=Elaboration:3)  # one is EDU, other is not
(1:Nucleus=Sequence:3,4:Nucleus=Sequence:6)  # nether of them is EDU``

We need two data directories for this notebook:
- ``corpus_du/`` containing pickled RST corpus in isanlp.DiscourseUnit format (*.du)
- ``annots/`` containing pickled isanlp annotation (texts, tokens, sentences, etc.) (*.annot.pkl)

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
import re
from tqdm import tqdm
from gensim.models import KeyedVectors
from tqdm import tqdm

In [30]:
get_edu_breaks(trees, annot)

[10, 16, 28, 38, 64, 76, 86, 107, 122]

In [31]:
annot['text']

'Третий урок состоит в том, что Афганистан можно бросить, но он не бросит вас, если вы уже ввязались в «большую игру» в нем. Афганистан настиг Россию в 1992 году на таджикско-афганской границе. Заключение ДКБ (Договора о коллективной безопасности) в мае 1992 года в Ташкенте было во многом реакцией на «экспорт Афганистана» в СНГ. А потому в течение всех 90-х годов рефреном звучит один тезис: нам необходимо тем или иным способом присутствовать в Афганистане. И Россия (несмотря на всю сегодняшнюю пропаганду о «годах провала») пыталась держать «руку на пульсе» (поддерживая «панджшерского льва» Ахмад Шах Масуда и объединенный фронт антиталибовской оппозиции).'

In [36]:
def get_input_docs(doc_annot: dict):
    """ InputDocs : list of lists with plain tokens of each document """
    return [token.text for token in doc_annot.get('tokens')]

def get_edu_breaks(doc_trees: list, doc_annot: dict):
    """ EduBreak_TokenLevel : list of lists with the token positions of right EDU ends of each document """
    
    def extr_edus(tree, begin):
        if tree.relation == 'elementary':
            return [(tree.start-begin, tree.end-begin)]
        else:
            tt = []
            tt += extr_edus(tree.left, begin=begin)
            tt += extr_edus(tree.right, begin=begin)
        return tt
    
    def map_offset_to_tokens(offset):
        begin, end = -1, -1
        for i, token in enumerate(doc_annot['tokens']):
            if begin == -1 and token.begin > offset[0]:
                begin = i - 1
            if begin != -1:
                if token.end > offset[1]:
                    end = i - 1
                    return begin, end
        return begin, i-1
    
    edus = []
    for tree in doc_trees:
        begin = tree.start
        edus += extr_edus(tree, begin=tree.start)
    
    return [map_offset_to_tokens(offset)[1] for offset in edus]

def get_sentence_breaks(doc_annot: dict):
    """ SentBreak for sentence breaks in terms of token offsets """
    return [sentence.end - 1 for sentence in annot.get('sentences')]

def leftmostid(tree):
    if tree.left:
        return leftmostid(tree.left)
    return tree.id

def rightmostid(tree):
    if tree.right:
        return rightmostid(tree.right)
    return tree.id

# ['antithesis_NS', 'attribution_NS', 'attribution_SN', 'background_NS', 'background_SN', 
# 'cause-effect_NS', 'cause-effect_SN', 'cause_NS', 'cause_SN', 
# 'comparison_NN', 'concession_NS', 'concession_SN', 'conclusion_NS', 'conclusion_SN', 
# 'condition_NS', 'condition_SN', 'contrast_NN', 'effect_NS', 'effect_SN', 
# 'elaboration_NS', 'elaboration_SN', 'evaluation_NS', 'evaluation_SN', 'evidence_NS', 
# 'evidence_SN', 'interpretation-evaluation_NS', 'interpretation-evaluation_SN', 'interpretation_NS', 
# 'joint_NN', 'preparation_SN', 'purpose_NS', 'purpose_SN', 'restatement_NN', 'same-unit_NN', 
# 'sequence_NN', 'solutionhood_NS', 'solutionhood_SN', 'span_NS']


true_relations = ['attribution_NS', 'attribution_SN', 'background_NS',
       'cause-effect_NS', 'cause-effect_SN',
       'comparison_NN', 'concession_NS', 'condition_NS', 'condition_SN',
       'contrast_NN', 'elaboration_NS', 'evidence_NS',
       'interpretation-evaluation_NS', 'interpretation-evaluation_SN',
       'joint_NN', 'preparation_SN', 'purpose_NS', 'purpose_SN',
       'restatement_NN', 'same-unit_NN', 'sequence_NN', 'solutionhood_SN']

def correct_relations(rel: str, nuc: str):
    target_map = {
        'relation': 'joint',
        'antithesis': 'contrast',
        'cause': 'cause-effect',
        'effect': 'cause-effect',
        'conclusion': 'restatement',
        'interpretation': 'interpretation-evaluation',
        'evaluation': 'interpretation-evaluation',
        'motivation': 'condition',
        'span': 'attribution'  # somehow it is the case for two relation in dataset
    }
    
    if rel in target_map:
        rel = target_map.get(rel)
        
    relation_map = {
        'restatement_SN': 'restatement_NN',
        'restatement_NS': 'restatement_NN',
        'contrast_SN': 'contrast_NN',
        'contrast_NS': 'contrast_NN',
        'solutionhood_NS': 'elaboration_NS',
        'preparation_NS': 'elaboration_NS',
        'concession_SN': 'preparation_SN',
        'evaluation_SN': 'preparation_SN',
        'elaboration_SN': 'preparation_SN',
        'evidence_SN': 'preparation_SN',
        'background_SN': 'preparation_SN'
    }
    
    full_rel = rel + '_' + nuc
    if full_rel in relation_map:
        full_rel = relation_map.get(full_rel)
        rel, nuc = full_rel.split('_')
        
    if not full_rel in true_relations:
        print(rel, nuc, full_rel)
    
    return rel, nuc

def du_to_docs_structure(tree: dict, du_counter: int):

        
    if tree.relation != 'elementary':
        tree.relation, tree.nuclearity = correct_relations(tree.relation, tree.nuclearity)
        
        left_nuclearity = 'Satellite' if tree.nuclearity == 'SN' else 'Nucleus'
        right_nuclearity = 'Satellite' if tree.nuclearity == 'NS' else 'Nucleus'
        left_relation = tree.relation
        right_relation = tree.relation
        
        left_id_1 = leftmostid(tree.left) + du_counter
        left_id_2 = rightmostid(tree.left) + du_counter
        right_id_1 = leftmostid(tree.right) + du_counter
        right_id_2 = rightmostid(tree.right) + du_counter
            
        if left_nuclearity == 'Satellite':
            right_relation = 'span'
        
        if right_nuclearity == 'Satellite':
            left_relation = 'span'
            
        relstring_l = f'{left_id_1}:{left_nuclearity}={left_relation}:{left_id_2}'
        relstring_r = f'{right_id_1}:{right_nuclearity}={right_relation}:{right_id_2}'

        left_subtree_struct = du_to_docs_structure(tree.left, du_counter) or []
        right_subtree_struct = du_to_docs_structure(tree.right, du_counter) or []
        return [f'({relstring_l},{relstring_r})'] + left_subtree_struct + right_subtree_struct
    
    
def collect_edus(docs_structure):
    edus_id = []
    for entry in docs_structure:
        left, right = entry.split(',')
        left = left.replace('(', '').split(':')
        du1, du2 = left[0], left[2]
        if du1 == du2:
            edus_id.append(int(du1))
        
        right = right.replace(')', '').split(':')
        du1, du2 = right[0], right[2]
        if du1 == du2:
            edus_id.append(int(du1))
    return edus_id
    
def get_docs_structure(doc_trees: list):
    result = []
    du_counter = 0
    for tree in doc_trees:
        structure = du_to_docs_structure(tree, du_counter)
        if structure:
            result += structure
            du_counter += len(structure)
        else:
            du_counter += 1
    return result

In [37]:
train_filenames = ['news2_47', 'blogs_0', 'news2_14', 'news1_39', 'blogs_14',
       'blogs_27', 'news2_41', 'blogs_45', 'news1_70', 'blogs_71',
       'blogs_84', 'blogs_8', 'blogs_16', 'blogs_92', 'blogs_28',
       'news2_26', 'blogs_4', 'blogs_46', 'blogs_44', 'news1_41',
       'news1_21', 'news2_29', 'news1_33', 'blogs_48', 'blogs_88',
       'news1_18', 'blogs_43', 'news2_22', 'blogs_96', 'news2_18',
       'news2_39', 'news2_42', 'blogs_97', 'news2_31', 'news2_2',
       'blogs_23', 'blogs_82', 'news2_36', 'blogs_18', 'news2_28',
       'blogs_90', 'blogs_51', 'blogs_36', 'news1_49', 'blogs_98',
       'news1_51', 'news2_12', 'news1_54', 'blogs_73', 'blogs_50',
       'news2_9', 'blogs_3', 'blogs_34', 'blogs_32', 'blogs_13',
       'news1_45', 'news1_13', 'news1_10', 'blogs_62', 'blogs_66',
       'blogs_74', 'blogs_29', 'blogs_77', 'blogs_65', 'blogs_53',
       'blogs_55', 'blogs_7', 'blogs_67', 'news2_40', 'news2_46',
       'news1_35', 'blogs_95', 'news2_32', 'news1_72', 'news1_9',
       'blogs_93', 'blogs_58', 'news1_17', 'news2_27', 'news1_58',
       'news2_24', 'news1_46', 'blogs_37', 'blogs_25', 'blogs_81',
       'news1_38', 'blogs_35', 'blogs_59', 'blogs_2', 'news1_56',
       'blogs_24', 'blogs_94', 'news2_45', 'blogs_75', 'news1_14',
       'news2_25', 'blogs_11', 'blogs_80', 'blogs_40', 'news1_52',
       'news1_32', 'news2_33', 'news1_71', 'blogs_12', 'blogs_38',
       'blogs_70', 'news2_5', 'news2_20', 'news2_15', 'blogs_87',
       'blogs_56', 'blogs_78', 'blogs_91', 'news1_4', 'news1_3',
       'blogs_85', 'news1_62', 'blogs_68', 'blogs_47', 'news1_26',
       'blogs_6', 'news1_34', 'blogs_41', 'blogs_42', 'news1_30',
       'news1_61', 'news2_17', 'news1_55', 'news1_48', 'news2_37',
       'news1_69', 'news1_7', 'news2_7', 'news1_63', 'news1_73',
       'news2_3', 'blogs_102', 'news1_53', 'news2_19', 'news1_43',
       'blogs_101', 'news1_65', 'news1_27', 'news2_6', 'news1_8',
       'news1_37', 'blogs_1', 'news1_11', 'news1_64', 'news2_8',
       'blogs_61', 'news2_35', 'news1_16', 'blogs_89', 'news1_67',
       'news1_1', 'blogs_79', 'news1_31', 'news1_66', 'news1_5',
       'news2_10', 'news1_12', 'news1_74', 'news1_68', 'news1_15',
       'news1_75', 'news2_0', 'news1_2']

dev_filenames = ['blogs_26', 'blogs_9', 'blogs_19', 'news2_43', 'blogs_15',
       'blogs_54', 'news1_20', 'news1_22', 'blogs_22', 'news1_44',
       'blogs_33', 'blogs_100', 'blogs_103', 'blogs_10', 'blogs_20',
       'news1_60', 'news2_44', 'news1_36', 'news1_59', 'news2_23',
       'news2_30', 'blogs_30', 'news2_11', 'news1_6', 'blogs_49',
       'news1_50', 'news2_1', 'news1_79', 'news2_13', 'blogs_64',
       'news1_76', 'blogs_83', 'news2_49', 'blogs_57', 'news2_21',
       'blogs_5', 'blogs_76', 'news1_19', 'news1_40', 'news1_57']

test_filenames = ['blogs_21', 'news2_34', 'blogs_52', 'blogs_17', 'news1_47',
       'blogs_99', 'blogs_72', 'blogs_60', 'news1_25', 'news1_28',
       'blogs_63', 'news2_38', 'news2_48', 'blogs_69', 'blogs_39',
       'blogs_31', 'news1_23', 'blogs_86', 'news1_78', 'news2_4',
       'news1_77', 'news1_24', 'news1_29', 'news1_42', 'news2_16']

In [38]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.processor_razdel import ProcessorRazdel


ppl = PipelineCommon([
    (ProcessorRazdel(), ['text'],
    {'tokens': 'tokens',
     'sentences': 'sentences'}),
])

In [39]:
train = {
    'InputDocs': [],
    'EduBreak_TokenLevel': [],
    'SentBreak': [],
    'Docs_structure': [],
}

dev = {
    'InputDocs': [],
    'EduBreak_TokenLevel': [],
    'SentBreak': [],
    'Docs_structure': [],
}

test = {
    'InputDocs': [],
    'EduBreak_TokenLevel': [],
    'SentBreak': [],
    'Docs_structure': [],
}

for file in tqdm(glob.glob('corpus_du/*.du')):
    
    trees = [pickle.load(open(file, 'rb'))]
    annot = ppl(trees[0].text)
    
    edus = get_edu_breaks(trees, annot)
    if len(edus) > 1:
        clear_filename = file.replace('corpus_du/', '').split('_part_')[0]
        if clear_filename in train_filenames:    
            train['InputDocs'].append(get_input_docs(annot))
            train['EduBreak_TokenLevel'].append(get_edu_breaks(trees, annot))
            train['SentBreak'].append(get_sentence_breaks(annot))
            train['Docs_structure'].append(get_docs_structure(trees))

        elif clear_filename in dev_filenames:
            dev['InputDocs'].append(get_input_docs(annot))
            dev['EduBreak_TokenLevel'].append(get_edu_breaks(trees, annot))
            dev['SentBreak'].append(get_sentence_breaks(annot))
            dev['Docs_structure'].append(get_docs_structure(trees))

        elif clear_filename in test_filenames:
            test['InputDocs'].append(get_input_docs(annot))
            test['EduBreak_TokenLevel'].append(get_edu_breaks(trees, annot))
            test['SentBreak'].append(get_sentence_breaks(annot))
            test['Docs_structure'].append(get_docs_structure(trees))

100%|██████████| 2703/2703 [00:03<00:00, 793.61it/s]


In [40]:
len(train['InputDocs'])

1715

In [41]:
def count_rels(corpus):
    for line in corpus:
        line = line[1:-1].split(':')
        left_id1, left_nuc, left_rel

In [42]:
dd = train['Docs_structure'][0]
dd

['(1:Nucleus=span:4,5:Satellite=elaboration:11)',
 '(1:Nucleus=span:1,2:Satellite=elaboration:4)',
 '(2:Nucleus=span:2,3:Satellite=cause-effect:4)',
 '(3:Nucleus=joint:3,4:Nucleus=joint:4)',
 '(5:Nucleus=joint:7,8:Nucleus=joint:11)',
 '(5:Satellite=attribution:5,6:Nucleus=span:7)',
 '(6:Nucleus=comparison:6,7:Nucleus=comparison:7)',
 '(8:Nucleus=span:10,11:Satellite=concession:11)',
 '(8:Nucleus=same-unit:9,10:Nucleus=same-unit:10)',
 '(8:Nucleus=span:8,9:Satellite=attribution:9)']

In [43]:
train['EduBreak_TokenLevel']

[[24, 27, 30, 33, 43, 54, 68, 72, 76, 86, 101],
 [17, 24, 34, 50, 74, 89, 96, 100, 105, 112, 118, 125, 134, 153, 156],
 [12, 19, 32, 34, 43, 50, 65, 72, 77, 86, 105],
 [0, 2, 15, 31, 34, 37, 41, 55, 60],
 [9, 12, 20, 0, 29, 35],
 [9,
  18,
  33,
  36,
  46,
  49,
  52,
  61,
  64,
  69,
  72,
  79,
  89,
  94,
  101,
  117,
  127,
  133,
  146,
  152,
  158,
  167],
 [15, 35, 42, 44, 63, 70, 78, 86, 91, 98],
 [3, 9, 15, 22, 29, 55, 59, 75],
 [1, 0, 17, 32, 54, 59, 66, 72, 88, 103, 108, 126, 134, 151, 156, 169, 178],
 [2, 17, 28, 36, 38, 41, 48, 55, 62, 70, 74],
 [27, 52],
 [6,
  19,
  32,
  38,
  45,
  50,
  56,
  69,
  76,
  84,
  88,
  93,
  97,
  102,
  108,
  116,
  147,
  153,
  161,
  173,
  180,
  184,
  194,
  199],
 [7, 15, 23, 25, 31, 37, 45, 50, 57, 71],
 [8, 16, 25, 45, 57],
 [13, 20, 32, 35, 40, 46, 55, 65, 70, 84, 90, 97, 105, 140],
 [10, 24, 30, 49, 56, 65, 76, 82, 88, 97, 102, 112, 113],
 [8, 11, 40, 56],
 [13, 33, 38, 42, 59],
 [6, 19, 21, 27, 34, 41, 47],
 [7,
  18,
 

In [44]:
for i, struct in enumerate(train['Docs_structure']):
    if not struct:
        print(train['EduBreak_TokenLevel'][i])

In [45]:
def collect_edus(docs_structure):
    edus_id = []
    for entry in docs_structure:
        left, right = entry.split(',')
        left = left.replace('(', '').split(':')
        du1, du2 = left[0], left[2]
        if du1 == du2:
            edus_id.append(int(du1))
        
        right = right.replace(')', '').split(':')
        du1, du2 = right[0], right[2]
        if du1 == du2:
            edus_id.append(int(du1))
    return edus_id

In [46]:
max(collect_edus(dd))

11

In [48]:
output_dir = '../processed_data'
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
    
with open(os.path.join(output_dir, 'train_approach1'), 'wb') as f:
    pickle.dump(train, f)
    
with open(os.path.join(output_dir, 'dev_approach1'), 'wb') as f:
    pickle.dump(dev, f)
    
with open(os.path.join(output_dir, 'test_approach1'), 'wb') as f:
    pickle.dump(test, f)

Word2vec: lowercase, remove postags

In [14]:
with open('../data/model.txt', 'r') as f:
    lines = f.readlines()

In [15]:
! cp ../data/model.txt ../data/model_bckp.txt

In [20]:
with open('../data/model.txt', 'w') as f:
    for line in tqdm(lines):
        new_line = line.strip()
        key = new_line.split(' ')[0]
        value = new_line.split(' ')[1:]
        if '_' in key:
            key = key.split('_')[0]
        new_line = (key + '\t' + ' '.join(value)).lower()    
        f.write(new_line + '\n')

100%|██████████| 249334/249334 [00:10<00:00, 23903.95it/s]


In [20]:
! cat ../data/model.txt | wc -l

249334


In [19]:
! nvidia-smi

Fri Dec 17 14:41:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:01:00.0 Off |                  N/A |
| 61%   68C    P2    72W / 250W |   7888MiB / 11016MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:04:00.0 Off |                  N/A |
| 54%   65C    P2   176W / 250W |   5156MiB / 11019MiB |     59%      Defaul

In [22]:
# -*- coding: utf-8 -*-

import torch


class Embedding(object):

    def __init__(self, tokens, vectors, unk=None):
        self.tokens = tokens
        self.vectors = torch.tensor(vectors)
        self.pretrained = {w: v for w, v in zip(tokens, vectors)}
        self.unk = unk

    def __len__(self):
        return len(self.tokens)

    def __contains__(self, token):
        return token in self.pretrained

    @property
    def dim(self):
        return self.vectors.size(1)

    @property
    def unk_index(self):
        if self.unk is not None:
            return self.tokens.index(self.unk)
        else:
            raise AttributeError

    @classmethod
    def load(cls, path, unk=None):
        with open(path, 'r') as f:
            lines = [line for line in f][1:]
        splits = [line.split() for line in lines]
        tokens, vectors = zip(*[(s[0], list(map(float, s[1:])))
                                for s in splits])

        return cls(tokens, vectors, unk=unk)


In [23]:
emb = Embedding.load('../data/model.txt')

In [26]:
for tok in emb.tokens:
    if 'одна' in tok:
        print(f'--{tok}--')

--однако--
--однажды--
--производная--
--однакож--
--однажда--
--однажда--
--производная--
--проходная--
--однажды--
--однакоже--
--проходная--
--одна--
--поднадзорный--
--росприроднадзор--
--подначивать--
--продналог--
--поднатореть--
--одна́--
--международная--
--подначальный--
--поднатужиться--
--одна--
--подначка--
--однакож--
--однако--
--поднадоеть--
--одна--
--поднажать--
--роднай--
--боднар--
--родная--
--одна--
--судоходна--


In [None]:
'unknown' in emb.tokens

In [None]:
emb.tokens[:10]