In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off Jedi-based tab completion in IPython.
%config IPCompleter.use_jedi=False

In [None]:
# ! /notebook/py39/bin/pip install -U git+https://github.com/tchewik/dis2du.git

In [None]:
import os, sys
import re
import pickle
import glob
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
import xml

from dis2du.read_dis import read_dis
from dis2du.tree import RSTTree
from isanlp.annotation_rst import DiscourseUnit

from tqdm import tqdm
# Enable tqdm's pandas integration.
tqdm.pandas()

## Read RS3 files into isanlp.DiscourseUnit annotations
input:
 - corpus with .rs3 files
output:
 - ``data_ru/corpus_du/SPLIT.FILENAME_part_N.du``  - pickled isanlp DiscourseUnit for tree number N of the original FILENAME.rs3 (SPLIT is train, dev or test)

In [None]:
%%bash
# Re-extract the RuRSTreebank archive from a clean state.
# -e: stop immediately if `cd corpora/` fails, instead of running rm/unzip in
# whatever directory the notebook happens to be in.
set -e

cd corpora/
# -f: a first run has no previous extraction to delete; that is not an error.
rm -rf RuRSTreebank_jul22/
unzip -q RuRSTreebank_jul22.zip

#### 1. Split dataset files into separate trees

In [None]:
from utils.dataset.rs3_forest_splitter import RS3ForestSplitter

splitter = RS3ForestSplitter()

output_dir = 'data_ru/corpus_rs3'
! rm -r data_ru
! mkdir data_ru
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
    os.mkdir(os.path.join(output_dir, 'train/'))
    os.mkdir(os.path.join(output_dir, 'dev/'))
    os.mkdir(os.path.join(output_dir, 'test/'))

for part in ('train', 'dev', 'test'):
    for corpus in ('news1', 'news2', 'blogs'):
        for filename in tqdm(glob.glob(os.path.join('corpora', 'RuRSTreebank_jul22', corpus, part, '*.rs3'))):
            splitter(filename, os.path.join(output_dir, part))
    
    for file in glob.glob(os.path.join(output_dir, part, '*.rs3')):
        new_filename = part + '.' + os.path.basename(file)
        with open(os.path.join(output_dir, new_filename), 'w') as f:
            f.write(open(file, 'r').read())
        os.remove(file)
    
for part in ('train', 'dev', 'test'):
    os.rmdir(os.path.join(output_dir, part))

In [None]:
# Number of train trees: `ls` prints one line per matching file, `wc -l` counts them.
! ls -laht data_ru/corpus_rs3/train.* | wc -l

In [None]:
# Number of dev trees: `ls` prints one line per matching file, `wc -l` counts them.
! ls -laht data_ru/corpus_rs3/dev.* | wc -l

In [None]:
# Number of test trees: `ls` prints one line per matching file, `wc -l` counts them.
! ls -laht data_ru/corpus_rs3/test.* | wc -l

#### 2. Convert them all to *.dis files

Using https://github.com/rst-workbench/rst-converter-service

In [None]:
from utils.dataset.rst2dis_converter import split_seq, RST2DISConverter

BASE_URL = 'localhost:5000'  # <- put rst converter address here
THREADS = 10
OUTPUT_DIR = 'data_ru/corpus_dis'

if os.path.isdir(OUTPUT_DIR):
    ! rm -r $OUTPUT_DIR
os.mkdir(OUTPUT_DIR)

# (!) Jupyter kernel does not indicate the connection with the multiprocess IO operations
# keep watching on docker if necessary
files = glob.glob(f'data_ru/corpus_rs3/*.rs3')
for batch in split_seq(files, THREADS):
    t = RST2DISConverter(BASE_URL, batch, output_dir=OUTPUT_DIR)
    t.start()

Check overall number of trees

In [None]:
# Number of converted trees: one `ls` line per *.dis file, counted by `wc -l`.
! ls -lath data_ru/corpus_dis/*.dis | wc -l

Replace the `#####` marker with another one, because the .dis file reader would otherwise omit it

In [None]:
# Rewrite every *.dis file in place, turning each '##### ' marker (including its
# trailing space) into the '_NEW_LINE_' token.
dis_paths = glob.glob(os.path.join('data_ru', 'corpus_dis', '*.dis'))
for file in dis_paths:
    with open(file, 'r') as source:
        raw_txt = source.read()
    tree_txt = raw_txt.replace('##### ', '_NEW_LINE_')
    with open(file, 'w') as target:
        target.write(tree_txt)

In [None]:
# Greedy match: everything from the first '_!' to the last '_!' on a line.
pattern = re.compile('_!(.*)_!')

# Rewrite every *.dis file in place. Lines mentioning 'IMG' get their whole
# '_!..._!' span collapsed to '_!IMG_!'; all other lines are kept verbatim.
for file in glob.glob(os.path.join('data_ru', 'corpus_dis', '*.dis')):
    with open(file, 'r') as f:
        tree_lines = f.readlines()
    masked_lines = [
        pattern.sub('_!IMG_!', line) if 'IMG' in line else line
        for line in tree_lines
    ]
    with open(file, 'w') as f:
        f.writelines(masked_lines)

#### 3. Collect DiscourseUnit annotations for isanlp library

output:
 - ``data_ru/corpus_du/file.du`` - DiscourseUnit tree annotation
 - ``data_ru/data/file.txt`` - Original text collected directly from the annotation
 - ``data_ru/data/file.edus``  - Text file with EDUs from .rs3 - each line contains one EDU
 - ``data_ru/all_pairs.fth`` - All the relation pairs from the corpus

In [None]:
# Remove trees pickled by a previous run (rm reports an error if there are none).
! rm -r data_ru/corpus_du

In [None]:
# NOTE(review): this wildcard import may rebind `read_dis`, which was already
# imported from dis2du at the top of the notebook — confirm which one is meant.
from utils.dataset.dis_file_reading import *

# Parse every *.dis file and pickle the resulting tree as <same name>.du.
input_dir = 'data_ru/corpus_dis'
output_dir = 'data_ru/corpus_du'
os.makedirs(output_dir, exist_ok=True)

# A try/except that appended unparsable files to `failed` was disabled here, so
# the loop stops at the first error and `failed` stays empty. The name is kept
# because the next cell displays it.
failed = []
for file in tqdm(glob.glob(os.path.join(input_dir, '*.dis'))):
    tree = read_dis(file, force_brackets=False)
    # Swap only the extension: str.replace('.dis', '.du') would also rewrite a
    # '.dis' in the middle of a name, and splitting on '/' is not portable.
    output_file = os.path.splitext(os.path.basename(file))[0] + '.du'
    with open(os.path.join(output_dir, output_file), 'wb') as f:
        pickle.dump(tree, f)

In [None]:
# Files collected in `failed` by the previous cell: bugs in the annotation; the
# number in each file name corresponds to the tree number.
sorted(failed)  # Bugs in the annotation, number corresponds to the tree number

Collect text files and edus.

In [None]:
# Recreate an empty data_ru/data directory for the text, EDU and annotation files
# written by the following cells (rm reports an error on a first run).
! rm -r data_ru/data
! mkdir data_ru/data

In [None]:
def extr_edus(tree):
    """Return the texts of the elementary units (leaves) of ``tree``, left to right.

    A node is a leaf when its ``relation`` equals ``'elementary'``; any other
    node must provide ``left`` and ``right`` children.
    """
    if tree.relation != 'elementary':
        return extr_edus(tree.left) + extr_edus(tree.right)
    return [tree.text]

# Re-initialised in a later cell before it is actually filled.
all_pairs = []
# Token standing in for a line break inside the tree texts.
LINES_DELIM = '_NEW_LINE_'

# One iteration per source document: its "_part_0" tree identifies the document,
# and all of its parts are then gathered in numeric order.
for orig_filename in glob.glob(os.path.join('data_ru', 'corpus_du', '*part_0.du')):
    # Order by the last number in the path (the part index). A plain string sort
    # would put part 10 before part 2. The pattern is a raw string because '\d'
    # in a normal literal is an invalid escape sequence.
    part_filenames = sorted(glob.glob(orig_filename.replace('_0.du', '_*.du')),
                            key=lambda x: int(re.findall(r"(\d+)", x)[-1]))

    text = ''
    all_edus = []
    for du_filename in part_filenames:
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        edus = extr_edus(tree)
        all_edus += edus
        text += ' ' + ' '.join(edus)

    filename = os.path.basename(orig_filename).replace('_part_0.du', '')

    # Write EDUs, one per line, with the line-break token removed
    with open(os.path.join('data_ru', 'data', filename + '.edus'), 'w') as f:
        f.write('\n'.join([edu.strip().replace(LINES_DELIM, '') for edu in all_edus]))
        f.write('\n')

    # Write the text, with the line-break token turned back into real newlines
    text = text.replace(LINES_DELIM, '\n')
    with open(os.path.join('data_ru', 'data', filename + '.txt'), 'w') as f:
        f.write(text.strip())

Align trees with the original texts (collect ``start`` and ``end`` for each node).

In [None]:
import os
import pandas as pd


def align_du2text(tree, text, start=None, end=None, lines_delim=None):
    """Assign character offsets (``start``/``end``) to every node of ``tree``.

    The node text is normalised first: each ``lines_delim`` token becomes a
    newline and surrounding whitespace is stripped. The node is then anchored at
    ``start`` if it is given (``end`` is ignored in that case), otherwise at
    ``end``; the other boundary follows from the length of the normalised text.
    If neither is given, the node's existing offsets are left untouched.
    Children are aligned recursively: the left child shares the parent's start,
    the right child shares the parent's end.

    Args:
        tree: node with ``text``, ``relation`` and, unless the relation is
            ``'elementary'``, ``left`` and ``right`` children. Modified in place.
        text: the document text; accepted for interface compatibility and only
            passed down the recursion.
        start: offset of the first character of the node, or None.
        end: offset just past the last character of the node, or None.
        lines_delim: line-break token to replace; defaults to the notebook-level
            ``LINES_DELIM``.

    Returns:
        The same ``tree`` object.
    """
    if lines_delim is None:
        lines_delim = LINES_DELIM

    tree.text = tree.text.replace(lines_delim, '\n').strip()

    # Identity checks against None: offset 0 is a valid position.
    if start is not None:
        tree.start = start
        tree.end = tree.start + len(tree.text)

    elif end is not None:
        tree.end = end
        tree.start = tree.end - len(tree.text)

    if tree.relation != 'elementary':
        tree.left = align_du2text(tree.left, text, start=tree.start, lines_delim=lines_delim)
        tree.right = align_du2text(tree.right, text, end=tree.end, lines_delim=lines_delim)

    return tree


def extr_pairs(tree, filename):
    """List one row per non-leaf node of ``tree``, parent first, then left and right subtrees.

    Each row is ``[left text, right text, left start, right start, relation,
    nuclearity, filename]``. A node with a falsy ``left`` contributes nothing.
    """
    if not tree.left:
        return []

    left, right = tree.left, tree.right
    row = [left.text, right.text,
           left.start, right.start,
           tree.relation, tree.nuclearity, filename]
    return [row] + extr_pairs(left, filename) + extr_pairs(right, filename)

In [None]:
# Accumulator for the relation rows collected by the alignment loop below.
# Re-run this cell before re-running that loop, otherwise rows are appended twice.
all_pairs = []

In [None]:
# Align every tree with the text of its document, collect its relation pairs and
# write the aligned tree back over the original pickle.
for file in tqdm(glob.glob(os.path.join('data_ru', 'data', '*.txt'))):
    with open(file, 'r') as f:
        text = f.read()
    # Strip only the final extension (str.replace would drop every '.txt').
    filename = os.path.splitext(os.path.basename(file))[0]

    for du_filename in sorted(glob.glob(os.path.join('data_ru', 'corpus_du', filename + '_part_*'))):
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        tree_text = tree.text.strip().replace(LINES_DELIM, '\n').strip()
        start = text.find(tree_text)
        if start == -1:
            # Tree text not found in the document: report the file but keep
            # going, so the -1 offset shows up in the loc_x == -1 sanity check.
            print(du_filename)
        tree = align_du2text(tree, text, start=start, end=start + len(tree_text))
        all_pairs += extr_pairs(tree, filename=filename)
        with open(du_filename, 'wb') as f:
            pickle.dump(tree, f)

In [None]:
# Tabulate the collected rows; the column order mirrors the rows built by extr_pairs.
PAIR_COLUMNS = ['snippet_x', 'snippet_y', 'loc_x', 'loc_y', 'category_id', 'order', 'filename']
pairs = pd.DataFrame(all_pairs, columns=PAIR_COLUMNS)
# Displayed: shape of the array of distinct relation names.
pairs['category_id'].unique().shape

In [None]:
# Drop exact duplicate rows, then display the remaining (rows, columns) shape.
pairs = pairs.drop_duplicates()
pairs.shape

In [None]:
# Sanity checks: every shape printed below should report zero rows.

missing_x = pairs[pairs.loc_x == -1]
missing_y = pairs[pairs.loc_y == -1]
print(missing_x.shape, missing_y.shape)  # Units that were not found in the source text

misordered = pairs[pairs.loc_x > pairs.loc_y]
print(misordered.shape)  # Left unit located after the right one

unnamed = pairs[pairs.category_id.isna()]
span_named = pairs[pairs.category_id == 'span']
print(unnamed.shape, span_named.shape)  # Relation names that were parsed incorrectly

In [None]:
# Combined label per pair: relation name and nuclearity joined by '_'.
labels = pairs.category_id + '_' + pairs.order

In [None]:
# Frequency of every relation_nuclearity label.
labels.value_counts()

In [None]:
from utils.dataset.rename_relations import rename_relations

# Apply the project's relation renaming. The cells below read a `relation`
# column from the result — presumably added by rename_relations; confirm in
# utils/dataset/rename_relations.py.
pairs = rename_relations(pairs)

In [None]:
# Number of distinct values in the `relation` column.
len(pairs.relation.unique())

In [None]:
# Frequency of every value in the `relation` column.
pairs.relation.value_counts()

In [None]:
# Save all pairs. reset_index() moves the current index (left with gaps by
# drop_duplicates) into a column and restores a default one before writing.
pairs.reset_index().to_feather(os.path.join('data_ru', 'all_pairs.fth'))

## Annotate the texts with isanlp 
output:
 - ``file.annot.pkl``  - Morphosyntactic annotation in isanlp format

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.processor_razdel import ProcessorRazdel


# Address of the remote processor used by the second pipeline step.
host_spacy = ''  # <- set the hostname
port_spacy = '3334'  # <- and the port

# Step 1: ProcessorRazdel reads 'text' and fills 'tokens' and 'sentences'.
razdel_step = (
    ProcessorRazdel(),
    ['text'],
    {'tokens': 'tokens', 'sentences': 'sentences'},
)

# Step 2: the remote processor reads tokens and sentences and fills the
# morphosyntactic layers plus entities.
remote_outputs = {
    'lemma': 'lemma',
    'postag': 'postag',
    'morph': 'morph',
    'syntax_dep_tree': 'syntax_dep_tree',
    'entities': 'entities',
}
remote_step = (
    ProcessorRemote(host_spacy, port_spacy, '0'),
    ['tokens', 'sentences'],
    remote_outputs,
)

ppl = PipelineCommon([razdel_step, remote_step])

In [None]:
import glob
import os
import pickle
from tqdm.autonotebook import tqdm

# Run the pipeline over every collected text and pickle the annotation next to
# it as <name>.annot.pkl.
for file in tqdm(glob.glob(os.path.join('data_ru', 'data', '*.txt'))):
    with open(file, 'r') as f:
        text = f.read()
    # Swap only the final extension: str.replace('.txt', ...) would rewrite every
    # occurrence of '.txt' anywhere in the path.
    filename = os.path.splitext(file)[0] + '.annot.pkl'
    annot = ppl(text)
    with open(filename, 'wb') as f:
        pickle.dump(annot, f)

In [None]:
%%sh
# Number of annotation pickles, then number of EDU files; the two counts are
# expected to match (one of each per document).
ls -laht data_ru/data/*.annot.pkl | wc -l
ls -laht data_ru/data/*.edus | wc -l

#### (Optional) Look at the sentence integrity in the corpus 

In [None]:
from tqdm import tqdm


def get_dus(tree):
    """Return the texts of all discourse units in ``tree``: the node itself first,
    then its left subtree, then its right subtree."""
    result = [tree.text]
    if tree.left:
        result += get_dus(tree.left)
        result += get_dus(tree.right)
    return result


def get_sentences_and_dus(filename):
    """Collect whitespace-free sentence and discourse-unit texts for one document.

    Args:
        filename: path to a ``<docname>.annot.pkl`` annotation pickle.

    Returns:
        A ``(sentences, dus_chr)`` pair. ``sentences`` holds, per annotated
        sentence, its token texts concatenated without separators. ``dus_chr``
        holds the text of every discourse unit from all
        ``data_ru/corpus_du/<docname>_part_<N>.du`` trees, in increasing N, with
        all whitespace removed.
    """
    with open(filename, 'rb') as f:
        annot = pickle.load(f)
    docname = os.path.basename(filename).replace('.annot.pkl', '')

    # Discover the parts on disk instead of probing indices 0..99, so documents
    # with 100 or more trees are not silently truncated.
    part_name = re.compile(re.escape(docname) + r'_part_(\d+)\.du')
    candidates = glob.glob(os.path.join('data_ru', 'corpus_du',
                                        glob.escape(docname) + '_part_*.du'))
    parts = []
    for path in candidates:
        match = part_name.fullmatch(os.path.basename(path))
        if match:
            parts.append((int(match.group(1)), path))

    # Collect discourse units as texts, trees in numeric part order
    dus = []
    for _index, path in sorted(parts):
        with open(path, 'rb') as f:
            dus += get_dus(pickle.load(f))

    dus_chr = [''.join(text.split()) for text in dus]

    # Collect sentences as texts
    sentences = [''.join([token.text for token in annot['tokens'][sent.begin:sent.end]])
                 for sent in annot['sentences']]

    return sentences, dus_chr

In [None]:
# Gather the whitespace-free sentences and discourse units of the whole corpus.
sentences, dus = [], []
for filename in tqdm(glob.glob('data_ru/data/*.annot.pkl')):
    snt, chrdus = get_sentences_and_dus(filename)
    sentences += snt
    dus += chrdus

# Count the sentences that coincide exactly with some discourse unit. A set makes
# each membership test constant-time instead of a scan over the whole list.
dus_lookup = set(dus)
results = sum(sentence in dus_lookup for sentence in tqdm(sentences))

In [None]:
# Share of sentences whose whitespace-free text equals some discourse unit.
# Raises ZeroDivisionError if no sentences were collected.
results / len(sentences)

## Gold trees
### Extract features 
output:
 - ``models/tf_idf/pipeline.pkl``  - Is used in default feature extraction
 - ``data_ru/data/file.gold.pkl``  - Dataset with extracted default features for gold trees

#### 1. Load sentiment models, install dependencies

In [None]:
# %%bash

# source /notebook/py39/bin/activate
# # python -c "import nltk; nltk.download('stopwords')"
# # pip install dostoevsky
# # dostoevsky download fasttext-social-network-model
# pip install textblob tensorflow tensorflow_hub tensorflow_text

#### 2. Feature extraction

In [None]:
import sys
# Extend the import path with parent directories so `features_processors`
# can be imported from this notebook's location.
sys.path.append('../')
sys.path.append('../../')
# NOTE(review): the same path is appended twice — harmless, but possibly
# '../../../' was intended; confirm.
sys.path.append('../../')
from features_processors import FeaturesProcessor

# Feature extractor for Russian with the `use_use` and `use_sentiment` options on.
# NOTE(review): `use_use` presumably enables Universal Sentence Encoder features — confirm.
features_processor = FeaturesProcessor(language='ru', verbose=0, use_use=True, use_sentiment=True)

In [None]:
import glob
import pandas as pd
import pickle
import os
from tqdm.autonotebook import tqdm

# Load the relation pairs saved earlier in this notebook.
table = pd.read_feather(os.path.join('data_ru', 'all_pairs.fth'))

In [None]:
# Extract features document by document and pickle each result as
# data_ru/data/<filename>.gold.pkl.
for filename, df in tqdm(table.groupby('filename')):
    annot_path = os.path.join('data_ru', 'data', filename + '.annot.pkl')
    # Context manager so the annotation file is closed as soon as it is read.
    with open(annot_path, 'rb') as f:
        annot = pickle.load(f)
    features = features_processor(df,
                                  annot['text'], annot['tokens'],
                                  annot['sentences'], annot['lemma'],
                                  annot['morph'], annot['postag'],
                                  annot['syntax_dep_tree'],)
    # Drop the leftover index column; a missing 'level_0' still raises KeyError.
    # NOTE(review): presumably created by a reset_index() inside the processor — confirm.
    del features['level_0']
    features.to_pickle(os.path.join('data_ru', 'data', filename + '.gold.pkl'))