In [None]:
# Reload imported modules automatically before each cell is executed,
# so edits to the project's helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# ! pip install -U git+https://github.com/tchewik/dis2du.git

In [None]:
import os, sys
import re
import pickle
import glob
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
import xml

from dis2du.read_dis import read_dis
from dis2du.tree import RSTTree
from isanlp.annotation_rst import DiscourseUnit

from tqdm import tqdm
tqdm.pandas()

## Read RS3 files into isanlp.DiscourseUnit annotations
input:
 - corpus with .rs3 files
output:
 - ``corpus_du/filename_part_N.du``  - pickled isanlp DiscourseUnit for tree number N of the original *.rs3 file

In [None]:
# %%bash

# cd corpus/
# unzip RuRSTreebank_7.zip

#### 1. Split dataset files into separated trees

In [None]:
from utils.dataset.rs3_forest_splitter import RS3ForestSplitter

# Split every corpus *.rs3 file into separate single-tree files stored in output_dir.
splitter = RS3ForestSplitter()

output_dir = 'corpus_rs3'
# exist_ok=True makes the cell safely re-runnable without a separate isdir() check.
os.makedirs(output_dir, exist_ok=True)

# The three corpus parts are processed identically, in the same order as before.
for subcorpus in ('news1_rs3', 'news2_rs3', 'blogs_rs3'):
    for filename in tqdm(glob.glob(f'corpus/RuRSTreebank_7/{subcorpus}/*.rs3'), desc=subcorpus):
        splitter(filename, output_dir)

In [None]:
! ls -laht $output_dir/*.rs3 | wc -l  # Overall number of trees in news+blogs

#### 2. Convert them all to *.dis files

Using https://github.com/rst-workbench/rst-converter-service

In [None]:
import shutil

from utils.dataset.rst2dis_converter import split_seq, RST2DISConverter

BASE_URL = 'localhost:5000'  # <- put rst converter address here
THREADS = 10
OUTPUT_DIR = 'corpus_dis'

# Always start from an empty output directory. shutil.rmtree keeps this step
# pure Python instead of depending on a shell `rm`.
if os.path.isdir(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.mkdir(OUTPUT_DIR)

# (!) The Jupyter kernel gives no indication of the progress of the converter workers;
# keep watching the docker container if necessary.
# NOTE(review): the workers are started but never waited for in this cell, so it
# presumably returns before the conversion is finished — confirm that all *.dis
# files exist before running the following cells.
files = glob.glob('corpus_rs3/*.rs3')
for batch in split_seq(files, THREADS):
    t = RST2DISConverter(BASE_URL, batch, output_dir=OUTPUT_DIR)
    t.start()

Check overall number of trees

In [None]:
! ls -lath corpus_dis/*.dis | wc -l

Replace the ``#####`` marker with another marker (``_NEW_LINE_``), because the .dis file reader would otherwise omit it

In [None]:
# Rewrite every converted tree in place, swapping the '##### ' marker
# for the '_NEW_LINE_' placeholder.
dis_paths = glob.glob(os.path.join('corpus_dis', '*.dis'))
for dis_path in dis_paths:
    with open(dis_path, 'r') as src:
        original_txt = src.read()
    patched_txt = original_txt.replace('##### ', '_NEW_LINE_')
    with open(dis_path, 'w') as dst:
        dst.write(patched_txt)

#### 3. Collect DiscourseUnit annotations for isanlp library

output:
 - ``corpus_du/file.du`` - DiscourseUnit tree annotation
 - ``data_ru/file.txt`` - Original text collected directly from the annotation
 - ``data_ru/file.edus``  - Text file with EDUs from .rs3 - each line contains one EDU
 - ``data_ru/all_pairs.fth`` - All the relation pairs from the corpus

In [None]:
! rm -r corpus_du

In [None]:
# NOTE(review): the star import may shadow `read_dis` imported from dis2du at the top
# of the notebook — confirm which implementation is meant to be used here.
from utils.dataset.dis_file_reading import *

input_dir = 'corpus_dis'
output_dir = 'corpus_du'
os.makedirs(output_dir, exist_ok=True)

# Paths of the *.dis files that could not be parsed or pickled.
failed = []
for file in tqdm(glob.glob(os.path.join(input_dir, '*.dis'))):

    try:
        tree = read_dis(file)
        # basename/splitext work with any path separator and swap only the
        # trailing extension, not every '.dis' occurrence in the name.
        output_file = os.path.splitext(os.path.basename(file))[0] + '.du'
        with open(os.path.join(output_dir, output_file), 'wb') as f:
            pickle.dump(tree, f)
    except Exception as e:
        # Best effort: keep converting the other trees, but report which file failed.
        print(f'{file}: {e}')
        failed.append(file)

In [None]:
sorted(failed)  # Bugs in the annotation, number corresponds to the tree number

Collect text files and edus.

In [None]:
! rm -r data_ru && mkdir data_ru

In [None]:
def extr_edus(tree):
    """Return the texts of the elementary units under ``tree``, left to right.

    A node whose ``relation`` is ``'elementary'`` contributes its own ``text``;
    any other node contributes the units of its ``left`` subtree followed by
    those of its ``right`` subtree.
    """
    if tree.relation != 'elementary':
        return extr_edus(tree.left) + extr_edus(tree.right)
    return [tree.text]

all_pairs = []
LINES_DELIM = '_NEW_LINE_'

# Make sure the output directory exists even when this cell is run on its own.
os.makedirs('data_ru/', exist_ok=True)

# Every document is identified by its first tree (*part_0.du); the remaining
# trees of the same document are collected in ascending part order.
for orig_filename in glob.glob(os.path.join('corpus_du/', '*part_0.du')):
    all_edus = []
    # Sort numerically by the last number in the path (the part index), so that
    # part 10 follows part 9 instead of part 1. The raw string avoids the
    # invalid-escape warning that "\d" triggers in a normal string literal.
    part_files = sorted(glob.glob(orig_filename.replace('_0.du', '_*.du')),
                        key=lambda x: int(re.findall(r"\d+", x)[-1]))
    for du_filename in part_files:
        # NOTE: pickle must only be used on files produced by this notebook.
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        all_edus += extr_edus(tree)

    # basename works regardless of the path separator used by glob.
    filename = os.path.basename(orig_filename).replace('_part_0.du', '')

    # Write EDUs, one per line, without the line-break placeholder
    with open(os.path.join('data_ru/', filename + '.edus'), 'w') as f:
        f.write('\n'.join([edu.strip().replace(LINES_DELIM, '') for edu in all_edus]))
        f.write('\n')

    # Write the text: EDUs joined by spaces, placeholders turned back into line breaks
    text = ' '.join(all_edus).replace(LINES_DELIM, '\n')
    with open(os.path.join('data_ru/', filename + '.txt'), 'w') as f:
        f.write(text.strip())

Align trees with the original texts (collect ``start`` and ``end`` for each node).

In [None]:
import os
import pandas as pd


def extr_pairs(tree, filename):
    """Collect one row per non-terminal node of ``tree``, in pre-order.

    Each row is ``[left text, right text, left start, right start,
    relation, nuclearity, filename]``. A node without a left child
    yields no rows.
    """
    if not tree.left:
        return []
    left, right = tree.left, tree.right
    row = [left.text, right.text,
           left.start, right.start,
           tree.relation, tree.nuclearity, filename]
    return [row] + extr_pairs(left, filename) + extr_pairs(right, filename)

def align_du2text(tree, text, begin=0):
    """Locate every node of ``tree`` in ``text`` and store its character offsets.

    The node's ``text`` is normalised first (LINES_DELIM placeholders become
    line breaks, surrounding whitespace is stripped). ``start`` is the first
    occurrence of that text at or after ``begin`` (-1 when it is not found)
    and ``end`` is ``start + len(text)``. Children of non-elementary nodes are
    aligned recursively.

    Args:
        tree: node with ``text``, ``relation`` and, unless elementary,
            ``left`` / ``right`` children; modified in place.
        text: the document the tree was extracted from.
        begin: offset from which the search for this node starts.

    Returns:
        The same ``tree`` object with ``start`` / ``end`` set on every node.
    """
    tree.text = tree.text.replace(LINES_DELIM, '\n').strip()
    tree.start = text.find(tree.text, begin)
    tree.end = tree.start + len(tree.text)
    if tree.relation != 'elementary':
        # A child lies inside its parent's span, so the search starts at the parent's
        # start rather than at the document start; otherwise a unit whose text also
        # occurs earlier in the document is aligned to that earlier occurrence.
        # If the parent itself was not found (-1), fall back to the incoming offset:
        # a negative start would be interpreted by str.find as relative to the end.
        child_begin = tree.start if tree.start != -1 else begin
        tree.left = align_du2text(tree.left, text, child_begin)
        # The right child follows the left one; if the left one was not found its
        # `end` is meaningless, so reuse the same starting offset instead.
        right_begin = tree.left.end if tree.left.start != -1 else child_begin
        tree.right = align_du2text(tree.right, text, right_begin)
    return tree

In [None]:
all_pairs = []

In [None]:
# Start from an empty list so that re-running this cell does not append the same pairs twice.
all_pairs = []

for file in tqdm(glob.glob('data_ru/*.txt')):
    with open(file, 'r') as f:
        text = f.read()
    # Document name = file name without directory and without the trailing '.txt'.
    filename = os.path.basename(file)[:-len('.txt')]

    for du_filename in sorted(glob.glob(os.path.join('corpus_du/', filename + '_part_*'))):
        # NOTE: pickle must only be used on files produced by this notebook.
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        tree = align_du2text(tree, text)
        all_pairs += extr_pairs(tree, filename=filename)
        # Store the aligned tree (now carrying start/end offsets) over the original file.
        with open(du_filename, 'wb') as f:
            pickle.dump(tree, f)

In [None]:
# One row per extracted pair: snippet_x / snippet_y are the left / right unit texts,
# loc_x / loc_y their start offsets in the document, category_id the relation name,
# order the nuclearity, filename the source document.
pairs = pd.DataFrame(all_pairs,
                     columns=['snippet_x', 'snippet_y', 'loc_x', 'loc_y', 'category_id', 'order', 'filename'])
# Displays the number of distinct relation names.
pairs.category_id.unique().shape

In [None]:
# Remove rows that are identical in every column; the last line displays the remaining shape.
pairs = pairs.drop_duplicates()
pairs.shape

In [None]:
# Sanity checks: every shape printed below should report zero rows.

print(pairs[pairs.loc_x == -1].shape, pairs[pairs.loc_y == -1].shape)  # Units that were not found in the source text
print(pairs[pairs.loc_x > pairs.loc_y].shape)  # Left unit located after the right one, i.e. a wrong match
print(pairs[pairs.category_id.isna()].shape, pairs[pairs.category_id == 'span'].shape)  # Wrongly parsed relation names

In [None]:
# Combined "<relation>_<nuclearity>" label for every pair.
# NOTE(review): `labels` is not referenced again in the visible cells — confirm it is still needed.
labels = pairs.category_id + '_' + pairs.order

In [None]:
from utils.dataset.rename_relations import rename_relations

# Project helper; the cell that follows reads a `relation` column from its result.
# NOTE(review): presumably maps the raw category_id / order values to the final
# relation labels — confirm against utils/dataset/rename_relations.
pairs = rename_relations(pairs)

In [None]:
len(pairs.relation.unique())

In [None]:
# reset_index() restores a default RangeIndex before writing to feather (rows were
# dropped above); the previous index is kept as an additional column.
pairs.reset_index().to_feather('data_ru/all_pairs.fth')

## Annotate the texts with isanlp 
output:
 - ``file.annot.pkl``  - Morphosyntactic annotation in isanlp format

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.processor_razdel import ProcessorRazdel


# Address of the remote processor used as the second pipeline step below.
host_udpipe = ''  # <- set the hostname
port_udpipe = '3134'  # <- and the port


# Annotation pipeline. Each entry looks like
# (processor, names of the input annotations, {processor output: name in the result}).
ppl = PipelineCommon([
    # Raw text -> tokens and sentences.
    (ProcessorRazdel(), ['text'],
    {'tokens': 'tokens',
     'sentences': 'sentences'}),
    # Remote processor: lemmas and dependency trees; its POS tags are stored as 'ud_postag'.
    (ProcessorRemote(host_udpipe, port_udpipe, '0'),
     ['tokens', 'sentences'],
     {'lemma': 'lemma',
      'syntax_dep_tree': 'syntax_dep_tree',
      'postag': 'ud_postag'}),
    # Mystem POS tags, stored as 'postag' ...
    (ProcessorMystem(delay_init=False),
     ['tokens', 'sentences'],
     {'postag': 'postag'}),
    # ... which the converter turns into 'morph' and a replacement 'postag'.
    (ConverterMystemToUd(),
     ['postag'],
     {'morph': 'morph',
      'postag': 'postag'}),
])

In [None]:
import glob
import os
import pickle
from tqdm.autonotebook import tqdm

# Run the annotation pipeline on every collected text and pickle the result
# next to the text as <name>.annot.pkl.
for file in tqdm(glob.glob('data_ru/*.txt')):
    with open(file, 'r') as f:
        text = f.read()
    # Swap only the trailing extension, not every '.txt' occurrence in the path.
    filename = os.path.splitext(file)[0] + '.annot.pkl'
    annot = ppl(text)
    with open(filename, 'wb') as f:
        pickle.dump(annot, f)

In [None]:
%%sh
ls -laht data_ru/*.pkl | wc -l
ls -laht data_ru/*.edus | wc -l

## Gold trees
### Extract features 
output:
 - ``models/tf_idf/pipeline.pkl``  - Is used in default feature extraction
 - ``data_ru/file.gold.pkl``  - Dataset with extracted default features for gold trees

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from utils.count_vectorizer import MyCountVectorizer

input_dir = 'data_ru/'

# Creates ../../models as well when it is missing; no error if both already exist.
os.makedirs('../../models/tf_idf', exist_ok=True)

# One document per annotated text: the list of its lower-cased token texts.
corpus = []
for file in glob.glob(os.path.join(input_dir, "*.annot.pkl")):
    # NOTE: pickle must only be used on files produced by this notebook.
    with open(file, 'rb') as f:
        tokens = pickle.load(f)['tokens']
    corpus.append([token.text.lower() for token in tokens])

# The documents are already tokenized, so the project's dummy callables are passed
# as tokenizer and preprocessor.
count_vect = MyCountVectorizer(ngram_range=(1, 2), tokenizer=MyCountVectorizer.dummy,
                               preprocessor=MyCountVectorizer.dummy)

# Fixed random_state keeps the fitted decomposition reproducible.
svd = TruncatedSVD(n_components=25,
                   tol=0.0,
                   n_iter=7,
                   random_state=42)

pipeline = Pipeline([
    ('vect', count_vect),
    ('svd', svd)
])

pipeline.fit(corpus)
with open('../../models/tf_idf/pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [None]:
# %%bash

# python -c "import nltk; nltk.download('stopwords')"
# pip install dostoevsky
# dostoevsky download fasttext-social-network-model

In [None]:
import sys

# Make the parent directories importable (presumably where features_processor_default
# lives — confirm). Each path is added once, so re-running the cell does not keep
# growing sys.path.
for parent_dir in ('../', '../../'):
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)

from features_processor_default import FeaturesProcessor

features_processor = FeaturesProcessor(model_dir_path='../../models', verbose=0)

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm

# Load the saved pairs and keep only the rows where both snippets are non-empty.
table = pd.read_feather('data_ru/all_pairs.fth')
has_left_text = table.snippet_x.map(len) > 0
has_right_text = table.snippet_y.map(len) > 0
table = table[has_left_text & has_right_text]

In [None]:
# Extract features for the gold pairs of every document, using that document's
# annotation, and store them as data_ru/<name>.gold.pkl.
for filename, df in tqdm(table.groupby('filename')):
    # NOTE: pickle must only be used on files produced by this notebook.
    with open(os.path.join('data_ru', filename + '.annot.pkl'), 'rb') as f:
        annot = pickle.load(f)
    features = features_processor(df,
                                  annot['text'], annot['tokens'],
                                  annot['sentences'], annot['lemma'],
                                  annot['morph'], annot['ud_postag'],
                                  annot['syntax_dep_tree'])
    features.to_pickle(os.path.join('data_ru', filename + '.gold.pkl'))

In [None]:
# Rewrite every gold feature file in place: drop the 'level_0' column
# when it is present and apply rename_relations.
gold_files = glob.glob('data_ru/*.gold.pkl')
for pklfile in tqdm(gold_files):
    features = pd.read_pickle(pklfile)
    if 'level_0' in features.columns:
        features = features.drop('level_0', axis=1)
    features = rename_relations(features)
    features.to_pickle(pklfile)