## RS3 parsing 
output:
 - file.edus  # text file with the EDUs (elementary discourse units) extracted from the .rs3 file; each line contains one EDU
 - file.json  # JSON file with discourse-unit (DU) pairs from the gold trees; keys: ['snippet_x', 'snippet_y', 'category_id']

In [None]:
! mkdir data

In [None]:
# Run utils/parse_rs3.py over the .rs3 files of the news, computer-science and
# linguistics corpora; the stdout of each run is redirected to its own log file.
! python utils/parse_rs3.py corpus/news_texts/news_rs3/* > rst_news_parsing.log
! python utils/parse_rs3.py corpus/science_texts/compscience/compscience_rs3/* > rst_comp_parsing.log
! python utils/parse_rs3.py corpus/science_texts/linguistics/linguistics_rs3/* > rst_ling_parsing.log

In [None]:
# Same conversion for every section of corpus/RuRsTreebank_full (blogs, news1,
# news2, sci_comp, sci_ling); one log file per section.
! python utils/parse_rs3.py corpus/RuRsTreebank_full/blogs/blogs_rs3/* > rst_blogs_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/news1/news1_rs3/* > rst_news1_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/news2/news2_rs3/* > rst_news2_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/sci_comp/sci_comp_rs3/* > rst_scicomp_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/sci_ling/sci_ling_rs3/* > rst_sciling_parsing.log

In [None]:
# Total line count over all data/*.edus files (one EDU per line, so this is
# the number of EDUs).
! wc -l data/*.edus | grep 'total'

In [None]:
import pandas as pd

# Ordered cleanup table: pattern -> replacement, applied in insertion order by
# the read_* helpers below.
# NOTE: the same keys are used in two different ways. read_gold / read_json
# hand them to pandas with regex=True (so r'\n' is a newline and r'^' is the
# start-of-string anchor), while read_edus / read_annotation pass them to
# str.replace as literal text (so r'\n' is a backslash followed by 'n' and
# r'^' is a caret).
text_html_map = {
    r'\n': r' ',
    # HTML entities -> plain characters
    r'&gt;': r'>',
    r'&lt;': r'<',
    r'&amp;': r'&',
    r'&quot;': r'"',
    r'&ndash;': r'–',
    r'##### ': r'',
    r'\\\\\\\\': r'\\',
    r'  ': r' ',
    # dash variants -> ASCII hyphen
    r'——': r'-',
    r'—': r'-',
    r'/': r'',
    r'\^': r'',
    r'^': r'',
    r'±': r'+',
    # look-alike letter substitution (presumably Latin -> Cyrillic; verify the
    # code points before editing these two entries)
    r'y': r'у',
    r'x': r'х'
}

def read_edus(filename):
    """Read ``<filename>.edus`` and return its cleaned lines.

    Each line is stripped of surrounding whitespace and then every entry of
    ``text_html_map`` is applied, in order, as a literal ``str.replace``.

    Args:
        filename: Path without the ``.edus`` suffix.

    Returns:
        A list with one cleaned string per line of the file.
    """
    with open(filename + '.edus', 'r') as source:
        raw_lines = source.readlines()

    cleaned = []
    for raw in raw_lines:
        unit = raw.strip()
        for pattern in text_html_map:
            unit = unit.replace(pattern, text_html_map[pattern])
        cleaned.append(unit)
    return cleaned

def read_gold(filename):
    """Load ``<filename>.gold.pkl`` and clean both snippet columns.

    Every entry of ``text_html_map`` is applied, in order, as a regular
    expression replacement to the ``snippet_x`` and ``snippet_y`` columns.

    Args:
        filename: Path without the ``.gold.pkl`` suffix.

    Returns:
        The unpickled table with cleaned ``snippet_x`` / ``snippet_y``.
    """
    df = pd.read_pickle(filename + '.gold.pkl')
    for pattern, replacement in text_html_map.items():
        for column in ('snippet_x', 'snippet_y'):
            # Assign the result back instead of calling replace(inplace=True)
            # on df[column]: that is a chained in-place operation, which does
            # not modify df when pandas copy-on-write is active.
            df[column] = df[column].replace(pattern, replacement, regex=True)

    return df

def read_json(filename):
    """Load ``<filename>.json`` and clean both snippet columns.

    Every entry of ``text_html_map`` is applied, in order, as a regular
    expression replacement to the ``snippet_x`` and ``snippet_y`` columns.

    Args:
        filename: Path without the ``.json`` suffix.

    Returns:
        The DataFrame read from the JSON file with cleaned ``snippet_x`` /
        ``snippet_y``.
    """
    df = pd.read_json(filename + '.json')
    for pattern, replacement in text_html_map.items():
        for column in ('snippet_x', 'snippet_y'):
            # Assign the result back instead of calling replace(inplace=True)
            # on df[column]: that is a chained in-place operation, which does
            # not modify df when pandas copy-on-write is active.
            df[column] = df[column].replace(pattern, replacement, regex=True)

    return df

def read_annotation(filename):
    """Load ``<filename>.annot.pkl`` and clean its text and token strings.

    Every entry of ``text_html_map`` is applied, in order, as a literal
    ``str.replace`` to ``annot['text']`` and to the ``text`` attribute of each
    item in ``annot['tokens']`` (the tokens are modified in place).

    Args:
        filename: Path without the ``.annot.pkl`` suffix.

    Returns:
        The unpickled annotation with the cleaned strings.
    """
    annot = pd.read_pickle(filename + '.annot.pkl')
    # NOTE(review): only the strings are rewritten here; any offsets stored in
    # the annotation are left as they were -- confirm that this is intended.
    for pattern, substitute in text_html_map.items():
        annot['text'] = annot['text'].replace(pattern, substitute)
        for tok in annot['tokens']:
            tok.text = tok.text.replace(pattern, substitute)

    return annot

## Annotate the texts with isanlp 
output:
 - file.annot.pkl  # morphology, syntax, semantics to use with isanlp

In [None]:
%%bash

# isanlp (dev branch) and the isanlp_srl_framebank package, installed directly from GitHub.
pip install git+https://github.com/IINemo/isanlp.git@dev
pip install git+https://github.com/tchewik/isanlp_srl_framebank.git

In [None]:
# Addresses of the remote isanlp services used by the pipeline below:
# `host` serves ports 4333 and 5336, `host3` serves port 4336 (SRL).
# Left empty in this notebook -- presumably to be filled in before running.
host = host3 = ''

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# Annotation pipeline. Each stage is a tuple of
#   (processor, [names of the inputs it consumes], {processor output: key in the result}).
# (Mapping direction per the isanlp PipelineCommon convention -- confirm.)
#   1. remote processor on host:4333  -- from 'text': sentences, tokens,
#      'mystem_postags' and lemma;
#   2. ConverterMystemToUd            -- from 'mystem_postags': 'morph' and 'postag';
#   3. remote processor on host:5336  -- from tokens and sentences:
#      'syntax_dep_tree' and a second tag set stored as 'ud_postag';
#   4. remote processor on host3:4336 -- from tokens, postag, morph, lemma and
#      the dependency tree: 'srl'.
ppl = PipelineCommon([(ProcessorRemote(host, 4333, 'default'),
                       ['text'],
                       {'sentences' : 'sentences', 
                        'tokens' : 'tokens',
                        'postag' : 'mystem_postags',
                        'lemma' : 'lemma'}),
                      (ConverterMystemToUd(), 
                        ['mystem_postags'],
                        {'morph' : 'morph',
                         'postag': 'postag'}),
                      (ProcessorRemote(host, 5336, '0'), 
                        ['tokens', 'sentences'], 
                        {'syntax_dep_tree' : 'syntax_dep_tree',
                         'postag' : 'ud_postag'}),
                      (ProcessorRemote(host3, 4336, 'default'),
                        ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
                        {'srl' : 'srl'})])

In [None]:
import glob
import pickle
import os
import sys
from tqdm import tqdm_notebook as tqdm

# Plain-text directories of the initial corpus.
# NOTE: this list is immediately overridden by the assignment below, so only
# the RuRsTreebank_full directories are actually annotated. Remove or comment
# out the second assignment to process these instead.
directories = ['corpus/news_texts/news_txt/',
               'corpus/science_texts/compscience/compscience_txt/',
               'corpus/science_texts/linguistics/linguistics_txt/']

# Plain-text directories of the full treebank (the list that is in effect).
directories = ['corpus/RuRsTreebank_full/sci_comp/sci_comp_txt/',
               'corpus/RuRsTreebank_full/sci_ling/sci_ling_txt/',
               'corpus/RuRsTreebank_full/blogs/blogs_txt/',
               'corpus/RuRsTreebank_full/news1/news1_txt/',
               'corpus/RuRsTreebank_full/news2/news2_txt/']

def prepare_text(text):
    """Flatten a raw corpus text into one cleaned line for annotation.

    Line ends preceded by one or two spaces are marked, a final marker is
    appended, and the markers become newlines. After that the replacement
    table below is applied in order with literal ``str.replace`` calls; its
    first entry turns every newline into a space, so the result contains no
    newlines and always ends with whitespace.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned text.
    """
    text = text.replace('  \n', '#####')
    text = text.replace(' \n', '#####')
    text = text + '#####'
    text = text.replace('#####', '\n')

    # Order matters: longer patterns ('   ', '——', 'xc') come before their
    # shorter prefixes. Named differently from the module-level text_html_map
    # so that it no longer shadows it.
    replacements = (
        ('\n', r' '),
        ('&gt;', r'>'),
        ('&lt;', r'<'),
        ('&amp;', r'&'),
        ('&quot;', r'"'),
        ('&ndash;', r'–'),
        ('##### ', r''),
        ('\\\\\\\\', r'\\'),
        ('   ', r' '),
        ('  ', r' '),
        ('——', r'-'),
        ('—', r'-'),
        ('/', r''),
        # Backslash followed by a caret. Spelled '\\^' because the former
        # '\^' is an invalid escape sequence (SyntaxWarning on Python 3.12+);
        # the matched text is unchanged.
        ('\\^', r''),
        ('^', r''),
        ('±', r'+'),
        ('y', r'у'),
        ('xc', r'хс'),
        ('x', r'х'),
    )
    for pattern, replacement in replacements:
        text = text.replace(pattern, replacement)
    return text

# Annotate every .txt file of the selected directories with the isanlp
# pipeline and pickle the result as data/<name>.annot.pkl.
for path in directories:
    print('>>', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        # Context managers close both files deterministically; the previous
        # bare open() calls left the handles to the garbage collector.
        with open(file, 'r') as source:
            text = prepare_text(source.read())
        annot = ppl(text)
        filename = os.path.basename(file).replace('.txt', '.annot.pkl')
        with open(os.path.join('data', filename), 'wb') as sink:
            pickle.dump(annot, sink)


## Gold trees
### Extract features 
output:
 - models/tf_idf/pipeline.pkl  # is used in default feature extraction
 - file.gold.pkl  # dataset with extracted default features for gold trees

In [None]:
annot.keys()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import glob
import pickle
import numpy as np
import pandas as pd
import nltk

IN_PATH = 'data/'
! mkdir models
! mkdir models/tf_idf

# Fitting corpus: the `snippet_x` text of every DU pair in data/*.json,
# re-joined with single spaces after NLTK's casual tokenizer.
# (An earlier, disabled variant built the corpus from the sentence tokens of
# the matching .annot.pkl files instead.)
corpus = []
for file in glob.glob(f'{IN_PATH}*.json'):
    table = pd.read_json(file)
    for snippet in table['snippet_x'].values:
        tokens = nltk.tokenize.casual_tokenize(snippet)
        corpus.append(' '.join(tokens))

from sklearn.feature_extraction.text import CountVectorizer
# Uni- and bigram counts. NOTE: despite the `tf_idf` directory name, the
# fitted pipeline uses raw counts; the TF-IDF variant below is disabled.
count_vect = CountVectorizer(ngram_range=(1, 2))

#tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=False, norm='l2', analyzer='word',
#                                    ngram_range=(2, 3), use_idf=1, smooth_idf=1)

# Reduce the n-gram counts to 25 components; fixed seed for reproducibility.
svd = TruncatedSVD(n_components=25,
                   tol=0.0,
                   n_iter=7,
                   random_state=42)

pipeline = Pipeline([
    ('vect', count_vect),
    ('svd', svd)
])

pipeline.fit(corpus)
# Context manager closes (and flushes) the file; the previous bare open()
# never closed the handle.
with open('models/tf_idf/pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [None]:
%%bash

# NLTK stop-word lists, plus the dostoevsky package and its
# fasttext-social-network model (presumably required by the feature
# extraction below -- verify).
python -c "import nltk; nltk.download('stopwords')"
pip install dostoevsky
dostoevsky download fasttext-social-network-model

In [None]:
from utils.features_processor_default import FeaturesProcessor

# Feature extractor from utils/features_processor_default.py, pointed at the
# `models` directory (where models/tf_idf/pipeline.pkl is saved).
features_processor = FeaturesProcessor(model_dir_path='models', verbose=False)

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm

IN_PATH = 'data/'
# For every data/<name>.json: extract the default features for its DU pairs
# from the matching <name>.annot.pkl and save them as <name>.gold.pkl.
for file in tqdm(glob.glob("%s*.json" % IN_PATH)):
    table = pd.read_json(file)
    # Keep only the pairs whose left snippet is non-empty.
    table = table[table.snippet_x.map(len) > 0]
    # Context manager closes the pickle file; the previous bare open() did not.
    with open(file.replace('.json', '.annot.pkl'), 'rb') as annot_file:
        annot = pickle.load(annot_file)
    features = features_processor(table,
                                  annot['text'], annot['tokens'],
                                  annot['sentences'], annot['lemma'],
                                  annot['morph'], annot['postag'],
                                  annot['syntax_dep_tree'])
    features.to_pickle(file.replace('.json', '.gold.pkl'))

In [None]:
features.head()

In [None]:
features.sm_x_positive.describe()

In [None]:
vars(annot['srl'][0][0])

In [None]:
annot['ud_postag']

Note: UDPipe assigns different POS tags (stored under `ud_postag`) than the ones converted from Mystem (`postag`).

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm

IN_PATH = 'data/'
# For every annotated document, record the first two entries of each
# `ud_postag` item joined with '_' (presumably one list of tags per
# sentence -- verify).
avail_pairs = []
for file in tqdm(glob.glob("%s*.json" % IN_PATH)):
    # Context manager closes the pickle file; the previous bare open() did not.
    with open(file.replace('.json', '.annot.pkl'), 'rb') as annot_file:
        annot = pickle.load(annot_file)
    for sent in annot['ud_postag']:
        avail_pairs.append('_'.join(sent[:2]))

In [None]:
set(avail_pairs)

Analyze the semantic roles in the SRL annotation (`annot['srl']`).

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm

IN_PATH = 'data/'
avail_pairs = []
# Print a sample of the SRL annotation of every annotated document.
for file in tqdm(glob.glob("%s*.json" % IN_PATH)):
    # Context manager closes the pickle file; the previous bare open() did not.
    with open(file.replace('.json', '.annot.pkl'), 'rb') as annot_file:
        annot = pickle.load(annot_file)
    for sent in annot['srl']:
        for pred in sent:
            for event in pred:
                # Only the first item of each `pred` is shown: the break
                # leaves this innermost loop right after printing it.
                print(event)
                break