In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's utils package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

## RS3 parsing 
output:
 - file.edus  # text file with the EDUs extracted from the .rs3 file, one EDU per line
 - file.json  # JSON file with discourse-unit (DU) pairs from the gold trees; keys: ['snippet_x', 'snippet_y', 'category_id']

In [None]:
! mkdir data

In [None]:
! python utils/parse_rs3.py corpus/RuRsTreebank_full/blogs/blogs_rs3/* > rst_blogs_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/news1/news1_rs3/* > rst_news1_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/news2/news2_rs3/* > rst_news2_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/sci_comp/sci_comp_rs3/* > rst_scicomp_parsing.log
! python utils/parse_rs3.py corpus/RuRsTreebank_full/sci_ling/sci_ling_rs3/* > rst_sciling_parsing.log

In [None]:
# Print the log that the sci_ling parsing run redirected its stdout to.
! cat rst_sciling_parsing.log

In [None]:
# Total line count over all data/*.edus files (only wc's 'total' row is kept).
# The notes above state that every line of an .edus file holds one EDU.
! wc -l data/*.edus | grep 'total'

## Annotate the texts with isanlp 
output:
 - file.annot.pkl  # morphology, syntax, semantics to use with isanlp

In [None]:
%%bash

# isanlp, installed straight from GitHub at the `discourse` ref.
pip install git+https://github.com/IINemo/isanlp.git@discourse
# isanlp_srl_framebank from the tchewik repository; no revision is pinned, so the default branch is used.
pip install git+https://github.com/tchewik/isanlp_srl_framebank.git

In [None]:
# Host names of the remote isanlp services. Both are blank here and presumably
# have to be filled in before the annotation cells are run.
#   host  - passed to the ProcessorRemote steps of the pipeline (ports 4333 and 5336)
#   host3 - referenced only by the commented-out SRL step (port 4336)
host, host3 = '', ''

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# Each pipeline step is a triple: (processor, names of the inputs it reads,
# mapping from the processor's outputs to the names stored in the result).

# Remote processor on port 4333: reads the raw text and fills in
# sentences, tokens, postag and lemma.
basic_step = (
    ProcessorRemote(host, 4333, 'default'),
    ['text'],
    dict(sentences='sentences', tokens='tokens', postag='postag', lemma='lemma'),
)

# Local Mystem-to-UD converter: reads 'postag', writes 'morph' and 'postag'.
converter_step = (
    ConverterMystemToUd(),
    ['postag'],
    dict(morph='morph', postag='postag'),
)

# Remote processor on port 5336: reads tokens and sentences, writes the
# dependency trees; its 'postag' output is stored separately as 'ud_postag'.
syntax_step = (
    ProcessorRemote(host, 5336, '0'),
    ['tokens', 'sentences'],
    dict(syntax_dep_tree='syntax_dep_tree', postag='ud_postag'),
)

# Disabled SRL step (kept for reference; not part of the pipeline):
# srl_step = (ProcessorRemote(host3, 4336, 'default'),
#             ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
#             {'srl' : 'srl'})

ppl = PipelineCommon([basic_step, converter_step, syntax_step])

In [None]:
from utils.file_reading import read_edus, read_gold, read_annotation, prepare_text

In [None]:
import glob
import pickle
import os
import sys
from tqdm.notebook import tqdm

# Path prefixes whose '*.txt' matches are annotated. Each entry is concatenated
# with '*.txt' below, so a trailing slash means "every .txt file in that directory".
# NOTE(review): the blogs entry has no trailing slash, so it only matches
# blogs_txt/blogs_46*.txt -- confirm whether limiting blogs to these files is intended.
directories = ['corpus/RuRsTreebank_full/sci_comp/sci_comp_txt/',
               'corpus/RuRsTreebank_full/sci_ling/sci_ling_txt/',
               'corpus/RuRsTreebank_full/blogs/blogs_txt/blogs_46',
               'corpus/RuRsTreebank_full/news1/news1_txt/',
               'corpus/RuRsTreebank_full/news2/news2_txt/'
]

# Make sure the output directory exists even if the `mkdir data` cell was skipped.
os.makedirs('data', exist_ok=True)

for path in directories:
    print('analyze path:', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        # Context managers close the handles right away instead of leaving
        # one open input and one open output file per document.
        with open(file, 'r') as txt_file:
            text = prepare_text(txt_file.read())
        annot = ppl(text)
        # <name>.txt -> data/<name>.annot.pkl; basename() also copes with
        # platform-specific path separators returned by glob.
        filename = os.path.basename(file).replace('.txt', '.annot.pkl')
        with open(os.path.join('data', filename), 'wb') as pkl_file:
            pickle.dump(annot, pkl_file)

## Gold trees
### Extract features 
output:
 - models/tf_idf/pipeline.pkl  # is used in default feature extraction
 - file.gold.pkl  # dataset with extracted default features for gold trees

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import glob
import pickle
import numpy as np
import pandas as pd
import nltk

from utils.file_reading import read_annotation

import os  # cell-local import: only needed for makedirs below

from sklearn.feature_extraction.text import CountVectorizer

IN_PATH = 'data/'
# Replaces `! mkdir models` / `! mkdir models/tf_idf`, which fail when the cell is re-run.
os.makedirs('models/tf_idf', exist_ok=True)

# One document per annotated text: the lower-cased token strings.
# sorted() makes the document order independent of the filesystem's listing order.
corpus = []
for file in sorted(glob.glob("%s*.json" % IN_PATH)):
    tokens = read_annotation(file.replace('.json', ''))['tokens']
    corpus.append([token.text.lower() for token in tokens])


def dummy(text):
    """Return the argument unchanged.

    Used as both tokenizer and preprocessor so that CountVectorizer consumes the
    already tokenized documents as they are.

    Must stay a module-level function with this exact name: pickle stores
    functions by reference, so whoever loads pipeline.pkl needs a `dummy`
    resolvable under the same module path.
    """
    return text


# NOTE: despite the 'tf_idf' directory name, no TF-IDF weighting is applied here:
# the model is raw uni-/bigram counts followed by a 25-component truncated SVD.
count_vect = CountVectorizer(ngram_range=(1, 2), tokenizer=dummy, preprocessor=dummy)

svd = TruncatedSVD(n_components=25,
                   tol=0.0,
                   n_iter=7,
                   random_state=42)

pipeline = Pipeline([
    ('vect', count_vect),
    ('svd', svd)
])

pipeline.fit(corpus)
# Close the file deterministically so the pickle is fully flushed to disk.
with open('models/tf_idf/pipeline.pkl', 'wb') as pkl_file:
    pickle.dump(pipeline, pkl_file)

In [None]:
%%bash

# Download the NLTK stop-word lists.
python -c "import nltk; nltk.download('stopwords')"
# Install dostoevsky and fetch its fasttext-social-network-model data
# (presumably needed by the feature extraction below -- TODO confirm).
pip install dostoevsky
dostoevsky download fasttext-social-network-model

In [None]:
from utils.features_processor_default import FeaturesProcessor

# Same name as the identity function that was passed as tokenizer/preprocessor
# when models/tf_idf/pipeline.pkl was fitted and pickled. Presumably
# FeaturesProcessor unpickles that pipeline, which requires `dummy` to be
# defined in this namespace even on a fresh kernel -- TODO confirm.
def dummy(x):
    """Return the argument unchanged."""
    return x
    
# Default feature extractor, pointed at the 'models' directory, with verbose output off.
features_processor = FeaturesProcessor(model_dir_path='models', verbose=False)

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm
from utils.file_reading import read_gold, read_annotation


IN_PATH = 'data/'

for json_path in tqdm(glob.glob(f"{IN_PATH}*.json")):
    # Path without the '.json' part; both project readers are given this form.
    stem = json_path.replace('.json', '')

    # Gold pairs, keeping only rows whose snippet_x is non-empty.
    gold_pairs = read_gold(stem)
    non_empty_left = gold_pairs.snippet_x.map(len) > 0
    gold_pairs = gold_pairs[non_empty_left]

    # Annotation layers, passed positionally in exactly this order.
    annotation = read_annotation(stem)
    layers = [annotation[key] for key in ('text', 'tokens', 'sentences', 'lemma',
                                          'morph', 'postag', 'syntax_dep_tree')]

    gold_features = features_processor(gold_pairs, *layers)
    gold_features.to_pickle(json_path.replace('.json', '.gold.pkl'))

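UDPipe assigns its own POS tags (stored as `ud_postag`), which differ from the `postag` values produced by the Mystem-to-UD converter. The cell below joins the first two `ud_postag` entries of every sentence to see which combinations occur.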

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm

IN_PATH = 'data/'

# For every annotated document, take the first two entries of each item in
# annot['ud_postag'] and join them with '_'. The flat list is deduplicated
# in the next cell.
avail_pairs = []
for file in tqdm(glob.glob("%s*.json" % IN_PATH)):
    # `with` closes each pickle right after loading instead of leaking one
    # open handle per document.
    with open(file.replace('.json', '.annot.pkl'), 'rb') as pkl_file:
        annot = pickle.load(pkl_file)
    avail_pairs.extend('_'.join(sent[:2]) for sent in annot['ud_postag'])

In [None]:
# Distinct values among the joined tag pairs collected in avail_pairs.
set(avail_pairs)

Inspect the semantic role labeling (SRL) output stored under the `srl` key of the annotations.

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm

IN_PATH = 'data/'

# NOTE(review): annot['srl'] exists only if the annotations were produced with
# the SRL step of the pipeline enabled; that step is commented out in the
# pipeline cell, so this lookup raises KeyError on such annotations.
# (The unused `avail_pairs = []` reset was dropped: nothing here appends to it,
# and it wiped the list built by the POS-tag cell.)
for file in tqdm(glob.glob("%s*.json" % IN_PATH)):
    # `with` closes each pickle right after loading.
    with open(file.replace('.json', '.annot.pkl'), 'rb') as pkl_file:
        annot = pickle.load(pkl_file)
    for sent in annot['srl']:
        for pred in sent:
            for event in pred:
                print(event)
                # Leaves only the innermost loop: the first item of every
                # `pred` is printed, then the next `pred` is processed.
                break