## RS3 parsing 
output:
 - file.edus  # text file with the EDUs (elementary discourse units) extracted from the .rs3 file, one EDU per line
 - file.json  # JSON file with the discourse-unit pairs taken from the gold trees; keys: ['snippet_x', 'snippet_y', 'category_id']

In [1]:
! mkdir data

In [2]:
# Run the .rs3 parser over each of the three sub-corpora (news, computer
# science, linguistics). The script's stdout is redirected to one log file per
# sub-corpus in the working directory.
# NOTE(review): the .edus/.json outputs appear to be written to data/ by the
# script itself - confirm in utils/parse_rs3.py.
! python utils/parse_rs3.py corpus/news_texts/news_rs3/* > rst_news_parsing.log
! python utils/parse_rs3.py corpus/science_texts/compscience/compscience_rs3/* > rst_comp_parsing.log
! python utils/parse_rs3.py corpus/science_texts/linguistics/linguistics_rs3/* > rst_ling_parsing.log

In [3]:
# Sanity check: show the most recently modified entries of data/ (-t sorts by
# modification time). `head` closes the pipe early, which is why `ls` may
# report "write error: Broken pipe"; that message is harmless.
! ls -laht data | head

total 34M
drwxr-xr-x 2 root root  12K Nov 22 16:44 .
-rw-r--r-- 1 root root  21K Nov 22 16:44 ling_19.edus
-rw-r--r-- 1 root root 145K Nov 22 16:44 ling_19.json
-rw-r--r-- 1 root root  27K Nov 22 16:44 ling_36.edus
-rw-r--r-- 1 root root 264K Nov 22 16:44 ling_36.json
-rw-r--r-- 1 root root  22K Nov 22 16:44 ling_6.edus
-rw-r--r-- 1 root root 162K Nov 22 16:44 ling_6.json
-rw-r--r-- 1 root root  16K Nov 22 16:44 ling_34.edus
-rw-r--r-- 1 root root 114K Nov 22 16:44 ling_34.json
ls: write error: Broken pipe


In [5]:
# Total number of lines over all .edus files; with one EDU per line this is the
# number of EDUs in the corpus.
! wc -l data/*.edus | grep 'total'

  19917 total


## Annotate the texts with isanlp 
output:
 - file.annot.pkl  # pickled isanlp annotations of the text (morphology, syntax, semantics)

In [None]:
! pip install git+https://github.com/IINemo/isanlp.git@dev

In [None]:
! pip install git+https://github.com/tchewik/isanlp_srl_framebank.git

In [9]:
# Addresses of the remote isanlp services used by the pipeline below:
# `host` is contacted on ports 4333 and 5336, `host3` on port 4335.
# Both are left empty here and must be filled in before the annotation cells
# are run.
host = ''
host3 = ''

In [10]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# Annotation pipeline. Each stage is a tuple of
#   (processor, names of the annotations it consumes,
#    {processor output name: name under which it is stored in the result}).
# Stages, in order:
#   1. remote processor on host:4333  - 'text' -> sentences, tokens,
#      'mystem_postags' (the processor's 'postag' output) and lemma;
#   2. ConverterMystemToUd            - 'mystem_postags' -> morph, postag;
#   3. remote processor on host:5336  - tokens + sentences -> syntax_dep_tree;
#   4. remote processor on host3:4335 - tokens, postag, morph, lemma and
#      syntax_dep_tree -> srl.
# NOTE(review): the tuple layout is read off the code and the resulting
# annotation keys; confirm against the isanlp PipelineCommon documentation.
ppl = PipelineCommon([(ProcessorRemote(host, 4333, 'default'),
                       ['text'],
                       {'sentences' : 'sentences', 
                        'tokens' : 'tokens',
                        'postag' : 'mystem_postags',
                        'lemma' : 'lemma'}),
                      (ConverterMystemToUd(), 
                        ['mystem_postags'],
                        {'morph' : 'morph',
                        'postag' : 'postag'}),
                      (ProcessorRemote(host, 5336, '0'), 
                        ['tokens', 'sentences'], 
                        {'syntax_dep_tree' : 'syntax_dep_tree'}),
                      (ProcessorRemote(host3, 4335, 'default'),
                        ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
                        {'srl' : 'srl'})])

In [None]:
import glob
import pickle
import os
import sys
from tqdm import tqdm_notebook as tqdm

# Plain-text directories of the three sub-corpora. The trailing slash matters:
# the glob pattern further down is built as f'{path}*.txt'.
directories = ['corpus/news_texts/news_txt/',
               'corpus/science_texts/compscience/compscience_txt/',
               'corpus/science_texts/linguistics/linguistics_txt/']

def prepare_text(text):
    """Normalise a raw corpus text before it is sent to the annotation pipeline.

    The text is flattened to a single line and a fixed sequence of literal
    substring replacements is applied: HTML entities are unescaped, runs of
    spaces are shortened, dashes are unified, some symbols are dropped and the
    Latin letters ``x``/``y`` are replaced by their Cyrillic look-alikes.

    Args:
        text (str): raw contents of a corpus ``.txt`` file.

    Returns:
        str: the normalised text. It always ends with a single space, because
        a line break is appended to the input and line breaks become spaces.
    """
    # Line breaks preceded by one or two spaces lose those spaces; this is done
    # through a '#####' sentinel, so a literal '#####' in the input is also
    # treated as a line break. A final line break is appended.
    text = text.replace('  \n', '#####')
    text = text.replace(' \n', '#####')
    text = text + '#####'
    text = text.replace('#####', '\n')

    # Ordered on purpose - later rules see the output of earlier ones:
    #   * '&amp;' is unescaped after '&gt;'/'&lt;', so '&amp;gt;' yields '&gt;';
    #   * longer patterns precede their prefixes ('   ' before '  ',
    #     '——' before '—', '\^' before '^', 'xc' before 'x').
    # A tuple of pairs (rather than a dict) makes that order explicit.
    # NOTE(review): the Latin -> Cyrillic substitutions presumably mirror the
    # normalisation applied to the EDU texts - confirm against parse_rs3.py.
    replacements = (
        ('\n', r' '),
        ('&gt;', r'>'),
        ('&lt;', r'<'),
        ('&amp;', r'&'),
        ('&quot;', r'"'),
        ('&ndash;', r'–'),
        ('\\\\\\\\', r'\\'),   # four backslashes -> two
        ('   ', r' '),
        ('  ', r' '),
        ('——', r'-'),
        ('—', r'-'),
        ('/', r''),
        ('\\^', r''),          # backslash + caret (was the invalid escape '\^')
        ('^', r''),
        ('±', r'+'),
        ('y', r'у'),
        ('xc', r'хс'),
        ('x', r'х'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

# Annotate every .txt file of every sub-corpus and store the result as
# data/<name>.annot.pkl.
for path in directories:
    print('>>', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        # Context managers close the handles deterministically; the original
        # left one open file per text and per pickle.
        # NOTE(review): the file is read with the platform default encoding -
        # consider passing encoding='utf-8' once the corpus encoding is confirmed.
        with open(file, 'r') as source:
            text = prepare_text(source.read())
        annot = ppl(text)
        filename = os.path.basename(file).replace('.txt', '.annot.pkl')
        with open(os.path.join('data', filename), 'wb') as target:
            pickle.dump(annot, target)


## Gold trees
### Extract features 
output:
 - models/tf_idf/pipeline.pkl  # is used in default feature extraction
 - file.gold.pkl  # dataset with extracted default features for gold trees

In [18]:
# Show which annotation layers a pipeline result contains.
# NOTE: `annot` is whatever an earlier cell left behind (the last file handled
# by the annotation loop), so this cell fails on a kernel where that loop has
# not been run.
annot.keys()

dict_keys(['text', 'sentences', 'tokens', 'mystem_postags', 'lemma', 'morph', 'postag', 'syntax_dep_tree', 'srl'])

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import glob
import pickle
import numpy as np
import pandas as pd

IN_PATH = 'data/'
! mkdir models
! mkdir models/tf_idf

corpus = []
for file in glob.glob("%s*.json" % IN_PATH):
    table = pd.read_json(file)
    annot = pickle.load(open(file.replace('.json', '.annot.pkl'), 'rb'))
    #for sentence in annot['sentences']:
    #    corpus.append(' '.join([token.text for token in annot['tokens'][sentence.begin:sentence.end]]))
    for sentence in annot['lemma']:
        corpus.append(' '.join(sentence))

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(1, 3))

#tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=False, norm='l2', analyzer='word',
#                                    ngram_range=(2, 3), use_idf=1, smooth_idf=1)

svd = TruncatedSVD(n_components=300,
                   tol=0.0,
                   n_iter=7,
                   random_state=42)

pipeline = Pipeline([
    ('vect', count_vect),
    ('svd', svd)
])

pipeline.fit(corpus)
pickle.dump(pipeline, open('models/tf_idf/pipeline.pkl', 'wb'))

mkdir: cannot create directory ‘models’: File exists
mkdir: cannot create directory ‘models/tf_idf’: File exists


In [26]:
# Fetch the NLTK stopword lists into the local nltk_data directory.
# NOTE(review): presumably required by FeaturesProcessor in the next cell - confirm.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
from utils.features_processor_default import FeaturesProcessor

# 'models' is the directory under which the fitted vectoriser pipeline was
# saved (models/tf_idf/pipeline.pkl).
# NOTE(review): verbose=True appears to enable the progress/timing printout
# seen in the cell outputs - confirm in FeaturesProcessor.
features_processor = FeaturesProcessor(model_dir_path='models', verbose=True)

Processor initialization...	[DONE]


In [30]:
import glob
import pandas as pd
import pickle

IN_PATH = 'data/'
# For every text with gold du-pairs: extract the default features and store
# them next to the input as <name>.gold.pkl.
for file in glob.glob("%s*.json" % IN_PATH):
    table = pd.read_json(file)
    # Drop pairs whose first snippet is empty.
    table = table[table.snippet_x.map(len) > 0]
    # Context manager closes the handle; the original leaked one per file.
    # NOTE: pickle executes arbitrary code on load - only use trusted files.
    with open(file.replace('.json', '.annot.pkl'), 'rb') as annot_file:
        annot = pickle.load(annot_file)
    features = features_processor(table,
                                  annot['text'], annot['tokens'],
                                  annot['sentences'], annot['lemma'],
                                  annot['morph'], annot['postag'],
                                  annot['syntax_dep_tree'])
    features.to_pickle(file.replace('.json', '.gold.pkl'))

1	0.2521805763244629
2	0.01662445068359375
3	0.014444828033447266
4	0.018341779708862305
5	0.005433797836303711
6	0.5700323581695557
7	0.03795194625854492
8	0.04698586463928223
9	0.538733720779419
10	0.050879716873168945
11	0.057419776916503906
12	0.1179969310760498
[DONE]
estimated time: 1.7280683517456055
1	0.29721570014953613
2	0.015913009643554688
3	0.014619588851928711
4	0.02109694480895996
5	0.007283926010131836
6	0.5742106437683105
7	0.03778219223022461
8	0.051985740661621094
9	0.5362906455993652
10	0.05201125144958496
11	0.06919693946838379
12	0.14900517463684082
[DONE]
estimated time: 1.827531337738037
1	0.2695019245147705
2	0.0169985294342041
3	0.014343500137329102
4	0.017215251922607422
5	0.005011558532714844
6	0.6037766933441162
7	0.03932833671569824
8	0.04305100440979004
9	0.5308477878570557
10	0.05165290832519531
11	0.05371356010437012
12	0.1120755672454834
[DONE]
estimated time: 1.7584919929504395
1	0.17884159088134766
2	0.013361692428588867
3	0.012866973876953125
4	0.01

6	0.6131432056427002
7	0.039991140365600586
8	0.04297780990600586
9	0.5392050743103027
10	0.052526235580444336
11	0.05319809913635254
12	0.10337090492248535
[DONE]
estimated time: 1.7101969718933105
1	0.11378622055053711
2	0.008322477340698242
3	0.010262012481689453
4	0.007472991943359375
5	0.0027239322662353516
6	0.18724298477172852
7	0.016278982162475586
8	0.027748584747314453
9	0.5298564434051514
10	0.025132179260253906
11	0.02436065673828125
12	0.03262138366699219
[DONE]
estimated time: 0.9865677356719971
1	0.3909187316894531
2	0.019529342651367188
3	0.015750408172607422
4	0.022962093353271484
5	0.007369279861450195
6	0.7482731342315674
7	0.04709887504577637
8	0.05776190757751465
9	0.5466253757476807
10	0.07035088539123535
11	0.08414363861083984
12	0.17756247520446777
[DONE]
estimated time: 2.189704179763794
1	0.1306908130645752
2	0.009842872619628906
3	0.011109590530395508
4	0.009337425231933594
5	0.003207683563232422
6	0.2811110019683838
7	0.021190404891967773
8	0.030413866043090

10	0.0289459228515625
11	0.02856588363647461
12	0.042324066162109375
[DONE]
estimated time: 1.0876872539520264
1	0.12381339073181152
2	0.009268522262573242
3	0.010551691055297852
4	0.008180856704711914
5	0.0028679370880126953
6	0.22586750984191895
7	0.01821422576904297
8	0.029459714889526367
9	0.5298125743865967
10	0.027254819869995117
11	0.026580095291137695
12	0.03784489631652832
[DONE]
estimated time: 1.0506179332733154
1	0.18378305435180664
2	0.012021303176879883
3	0.01259160041809082
4	0.013806581497192383
5	0.004974842071533203
6	0.40838050842285156
7	0.028316259384155273
8	0.04198288917541504
9	0.5402851104736328
10	0.040778160095214844
11	0.04855656623840332
12	0.0924832820892334
[DONE]
estimated time: 1.4287919998168945
1	0.14061999320983887
2	0.00952911376953125
3	0.011125326156616211
4	0.009572744369506836
5	0.003253459930419922
6	0.28440022468566895
7	0.021322011947631836
8	0.03127002716064453
9	0.523406982421875
10	0.030508995056152344
11	0.030387401580810547
12	0.04758739

1	0.11879825592041016
2	0.009186267852783203
3	0.010688066482543945
4	0.008394479751586914
5	0.002945423126220703
6	0.22803544998168945
7	0.01836371421813965
8	0.029293537139892578
9	0.5303668975830078
10	0.027765512466430664
11	0.02714061737060547
12	0.04000377655029297
[DONE]
estimated time: 1.0518295764923096
1	0.2295842170715332
2	0.014643669128417969
3	0.013236045837402344
4	0.015269994735717773
5	0.004437923431396484
6	0.5240488052368164
7	0.03569841384887695
8	0.04031229019165039
9	0.5298395156860352
10	0.04657340049743652
11	0.04925942420959473
12	0.09128570556640625
[DONE]
estimated time: 1.5949468612670898
1	0.11860251426696777
2	0.010329723358154297
3	0.011004447937011719
4	0.009334802627563477
5	0.003176450729370117
6	0.2578752040863037
7	0.019896268844604492
8	0.028748273849487305
9	0.5238699913024902
10	0.02834320068359375
11	0.027744770050048828
12	0.0414121150970459
[DONE]
estimated time: 1.0812382698059082
1	0.12682008743286133
2	0.010704517364501953
3	0.01178932189941

10	0.03614211082458496
11	0.03588247299194336
12	0.05728030204772949
[DONE]
estimated time: 1.2634894847869873
1	0.16272187232971191
2	0.011759281158447266
3	0.012911796569824219
4	0.012957096099853516
5	0.0034089088439941406
6	0.3875136375427246
7	0.028194189071655273
8	0.035829782485961914
9	0.5358617305755615
10	0.036649227142333984
11	0.03633737564086914
12	0.05726146697998047
[DONE]
estimated time: 1.3224201202392578
1	0.13338923454284668
2	0.012470722198486328
3	0.011600017547607422
4	0.011568784713745117
5	0.0033817291259765625
6	0.33843111991882324
7	0.025506258010864258
8	0.03396129608154297
9	0.5351824760437012
10	0.03462505340576172
11	0.035100698471069336
12	0.0576624870300293
[DONE]
estimated time: 1.2337191104888916
1	0.11912775039672852
2	0.010366439819335938
3	0.01112818717956543
4	0.009572267532348633
5	0.002872943878173828
6	0.2442305088043213
7	0.020034313201904297
8	0.03228878974914551
9	0.5294651985168457
10	0.02868056297302246
11	0.02751612663269043
12	0.039677858

1	0.11164426803588867
2	0.007802009582519531
3	0.01014089584350586
4	0.006890535354614258
5	0.0028557777404785156
6	0.15944957733154297
7	0.014484882354736328
8	0.027806997299194336
9	0.5308964252471924
10	0.024005889892578125
11	0.02339315414428711
12	0.029566287994384766
[DONE]
estimated time: 0.9499399662017822
1	0.26242613792419434
2	0.015475273132324219
3	0.014571666717529297
4	0.01730489730834961
5	0.005802631378173828
6	0.5384056568145752
7	0.03706240653991699
8	0.04474616050720215
9	0.5383589267730713
10	0.049779653549194336
11	0.05671501159667969
12	0.11068367958068848
[DONE]
estimated time: 1.6922225952148438
1	0.12460994720458984
2	0.00907444953918457
3	0.011274099349975586
4	0.008416175842285156
5	0.0031976699829101562
6	0.23365163803100586
7	0.018288850784301758
8	0.029586076736450195
9	0.5270781517028809
10	0.027797698974609375
11	0.029264450073242188
12	0.04434347152709961
[DONE]
estimated time: 1.0674772262573242
1	0.11783289909362793
2	0.008684158325195312
3	0.01073622