In [2]:
# Для каждого слова -- кроме самых частотных, служебных, числительных и именованных сущностей
# находим первую и последнюю дату упоминания.
import pickle
import pymorphy2
import pandas as pd
import json
from datetime import date
from ner.network import NER
from ner.corpus import Corpus
from nltk.corpus import stopwords
from IPython.display import clear_output

In [3]:
with open("texts.pkl", "rb") as file:
    data = pickle.load(file)

In [4]:
data = data.reset_index(drop=True)

In [5]:
data.head()

Unnamed: 0,source,author,text,date
0,Lenta.ru,Unknown,Вопреки апокалиптическим прогнозам американски...,2000-01-01
1,Интерфакс,Unknown,Мадрид не выдаст Венесуэле укрывшегося в посо...,2000-01-01
2,Интерфакс,Unknown,Пожар на складе резины произошел в промзоне Н...,2000-01-01
3,Интерфакс,Unknown,Полиция Нью-Йорка обвинила трех подростков в ...,2000-01-01
4,Интерфакс,Unknown,Трамп направил кандидатуру постпреда США при ...,2000-01-01


In [6]:
with open("most_frequent.txt", 'r') as file:
    most_freq = [word for word in file.read().split('\n')]

In [7]:
morph = pymorphy2.MorphAnalyzer()
stopwords = stopwords.words("russian")

def tokenize(text):
    return ([word.strip('.,:;?!()""''') for word in text.split(" ")])

def lemmatize(text):
    return ([morph.parse(word)[0].normal_form for word in tokenize(text)])

In [8]:
with open('model/params.json') as f:
    network_params = json.load(f)

corpus = Corpus(dicts_filepath='model/dict.txt')

network = NER(corpus, verbouse=False, pretrained_model_filepath='model/ner_model', **network_params)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from model/ner_model


In [9]:
def is_ok(word):
    if word in stopwords:
        return (False)
    if word in most_freq:
        return (False)
    if 'NUMR' in morph.parse(word)[0].tag:
        return (False)
    return (True)

In [13]:
reverse_index = {}

In [14]:
for i in range(len(data["text"])):
    if i % 100 == 0:
        with open("reverse_index.pkl", "wb") as file:
            pickle.dump(reverse_index, file)
    date = data["date"][i]
    text = lemmatize(data["text"][i])
    tags = network.predict_for_token_batch([text])[0]
    for word, tag in zip(text, tags):
        if tag == 'O' and is_ok(word):
            if word not in reverse_index:
                reverse_index[word] = [date, date]
            else:
                reverse_index[word][1] = date
    clear_output()
    print(f"{i} texts parsed.")

0 texts parsed.
1 texts parsed.
2 texts parsed.
3 texts parsed.
4 texts parsed.
5 texts parsed.
6 texts parsed.
7 texts parsed.
8 texts parsed.
9 texts parsed.
10 texts parsed.
11 texts parsed.
12 texts parsed.
13 texts parsed.
14 texts parsed.
15 texts parsed.
16 texts parsed.
17 texts parsed.
18 texts parsed.
19 texts parsed.
20 texts parsed.
21 texts parsed.
22 texts parsed.
23 texts parsed.
24 texts parsed.
25 texts parsed.
26 texts parsed.
27 texts parsed.
28 texts parsed.
29 texts parsed.
30 texts parsed.
31 texts parsed.
32 texts parsed.
33 texts parsed.
34 texts parsed.
35 texts parsed.
36 texts parsed.
37 texts parsed.
38 texts parsed.
39 texts parsed.
40 texts parsed.
41 texts parsed.
42 texts parsed.
43 texts parsed.
44 texts parsed.
45 texts parsed.
46 texts parsed.
47 texts parsed.
48 texts parsed.
49 texts parsed.
50 texts parsed.
51 texts parsed.
52 texts parsed.
53 texts parsed.
54 texts parsed.
55 texts parsed.
56 texts parsed.
57 texts parsed.
58 texts parsed.
59 text

KeyboardInterrupt: 

In [15]:
reverse_index

{'вопреки': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 1)],
 'апокалиптический': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 1)],
 'прогноз': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 5)],
 'компьютерный': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 4)],
 'новогодний': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 5)],
 '1': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 5)],
 'январь': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 5)],
 '2000': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 3)],
 'катастрофа': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 4)],
 '-': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 5)],
 'глобальный': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 1)],
 'локальный': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 5)],
 'критический': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 3)],
 'жизнеобеспечение': [datetime.date(2000, 1, 1), datetime.date(2000, 1, 1)],
 'перенести': [datetime.da