In [2]:
# For every word -- except the most frequent ones, function words, numerals and named entities --
# find the dates of its first and last mention.
import pickle
import pymorphy2
import pandas as pd
import json
from datetime import date
from ner.network import NER
from ner.corpus import Corpus
from nltk.corpus import stopwords

In [3]:
# Load the pickled corpus into `data`.
# NOTE(review): unpickling can execute arbitrary code; only load "texts.pkl" from a trusted source.
with open("texts.pkl", "rb") as corpus_file:
    data = pickle.load(corpus_file)

In [4]:
# Discard the existing index (drop=True) and renumber the rows from 0,
# so the indexing loop further down can address rows as data[...][i] with i in range(len(...)).
data = data.reset_index(drop=True)

In [5]:
# Preview the first rows to check which columns the corpus has.
data.head()

Unnamed: 0,source,author,text,date
0,Lenta.ru,Unknown,Вопреки апокалиптическим прогнозам американски...,2000-01-01
1,Интерфакс,Unknown,Мадрид не выдаст Венесуэле укрывшегося в посо...,2000-01-01
2,Интерфакс,Unknown,Пожар на складе резины произошел в промзоне Н...,2000-01-01
3,Интерфакс,Unknown,Полиция Нью-Йорка обвинила трех подростков в ...,2000-01-01
4,Интерфакс,Unknown,Трамп направил кандидатуру постпреда США при ...,2000-01-01


In [6]:
# Words to exclude from the index because they are too frequent.
# The file is split on '\n', so each line is one entry (a trailing newline in the
# file yields a final empty-string entry, exactly as before).
# The encoding is given explicitly: without it open() falls back to the platform
# locale, which can mis-decode a Cyrillic word list.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("most_frequent.txt", 'r', encoding="utf-8") as file:
    most_freq = file.read().split('\n')

In [7]:
# Morphological analyzer used by lemmatize() and is_ok().
morph = pymorphy2.MorphAnalyzer()

# The name `stopwords` is rebound below from the NLTK corpus module to a plain list
# of words. Importing the module under an alias here keeps this cell re-runnable:
# the original `stopwords = stopwords.words(...)` failed on the second run because
# `stopwords` was by then a list with no `.words` attribute.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words("russian")

def tokenize(text):
    """Split ``text`` into tokens and strip surrounding punctuation.

    The text is split on any run of whitespace (spaces, tabs, newlines), then
    the characters ``. , : ; ? ! ( ) " '`` are stripped from both ends of each
    token. Tokens that end up empty are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings; an empty list if ``text`` contains
        no tokens (e.g. it is blank or consists only of punctuation).
    """
    # The apostrophe is escaped explicitly: in the previous literal the trailing
    # '' was a separate empty string, so single quotes were never stripped.
    punctuation = '.,:;?!()"\''
    stripped = (word.strip(punctuation) for word in text.split())
    return [word for word in stripped if word]

def lemmatize(text):
    """Return the normal form of every token in ``text``.

    The text is tokenized with ``tokenize``; each token is analysed with the
    module-level pymorphy2 analyzer ``morph`` and the ``normal_form`` of the
    first parse returned is taken.

    Args:
        text: The string to lemmatize.

    Returns:
        A list of normal forms, one per token, in token order.
    """
    lemmas = []
    for token in tokenize(text):
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

In [8]:
# Keyword arguments for the NER constructor, stored as JSON next to the model.
with open('model/params.json') as params_file:
    network_params = json.load(params_file)

# Build the corpus object from the dictionary file, then restore the pretrained tagger.
corpus = Corpus(dicts_filepath='model/dict.txt')
network = NER(
    corpus,
    verbouse=False,
    pretrained_model_filepath='model/ner_model',
    **network_params
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from model/ner_model


In [9]:
def is_ok(word):
    """Decide whether ``word`` should be kept for the index.

    A word is rejected when it is in ``stopwords``, when it is in
    ``most_freq``, or when the tag of its first pymorphy2 parse contains the
    ``NUMR`` grammeme.

    Args:
        word: The word to check.

    Returns:
        True if the word passes all three filters, False otherwise.
    """
    if word in stopwords or word in most_freq:
        return False
    first_tag = morph.parse(word)[0].tag
    return 'NUMR' not in first_tag

In [13]:
# Maps each kept word to a two-item list [first_date, last_date];
# it is filled in by the loop in the next cell.
reverse_index = {}

In [None]:
# Fill reverse_index: for every kept word store [first_date, last_date].
#
# NOTE(review): the "last" date is simply the date of the most recently processed
# document containing the word, so this assumes the rows of `data` are in
# chronological order — confirm.
# NOTE(review): the tagger receives lemmatized tokens rather than the original
# word forms — confirm this matches the input the model expects.
n_texts = len(data["text"])
for i in range(n_texts):
    # Named doc_date rather than `date` so the `datetime.date` import is not shadowed.
    doc_date = data["date"][i]
    text = lemmatize(data["text"][i])
    if text:  # nothing to tag if the document produced no tokens
        tags = network.predict_for_token_batch([text])[0]
        for word, tag in zip(text, tags):
            # Only tokens tagged 'O' are indexed (presumably those outside any
            # named entity — the header comment says entities are excluded).
            if tag == 'O' and is_ok(word):
                if word not in reverse_index:
                    reverse_index[word] = [doc_date, doc_date]
                else:
                    reverse_index[word][1] = doc_date
    print(f"{i + 1} texts parsed.")
    # Checkpoint after every 100 documents and after the last one, so the saved
    # file always ends up holding the complete index (previously the tail of the
    # corpus was never written, and the first iteration dumped an empty dict).
    if (i + 1) % 100 == 0 or i + 1 == n_texts:
        with open("reverse_index.pkl", "wb") as file:
            pickle.dump(reverse_index, file)