In [15]:
from functools import lru_cache
import pymorphy2
from ruwordnet import RuWordNet
import json
from os import listdir
from os.path import isfile, join
from const import DAMP_OF_WIKIDATA_PATh, RUWORDNET_PATH
from emoji import UNICODE_EMOJI

In [16]:
def is_lat(s):
    """Tell whether ``s`` is built only from "plain" characters.

    A character counts as plain when it is alphabetic, a digit, a space,
    a hyphen or a colon. Note that ``str.isalpha`` accepts letters of any
    alphabet, so the check is not limited to Latin script despite the name.

    Returns False as soon as any other character is met; otherwise returns
    True unless ``s`` itself is a key of ``UNICODE_EMOJI``.
    """
    allowed_punctuation = (" ", "-", ":")
    if any(
        not (ch.isalpha() or ch.isdigit() or ch in allowed_punctuation)
        for ch in s
    ):
        return False
    return s not in UNICODE_EMOJI
@lru_cache(maxsize=200000)
def get_normal_form(word):
    """Return the normal form of the first parse reported for ``word``.

    The lookup goes through the module-level ``morph_analizer``; results are
    memoized by ``lru_cache`` (up to 200000 distinct words) so repeated words
    are not analyzed again.
    """
    first_parse = morph_analizer.parse(word)[0]
    return first_parse.normal_form
# Shared pymorphy2 analyzer; get_normal_form() reads it at call time.
morph_analizer = pymorphy2.MorphAnalyzer()
# RuWordNet handle opened from the location configured in const.RUWORDNET_PATH.
wn = RuWordNet(filename_or_session=RUWORDNET_PATH)

In [17]:
# Lookup set of every RuWordNet sense lemma, with each word replaced by its
# lower-cased normal form and the words re-joined with single spaces.
set_senses = {
    ' '.join(get_normal_form(token).lower() for token in sense.lemma.split())
    for sense in wn.senses
}

In [18]:
# Names of the regular files (sub-directories are skipped) found directly
# inside the Wikidata dump directory.
onlyfiles = [
    entry
    for entry in listdir(DAMP_OF_WIKIDATA_PATh)
    if isfile(join(DAMP_OF_WIKIDATA_PATh, entry))
]

In [19]:
def my_split(x):
    """Rewrite parentheses so the result can be split on commas.

    Every "(" becomes "," and every ")" is dropped; all other characters are
    copied unchanged. For example ``"a(b)c"`` becomes ``"a,bc"``.
    """
    substitutions = {"(": ",", ")": ""}
    return "".join(substitutions.get(ch, ch) for ch in x)
def TitleInWn(all_senses, title):
    """Find the entry of ``all_senses`` that corresponds to ``title``.

    The title is lower-cased, em dashes are replaced by hyphens and commas
    are removed. Candidates are then tried in this order:

    1. the normalized title itself;
    2. if it contains "(", the text before the first parenthesis, compared
       as-is (whitespace in front of the parenthesis is not trimmed here);
    3. that same text with every whitespace-separated word replaced by its
       lower-cased normal form;
    4. if the title contains "ё", the whole procedure again with "ё"
       replaced by "е".

    Returns the matching string, or None when nothing matches.
    """
    normalized = title.lower().replace("—", "-").replace(",", "")
    if normalized in all_senses:
        return normalized

    # my_split turns "(" into ",", so the first comma-separated piece is the
    # text preceding the first parenthesis (or the whole title if there is none).
    head = my_split(normalized).split(",")[0]
    if "(" in normalized and head in all_senses:
        return head

    lemma = " ".join(get_normal_form(token).lower() for token in head.split())
    if lemma in all_senses:
        return lemma

    if "ё" not in normalized:
        return None
    # The retry cannot loop forever: the replaced title no longer contains "ё".
    return TitleInWn(all_senses, normalized.replace("ё", "е"))

In [21]:
# First pass over the dump files.
# tweets: (record, lemma) pairs for records whose Russian label is rejected by
#         is_lat() and can be matched to a sense by TitleInWn().
# to_add: rel_id values referenced by the matched records through
#         'subclass_of' / 'part_of' relations.
tweets = []
to_add = set()
for file in onlyfiles:
    # os.path.join instead of a hard-coded "\\" separator, so the path also
    # resolves on non-Windows systems (and matches how onlyfiles was built).
    with open(join(DAMP_OF_WIKIDATA_PATh, file), 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is parsed as one JSON record.
            info = json.loads(line)
            if 'ru' in info['label'] and (not is_lat(info["label"]['ru'])):
                # Keep only the part of the label before the first "(" or ",".
                label = my_split(info["label"]['ru']).split(',')[0]
                lemma = TitleInWn(set_senses, label)
                if lemma is not None:
                    tweets.append((info, lemma))
                    # 'rels' may be empty/falsy, in which case there is nothing to follow.
                    if info['rels']:
                        for elem in info['rels']:
                            if elem['type'] in ('subclass_of', 'part_of'):
                                to_add.add(elem['rel_id'])

In [22]:
# Number of (record, lemma) pairs currently held in tweets.
len(tweets)

59030

In [23]:
# dict_id_to_idx = {tweet['rels'][0]['rel_id']:idx for idx, (tweet, _) in enumerate(tweets) if tweet['rels']}

In [24]:
# Second pass over the dump files: add the records whose id is listed in
# to_add, provided they have a Russian label that TitleInWn() can match.
# Unlike the first pass, no is_lat() filter is applied here.
#
# Ids already present in tweets are skipped; otherwise a record collected
# earlier (or by a previous run of this cell) would be appended a second time.
seen_ids = {record['id'] for record, _lemma in tweets}
for file in onlyfiles:
    # os.path.join instead of a hard-coded "\\" separator, so the path also
    # resolves on non-Windows systems.
    with open(join(DAMP_OF_WIKIDATA_PATh, file), 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is parsed as one JSON record.
            info = json.loads(line)
            if ('ru' in info['label']
                    and info['id'] in to_add
                    and info['id'] not in seen_ids):
                # Keep only the part of the label before the first "(" or ",".
                label = my_split(info["label"]['ru']).split(',')[0]
                lemma = TitleInWn(set_senses, label)
                if lemma is not None:
                    tweets.append((info, lemma))
                    seen_ids.add(info['id'])


In [25]:
# Number of (record, lemma) pairs currently held in tweets.
len(tweets)

59506

In [64]:
# Spot-check a single (record, lemma) pair.
tweets[123]

({'id': 'Q203982',
  'pageid': 200409,
  'label': {'en': 'Adam Małysz', 'ru': 'Малыш, Адам'},
  'descriptions': {'en': 'Polish former ski jumper'},
  'aliases': {'ru': ['Малыш А.', 'Адам Малыш', 'Малыш Адам', 'Adam Małysz'],
   'en': ['Adam Malysz']},
  'sitelinks': {'ruwiki': 'Малыш, Адам', 'enwiki': 'Adam Małysz'},
  'rels': [{'rel_id': 'Q5', 'rank': 'normal', 'type': 'instance_of'}]},
 'малыш')