In [1]:
import wikipedia
import nltk
import re
from pymystem3 import Mystem
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource, older ones from 'punkt'. Request both so that sent_tokenize works
# on a fresh environment regardless of the installed NLTK version (an unknown
# package name only makes nltk.download return False, it does not raise).
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/a.tsigankov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# for pretty output
from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

In [3]:
# Show up to 80 characters per cell when frames of sentences are displayed.
# The fully qualified option name is used: abbreviated names are resolved by
# pattern matching and stop working as soon as they become ambiguous.
pd.set_option('display.max_colwidth', 80)
# Fetch articles from the Russian-language Wikipedia.
wikipedia.set_lang('ru')
# Tokens are runs of word characters, so punctuation is left out.
tokenizer = nltk.RegexpTokenizer(r"\w+")
# Shared Mystem instance used by the lemmatization step.
mystem = Mystem()

Факты

In [4]:
# Short factual statements (in Russian). They are appended to the sentence
# corpus further down and then used as queries for the nearest-sentence search.
facts = [
    "Нацистский врач-эсэсовец, работавший в шести концлагерях, был дважды оправдан и выпущен на свободу.",
    "Историк утверждает, что, прежде чем допустить крестьян на личный прием к вождю мирового пролетариата, их тщательно дезинфицировали.",
    "В России акции протеста проходят не только на площадях, но и на поездах."
]

Названия статей Википедии, на которые ссылаются факты

In [5]:
# Wikipedia article titles to download, grouped under an integer key.
# NOTE(review): the keys 1..3 presumably number the entries of `facts` in
# order - confirm; the download loop only iterates over the title lists.
wikipedia_pages = {
    1: ["Шмидт, Генрих (врач)"],
    2: ["Ходоки у В. И. Ленина", "Дезинфекция"],
    3: ["Проезд снаружи поездов"]
}

### Скачивание и обработка статей из Википедии

In [6]:
def get_page_text(pagename):
    """Return the ``content`` of the Wikipedia page looked up by ``pagename``.

    The page is requested through ``wikipedia.page`` with its default
    arguments, in the language selected earlier with ``wikipedia.set_lang``.

    NOTE(review): confirm that the library's default title resolution always
    lands on the intended article for the exact titles used in this notebook.
    """
    return wikipedia.page(pagename).content

In [7]:
def remove_trash(page_text):
    """Clean raw Wikipedia article text before sentence splitting.

    Steps, in order:
      1. every whitespace character (newlines included) becomes a single space;
      2. stand-alone capital-letter initials such as "В." are dropped;
      3. the "== Примечания" header and the run of word characters, spaces and
         "=" signs that follows it are removed;
      4. a space is inserted after every period;
      5. all remaining wiki section headers ("== ... ==", "=== ... ===", ...)
         are removed.

    Args:
        page_text: article text as a string.

    Returns:
        The cleaned text as a single-line string.
    """
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    page_text = re.sub(r'\s', ' ', page_text)
    # The word boundary restricts the match to one-letter initials. Without it
    # the last letter and the period of an upper-case abbreviation at the end
    # of a sentence ("СССР.") were removed too, gluing two sentences together.
    page_text = re.sub(r'\b[А-ЯЁ]\.', '', page_text)
    page_text = re.sub(r'\=\= Примечания[\w\s\=]*', '', page_text)
    page_text = page_text.replace('.', '. ')
    # "={2,}" consumes the whole marker of nested headers, so "=== x ===" no
    # longer leaves a stray "=" behind. Newlines need no extra handling here:
    # step 1 already replaced them.
    page_text = re.sub(r'={2,}.*?={2,}', '', page_text)

    return page_text

In [None]:
# Flat list of sentences gathered from every referenced Wikipedia article.
documents = []

for page_titles in wikipedia_pages.values():
    for page in page_titles:
        # Download, clean, then split the article into Russian sentences.
        cleaned_text = remove_trash(get_page_text(page))
        documents += nltk.sent_tokenize(cleaned_text, language="russian")

In [None]:
# Append the facts after the Wikipedia sentences. The guard keeps this cell
# idempotent: running it again does not append the same facts a second time.
if documents[-len(facts):] != facts:
    documents.extend(facts)

In [None]:
# One row per sentence/fact. Naming the column at construction time means an
# empty `documents` list still produces a valid one-column frame.
wiki = pd.DataFrame(documents, columns=['document'])

In [None]:
# Name the single text column; later cells read it as `wiki.document` / 'document'.
wiki.columns = ['document']

In [None]:
# Preview the first rows: sentences of the first downloaded article.
wiki.head()

### Лемматизация 

In [None]:
def lemmatize(sentence):
    """Return ``sentence`` as one string of lemmas.

    The sentence is split with the module-level ``tokenizer``, the tokens are
    re-joined with single spaces and passed to ``mystem.lemmatize``; the pieces
    it returns are concatenated and newline characters are removed.
    """
    words = tokenizer.tokenize(sentence)
    lemmas = mystem.lemmatize(' '.join(words))
    return ''.join(lemmas).replace('\n', '')

In [None]:
# Lemmatized version of every sentence, stored next to the original text.
wiki['lemmatized_document'] = wiki['document'].apply(lemmatize)

In [None]:
# The facts were appended last, so the tail shows them with their lemmatized form.
wiki.tail()

In [None]:
# Fit a TF-IDF vectorizer on the lemmatized texts. Rows of `tfidf_weight` follow
# the row order of `wiki`; the neighbour lookup below indexes it by row number.
tfidf_vect = TfidfVectorizer()
tfidf_weight = tfidf_vect.fit_transform(wiki['lemmatized_document'])

In [None]:
# Nearest-neighbour index over the TF-IDF rows, using the cosine metric.
nn_cosine = NearestNeighbors(metric='cosine')
nn_cosine.fit(tfidf_weight)

In [None]:
# The facts are appended after all Wikipedia sentences, so they occupy the last
# len(facts) rows of `wiki`. Deriving the row numbers (instead of hard-coding
# them) keeps them correct when the article texts, and with them the number of
# sentences, change.
facts_indexes = list(range(len(wiki) - len(facts), len(wiki)))

In [None]:
def get_nearest_documents(index, n_neighbors=11):
    """Return the rows of ``wiki`` closest to row ``index`` in TF-IDF space.

    Args:
        index: row number of the query document in ``wiki`` / ``tfidf_weight``.
        n_neighbors: how many rows to retrieve. The query row is part of the
            fitted data, so it is normally one of them; the default of 11
            therefore yields the query plus ten other documents.

    Returns:
        DataFrame with columns ``id`` (row number in ``wiki``), ``document``
        and ``cosine``, sorted by ``cosine`` ascending. ``cosine`` holds the
        values returned by ``nn_cosine.kneighbors`` for the cosine metric,
        i.e. distances: smaller means more similar.
    """
    # kneighbors refuses to return more neighbours than there are fitted rows.
    n_neighbors = min(n_neighbors, tfidf_weight.shape[0])
    cosine, indices = nn_cosine.kneighbors(tfidf_weight[index], n_neighbors=n_neighbors)

    neighbors_cosine = pd.DataFrame({'cosine': cosine.flatten(), 'id': indices.flatten()})

    # Attach the original text to every neighbour by matching the neighbour's
    # row number against the index of `wiki`.
    nearest_documents = (
        wiki
        .merge(neighbors_cosine, right_on='id', left_index=True)
        .sort_values('cosine')[['id', 'document', 'cosine']]
    )

    return nearest_documents

def print_results_for_document(index):
    """Print the document at row ``index`` and its nearest documents as Markdown.

    Args:
        index: row number of the query document in ``wiki``.

    The query text is printed in bold, followed by a bullet list of the other
    retrieved documents, each prefixed with its ``cosine`` value from
    ``get_nearest_documents`` formatted to two decimals.
    """
    nearest_documents = get_nearest_documents(index)

    # Take the query text from its own row instead of assuming it is the first
    # sorted neighbour: when several rows are at the same distance, the query
    # row is not guaranteed to come first.
    fact = wiki.loc[index, 'document']
    neighbours = nearest_documents[nearest_documents['id'] != index]

    printmd("Факт: " + "**" + fact + "**")
    printmd("Близкие документы:")

    for doc, cosine in zip(neighbours['document'].values, neighbours['cosine'].values):
        printmd(f"* [{cosine:.2f}] {doc}")

In [None]:
# Nearest documents for the row stored in facts_indexes[0].
print_results_for_document(facts_indexes[0])

In [None]:
# Nearest documents for the row stored in facts_indexes[1].
print_results_for_document(facts_indexes[1])

In [None]:
# Nearest documents for the row stored in facts_indexes[2].
print_results_for_document(facts_indexes[2])