# Кластеризация текстов

In [None]:
import itertools
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

from IPython.display import Image, SVG

%matplotlib inline

## Выборка

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
train_all = fetch_20newsgroups(subset='train')
print (train_all.target_names)

In [None]:
simple_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'soc.religion.christian', 'rec.sport.hockey'])

Пример текста

In [None]:
print(simple_dataset.data[0])

### Признаки

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Build TF-IDF features for the three-category dataset.
# Integer max_df / min_df are absolute document counts: terms that occur in
# more than 500 documents or in fewer than 10 documents are dropped.
vectorizer = TfidfVectorizer(max_df=500, min_df=10)
matrix = vectorizer.fit_transform(simple_dataset.data)
# Last expression of the cell, so the notebook displays the matrix shape.
matrix.shape

## Агломеративная кластеризация

In [None]:
# Import from the public package path: the private module
# `sklearn.cluster.hierarchical` was deprecated in scikit-learn 0.22 and
# removed in later releases, while `sklearn.cluster` works in every version.
from sklearn.cluster import AgglomerativeClustering

# NOTE(review): `affinity` was renamed to `metric` in scikit-learn 1.2 and
# removed in 1.4; keep `affinity` only while the environment pins an older
# release.
model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
# The TF-IDF matrix is sparse; it is converted to a dense array before clustering.
preds = model.fit_predict(matrix.toarray())

In [None]:
print(list(preds) [:10])

In [None]:
simple_dataset.target

In [None]:
preds

In [None]:
# Assessment: cluster ids are arbitrary, so relabel them with a hand-picked
# mapping (cluster id -> class id) before comparing with the true targets.
mapping = {2 : 1, 1: 2, 0: 0}
mapped_preds = [mapping[pred] for pred in preds]
# Manual error-rate variant (Python 2 style print), kept for reference:
# print (float(sum(mapped_preds != simple_dataset.target)) / len(simple_dataset.target))
print(accuracy_score(mapped_preds, simple_dataset.target))

In [None]:
def validate_with_mappings(preds, target, labels=(0, 1, 2)):
    """Print the accuracy of ``preds`` under every relabelling of the clusters.

    Cluster ids produced by a clustering model are arbitrary, so every
    permutation of ``labels`` is tried as a cluster-id -> class-id mapping and
    the accuracy against ``target`` is printed, one line per permutation.

    Args:
        preds: sequence of predicted cluster ids; each id must be in ``labels``.
        target: true class labels, aligned with ``preds``.
        labels: label values shared by clusters and classes. Defaults to
            ``(0, 1, 2)``, the three-cluster setup used in this notebook.
            The number of permutations grows factorially with its length.

    Raises:
        KeyError: if ``preds`` contains an id that is not in ``labels``.
    """
    labels = list(labels)
    # Keys are taken in reverse order so that the default labels reproduce the
    # original pairing {2: a, 1: b, 0: c} and hence the same print order.
    cluster_ids = labels[::-1]
    for permutation in itertools.permutations(labels):
        mapping = dict(zip(cluster_ids, permutation))
        mapped_preds = [mapping[pred] for pred in preds]
        print(accuracy_score(mapped_preds, target))
    
validate_with_mappings(preds, simple_dataset.target)

## KMeans

In [None]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF vectors into 3 groups; random_state fixes the seed.
model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (simple_dataset.target)
# Cluster ids are arbitrary: print the accuracy for every cluster -> class relabelling.
validate_with_mappings(preds, simple_dataset.target)

In [None]:
# Compare with a supervised baseline: logistic regression (not linear
# regression), scored as the mean of a 3-fold cross-validation.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver = 'lbfgs', multi_class='auto')
print (cross_val_score(clf, matrix, simple_dataset.target, cv=3).mean())

**Вопрос:** очень высокая точность кластеризации текстов, очень близкая к точности Supervised алгоритма. Почему?

## Более сложная выборка

In [None]:
noteasy_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'comp.graphics'])
matrix = vectorizer.fit_transform(noteasy_dataset.data)

In [None]:
model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (noteasy_dataset.target)
validate_with_mappings(preds, noteasy_dataset.target)

In [None]:
clf = LogisticRegression(solver = 'lbfgs', multi_class='auto')
print (cross_val_score(clf, matrix, noteasy_dataset.target, cv=3).mean())

## SVD + KMeans

In [None]:
from sklearn.decomposition import TruncatedSVD

model = KMeans(n_clusters=3, random_state=42)
# Project the TF-IDF matrix onto 1000 SVD components, then cluster the
# reduced features instead of the raw TF-IDF vectors.
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
# Print the accuracy for every cluster -> class relabelling.
validate_with_mappings(preds, noteasy_dataset.target)

In [None]:
model = KMeans(n_clusters=3, random_state=42)
# Same SVD + KMeans pipeline, this time with only 200 SVD components.
svd = TruncatedSVD(n_components=200, random_state=321)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
# Print the accuracy for every cluster -> class relabelling.
validate_with_mappings(preds, noteasy_dataset.target)


**Вопрос:** всё равно сумели добиться довольно высокой точности. В чем причина?

# Продвинутые методы кластеризации текстов

Будем использовать библиотеку gensim: https://radimrehurek.com/gensim/ 

In [None]:
import gensim
import nltk
from nltk import word_tokenize
from collections import Counter
import string

nltk.download('punkt')
nltk.download('stopwords')

### Откроем датасет

In [None]:
twenty = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

### Выполним токенизацию с помощью nltk

In [None]:
# Tokenize every document with NLTK, lower-case the tokens and drop tokens
# that consist only of punctuation characters.
# A plain `w in string.punctuation` is a substring test, so multi-character
# tokens such as '...', '--' or '``' would slip through; test each character.
punctuation_chars = set(string.punctuation)
docs = []
for text in twenty.data:
    tokens = [
        w.lower()
        for w in word_tokenize(text)
        if not all(ch in punctuation_chars for ch in w)
    ]
    docs.append(tokens)

In [None]:
print('Токенизированный текст:')
print(docs[4])

### Удалим стоп-слова из текстов

Эти слова часто встречаются в текстах вне зависимости от тематики. Поэтому в данной задаче они будут только мешать.

In [None]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
print(stop)

In [None]:
# Rebuild every tokenized document without its stop words, then make the
# filtered corpus the current `docs`.
new_docs = [
    [token for token in tokens if token not in stop]
    for tokens in docs
]
docs = new_docs

In [None]:
print(docs[4])

### Построим словарь по корпусу

In [None]:
from gensim import corpora

dictionary = corpora.Dictionary(docs)
print(dictionary)

In [None]:
# Filter the dictionary: keep tokens that occur in at least 2 documents
# (no_below=2). no_above is a fraction of the corpus, so no_above=1 drops
# nothing for being too frequent. keep_n caps the vocabulary at 300000 tokens.
dictionary.filter_extremes(no_below=2, no_above=1, keep_n=300000)
print(dictionary)

In [None]:
new_doc = "Hello world"
new_vec = dictionary.doc2bow(new_doc.lower().split())

In [None]:
new_vec

In [None]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
from gensim import models

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

### LSI

In [None]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

In [None]:
corpus_lsi = lsi[corpus_tfidf]

In [None]:
lsi.print_topics(num_topics=10, num_words=20)

In [None]:
lsi.show_topic(6, topn=30)

### LDA

In [None]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

In [None]:
corpus_lda = lda[corpus]

In [None]:
lda.print_topics(20)

In [None]:
lda.show_topic(6, topn=30)

### Similarities

In [None]:
from gensim import similarities

index = similarities.MatrixSimilarity(corpus_lsi)

In [None]:
# Project one document into LSI space: bag-of-words -> TF-IDF -> LSI.
doc = docs[0]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[tfidf[vec_bow]]
# LDA alternative, left disabled:
#vec_lda = lda[vec_bow]
print(vec_lsi)

In [None]:
sims = index[vec_lsi] # similarity of the query vector to every indexed document
# Print (document position, similarity) pairs for the whole index.
print(list(enumerate(sims)))

### Опциональное задание

- Подобрать параметры, чтобы получить более интерпретируемую картинку
- Избавиться от мусора в токенах (можно с помощью регулярных выражений)

## Word2Vec

Будем обучать word2vec из пакета gensim на корпусе opencorpora.<br/>
Этот корпус небольшой, поэтому выдающихся результатов ждать не стоит, но зато обучение будет происходить быстро.

In [None]:
import opencorpora

### Откроем корпус

In [None]:
corpus = opencorpora.CorpusReader('annot.opcorpora.xml')
docs = corpus.catalog()

In [None]:
# Show ten catalog entries. The loop variable is named `doc_id` so that the
# builtin `id` is not shadowed in the notebook namespace.
for doc_id, name in docs[100:110]:
    print(doc_id, name)

In [None]:
corpus.parsed_sents(105)

### Посмотрим на контексты слов

In [None]:
from nltk.text import Text

# Gather the words of every catalogued document into one flat token list and
# wrap it in an nltk Text. The loop variable is `doc_id` so that the builtin
# `id` is not shadowed in the notebook namespace.
all_tokens = []
for doc_id, name in docs:
    all_tokens.extend(corpus.words(doc_id))

textCorpus = Text(all_tokens)

In [None]:
textCorpus.concordance('король')

Среди токенов есть пунктуация, которая несет мало контекстной информации.

Так как текстов немного, матрицу term-context можно сделать более плотно заполненной, если использовать нормализованную форму слов.<br/>
В корпусе эта информация уже есть, но если бы её не было, мы могли бы воспользоваться pymorphy.

In [None]:
# Print each sentence of one document as space-joined normalized words,
# skipping tokens tagged 'PNCT'.
# `doc_id` is used instead of `id` so the builtin is not shadowed.
for doc_id, name in docs[103:104]:
    for sentence in corpus.parsed_sents(doc_id):
        words = []
        for word_info in sentence:
            # Judging by the original variable names, word_info[1][0] is the
            # first parse: element 0 is the normal form, element 1 the tag.
            word_norm = word_info[1][0][0]
            word_tag = word_info[1][0][1]
            if word_tag != 'PNCT':
                words.append(word_norm)
        print(' '.join(words))

### Обучим word2vec

In [None]:
import codecs
import string

import pymorphy2


# Dump the corpus to a UTF-8 text file, one sentence per line, as
# space-separated normalized words with 'PNCT'-tagged tokens removed.
with codecs.open('opencorpora_for_word2vec.txt', 'w', encoding='utf-8') as f:
    # Not used by the loop below (the corpus already carries normal forms);
    # kept because it is a notebook-global that other cells may rely on.
    morph = pymorphy2.MorphAnalyzer()
    # `doc_id` instead of `id` so the builtin is not shadowed.
    for doc_id, name in docs:
        for sentence in corpus.parsed_sents(doc_id):
            # Judging by the original variable names, word_info[1][0] is the
            # first parse: element 0 is the normal form, element 1 the tag.
            words = [
                word_info[1][0][0]
                for word_info in sentence
                if word_info[1][0][1] != 'PNCT'
            ]
            f.write(' '.join(words) + '\n')


In [None]:
# Посмотрим, что записалось в файл
!head -10 opencorpora_for_word2vec.txt

In [None]:
from gensim.models.word2vec import LineSentence, Word2Vec

sentences = LineSentence('opencorpora_for_word2vec.txt')

In [None]:
import gensim

# Train 300-dimensional word vectors: context window of 5, words seen fewer
# than 5 times are ignored, 4 worker threads, 20 passes over the corpus.
# NOTE(review): `size` and `iter` are the gensim < 4.0 argument names (renamed
# to `vector_size` and `epochs` in gensim 4.0) - confirm the installed version.
model = Word2Vec(sentences, size=300, window=5, min_count=5, workers=4, iter=20)
# Keep only the L2-normalized vectors; the model cannot be trained further afterwards.
model.init_sims(replace=True)

In [None]:
# Query through `model.wv` (KeyedVectors): the `most_similar` shortcut on the
# Word2Vec model itself is deprecated and was removed in gensim 4.0.
for w, sim in model.wv.most_similar(positive=['Google']):
    print(w, sim)

In [None]:
# Query through `model.wv` (KeyedVectors): the `most_similar` shortcut on the
# Word2Vec model itself is deprecated and was removed in gensim 4.0.
for w, sim in model.wv.most_similar(positive=['оператор']):
    print(w, sim)

In [None]:
# Analogy query via `model.wv` (KeyedVectors): the `most_similar` shortcut on
# the Word2Vec model itself is deprecated and was removed in gensim 4.0.
for w, sim in model.wv.most_similar(positive=['мальчик', 'женщина'], negative=['мужчина']):
    print(w, sim)

In [None]:
# Query through `model.wv` (KeyedVectors): the `most_similar` shortcut on the
# Word2Vec model itself is deprecated and was removed in gensim 4.0.
for w, sim in model.wv.most_similar(positive=['ходить']):
    print(w, sim)

In [None]:
# Analogy query via `model.wv` (KeyedVectors): the `most_similar` shortcut on
# the Word2Vec model itself is deprecated and was removed in gensim 4.0.
for w, sim in model.wv.most_similar(positive=[u'брат', u'жена'], negative=[u'муж']):
    print(w, sim)

In [None]:
# Use `model.wv`: the model-level `doesnt_match` shortcut is deprecated and
# was removed in gensim 4.0.
print(model.wv.doesnt_match("книга журнал машина".split()))

In [None]:
# Use `model.wv`: the model-level `similarity` shortcut is deprecated and was
# removed in gensim 4.0.
model.wv.similarity('книга', 'телефон')

In [None]:
# Look the vector up through `model.wv`: indexing the Word2Vec model directly
# is deprecated and no longer supported in gensim 4.0.
model.wv['книга']