# Продвинутые методы кластеризации текстов

Будем использовать библиотеку gensim (https://radimrehurek.com/gensim/) – обновите виртуальное окружение.

In [1]:
import gensim

### Откроем датасет

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts with subset='all'; the `remove` tuple asks the
# loader to strip headers, footers and quotes from each post, so only the
# message bodies are kept in `twenty.data`.
twenty = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

### Выполним токенизацию с помощью nltk

In [3]:
import nltk

# Fetch the 'punkt' NLTK resource (presumably the tokenizer data that
# `word_tokenize` relies on; confirm against the NLTK docs).
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead;
# verify for the installed version.
# Kept as the last expression so the notebook displays the returned status.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/tylorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk import word_tokenize
from collections import Counter
import string

# Tokenize every post, lower-case the tokens and drop punctuation tokens.
# NOTE: `string.punctuation` is a str, so the membership test is a substring
# check. Single punctuation characters are removed, but multi-character
# tokens such as '--', '...', '``' or "''" are kept in the documents.
docs = [
    [tok.lower() for tok in word_tokenize(post) if tok not in string.punctuation]
    for post in twenty.data
]

In [5]:
# Show a caption followed by the token list of the document at index 4,
# each on its own line.
print('Токенизированный текст:', docs[4], sep='\n')

Токенизированный текст:
['1', 'i', 'have', 'an', 'old', 'jasmine', 'drive', 'which', 'i', 'can', 'not', 'use', 'with', 'my', 'new', 'system', 'my', 'understanding', 'is', 'that', 'i', 'have', 'to', 'upsate', 'the', 'driver', 'with', 'a', 'more', 'modern', 'one', 'in', 'order', 'to', 'gain', 'compatability', 'with', 'system', '7.0.1.', 'does', 'anyone', 'know', 'of', 'an', 'inexpensive', 'program', 'to', 'do', 'this', 'i', 'have', 'seen', 'formatters', 'for', '20', 'buit', 'have', 'no', 'idea', 'if', 'they', 'will', 'work', '2', 'i', 'have', 'another', 'ancient', 'device', 'this', 'one', 'a', 'tape', 'drive', 'for', 'which', 'the', 'back', 'utility', 'freezes', 'the', 'system', 'if', 'i', 'try', 'to', 'use', 'it', 'the', 'drive', 'is', 'a', 'jasmine', 'direct', 'tape', 'bought', 'used', 'for', '150', 'w/', '6', 'tapes', 'techmar', 'mechanism', 'essentially', 'i', 'have', 'the', 'same', 'question', 'as', 'above', 'anyone', 'know', 'of', 'an', 'inexpensive', 'beckup', 'utility', 'i', 'can

### Удалим стоп-слова из текстов

Эти слова часто встречаются в текстах вне зависимости от тематики, поэтому в данной задаче они будут только мешать.

In [6]:
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available: `stopwords.words` raises
# LookupError on a machine where the corpus was never downloaded (the notebook
# only fetches 'punkt'). quiet=True keeps the cell output limited to the
# printed set below.
nltk.download('stopwords', quiet=True)

# English stop words as a set, so the membership tests in the filtering step
# are hash lookups rather than list scans.
stop = set(stopwords.words('english'))
print(stop)

{'these', 'what', 'should', 'won', 'most', 'there', 'me', "that'll", 'down', "hasn't", 'own', 'she', "mightn't", "should've", 'to', 'am', 'does', 'some', 'or', 'shouldn', "you'd", 'so', 'their', 'i', 'more', 'can', 're', 'her', 'which', 'do', 'under', 'them', 'hers', 'having', 'couldn', 'if', 'themselves', 'our', 'your', 've', 'and', 'an', 'above', 'after', 'needn', 'itself', 'doing', 'not', 'himself', 'for', 'are', 'd', 'ain', 'they', 'below', 'again', 'hadn', 'those', 'now', 'mustn', "you're", 'with', 'against', "won't", 'off', 'have', "wasn't", 'we', 'any', 'by', 'yourselves', "aren't", 'further', 'until', 'a', 'nor', 'ours', 't', 'doesn', 'all', 'very', 'while', "you'll", 'haven', "don't", 'over', "shouldn't", "weren't", 'weren', 'it', 'then', 'you', 'whom', "haven't", 'yourself', 'is', 'but', "isn't", 'mightn', 'at', 'yours', 'll', "didn't", 'isn', 'he', 'same', "she's", 'herself', 'in', 'such', 'how', 'up', "you've", 'will', 'o', 'be', 'during', 'wasn', 'its', 'other', 'been', 'a

In [7]:
# Rebuild every tokenized document without the tokens found in `stop`,
# then rebind `docs` to the filtered corpus.
new_docs = [
    [word for word in doc_tokens if word not in stop]
    for doc_tokens in docs
]
docs = new_docs

In [8]:
# Same document as before (index 4), now with the stop words removed.
print(docs[4])

['1', 'old', 'jasmine', 'drive', 'use', 'new', 'system', 'understanding', 'upsate', 'driver', 'modern', 'one', 'order', 'gain', 'compatability', 'system', '7.0.1.', 'anyone', 'know', 'inexpensive', 'program', 'seen', 'formatters', '20', 'buit', 'idea', 'work', '2', 'another', 'ancient', 'device', 'one', 'tape', 'drive', 'back', 'utility', 'freezes', 'system', 'try', 'use', 'drive', 'jasmine', 'direct', 'tape', 'bought', 'used', '150', 'w/', '6', 'tapes', 'techmar', 'mechanism', 'essentially', 'question', 'anyone', 'know', 'inexpensive', 'beckup', 'utility', 'use', 'system', '7.0.1']


### Построим словарь по корпусу

In [9]:
from gensim import corpora

# Build the gensim dictionary (token <-> integer id mapping) from the
# tokenized, stop-word-filtered documents.
dictionary = corpora.Dictionary(docs)
# Printing the dictionary reports the number of unique tokens and a sample.
print(dictionary)

Dictionary(198027 unique tokens: ['actually', 'also', 'anyway', 'bashers', 'beat']...)


In [10]:
# Filter the dictionary in place.
# Per the gensim `filter_extremes` documentation: no_below=2 drops tokens that
# occur in fewer than 2 documents; no_above is a fraction of the corpus, so
# no_above=1 puts no upper limit on document frequency; keep_n=300000 caps the
# vocabulary size after filtering.
dictionary.filter_extremes(no_below=2, no_above=1, keep_n=300000)
print(dictionary)

Dictionary(60483 unique tokens: ['actually', 'also', 'anyway', 'bashers', 'beat']...)


In [11]:
# Turn an ad-hoc string into a bag-of-words vector: lower-case it, split on
# whitespace and map the resulting tokens through the dictionary.
new_doc = "Hello world"
new_vec = dictionary.doc2bow(new_doc.lower().split())

In [12]:
# Display the bag-of-words vector: a list of (token id, count) pairs.
new_vec

[(770, 1), (4686, 1)]

In [13]:
# Bag-of-words representation of the whole corpus: one doc2bow vector per
# tokenized document, in the same order as `docs`.
corpus = list(map(dictionary.doc2bow, docs))

In [14]:
from gensim import models

# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
# Indexing the model with a corpus applies the TF-IDF weighting to it.
corpus_tfidf = tfidf[corpus]

### LSI

In [15]:
# Train a 10-topic LSI model on the TF-IDF corpus; `id2word` supplies the
# dictionary so topics can be reported as words rather than integer ids.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

In [16]:
# Project the TF-IDF corpus into the LSI topic space.
corpus_lsi = lsi[corpus_tfidf]

In [17]:
# Display all 10 LSI topics, each rendered as a string of 20 weighted words.
lsi.print_topics(num_topics=10, num_words=20)

[(0,
  '0.987*"--" + 0.036*"\'\'" + 0.030*"``" + 0.026*"..." + 0.023*"n\'t" + 0.023*"\'s" + 0.019*"1" + 0.017*"would" + 0.016*"one" + 0.016*"0" + 0.014*"people" + 0.014*"get" + 0.014*"know" + 0.014*"2" + 0.014*"like" + 0.013*"\'m" + 0.012*"use" + 0.012*"think" + 0.012*"also" + 0.011*"time"'),
 (1,
  '-0.214*"\'\'" + -0.184*"``" + -0.166*"n\'t" + -0.162*"..." + -0.154*"\'s" + 0.151*"--" + -0.140*"would" + -0.119*"one" + -0.112*"people" + -0.103*"like" + -0.100*"know" + -0.096*"get" + -0.095*"think" + -0.094*"god" + -0.090*"\'m" + -0.086*"could" + -0.078*"good" + -0.075*"also" + -0.075*"use" + -0.074*"time"'),
 (2,
  '0.234*"god" + 0.215*"\'\'" + 0.208*"``" + -0.204*"windows" + -0.173*"thanks" + -0.153*"drive" + -0.151*"card" + -0.116*"please" + -0.109*"dos" + 0.106*"jesus" + 0.105*"people" + -0.105*"anyone" + -0.101*"file" + -0.093*"software" + -0.089*"program" + -0.088*"disk" + -0.088*"system" + -0.087*"advance" + -0.086*"pc" + -0.084*"scsi"'),
 (3,
  '-0.655*"..." + 0.261*"god" + 0.17

In [18]:
# Display LSI topic 6 as 30 (word, weight) pairs.
lsi.show_topic(6, topn=30)

[('god', 0.4280338535388315),
 ('drive', 0.18953382098086954),
 ('jesus', 0.17469414588976204),
 ("''", -0.17314524952045643),
 ('1', 0.17074046105858176),
 ('``', -0.16517557979445016),
 ('key', -0.15632796434255716),
 ('government', -0.15178672181800817),
 ('game', 0.1365068147692675),
 ('2', 0.12960960231628751),
 ('encryption', -0.12566678219629587),
 ('0', 0.12347349406420842),
 ('chip', -0.10918863740833173),
 ('clipper', -0.10516412592876272),
 ('scsi', 0.09569790300650378),
 ('games', 0.09443961243787354),
 ('card', 0.09318871005429562),
 ('keys', -0.09270574873104238),
 ('x', -0.09082989489227028),
 ('bible', 0.08994847137919082),
 ('christ', 0.08532847234827436),
 ('3', 0.0802715376362297),
 ('faith', 0.07597612076700522),
 ('ide', 0.07577522898844595),
 ('file', -0.07526622045519835),
 ('sin', 0.07369835959384831),
 ('controller', 0.06953142423576955),
 ('escrow', -0.06800387819384468),
 ('4', 0.06763677094950721),
 ('christian', 0.06733144639559693)]

### LDA

In [19]:
# Train a 20-topic LDA model on the raw bag-of-words corpus (not the TF-IDF
# one), making 10 passes over the data.
# NOTE(review): no random_state is passed, so the topics presumably differ
# between runs; confirm and pin a seed if reproducibility matters.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

In [20]:
# Apply the LDA model to the bag-of-words corpus to get per-document topics.
corpus_lda = lda[corpus]

In [21]:
# Display 20 LDA topics, each rendered as a string of weighted words.
lda.print_topics(20)

[(0,
  '0.008*"one" + 0.008*"car" + 0.007*"\'s" + 0.007*"would" + 0.006*"like" + 0.006*"n\'t" + 0.005*"used" + 0.005*"also" + 0.005*"power" + 0.005*"use"'),
 (1,
  '0.676*"--" + 0.004*"\'\'" + 0.002*"_/" + 0.002*"file" + 0.002*"tor" + 0.002*"det" + 0.001*"bos" + 0.001*"``" + 0.001*"1" + 0.001*"pts"'),
 (2,
  '0.022*"drive" + 0.012*"disk" + 0.010*"hard" + 0.010*"price" + 0.009*"sale" + 0.009*"please" + 0.008*"new" + 0.008*"drives" + 0.008*"offer" + 0.008*"scsi"'),
 (3,
  '0.035*"n\'t" + 0.025*"\'s" + 0.015*"would" + 0.013*"\'\'" + 0.012*"one" + 0.011*"``" + 0.010*"like" + 0.010*"think" + 0.009*"know" + 0.009*"get"'),
 (4,
  '0.021*"game" + 0.017*"team" + 0.015*"\'s" + 0.014*"year" + 0.013*"games" + 0.009*"season" + 0.009*"hockey" + 0.008*"last" + 0.008*"play" + 0.007*"players"'),
 (5,
  '0.020*"card" + 0.010*"mac" + 0.010*"monitor" + 0.009*"video" + 0.009*"board" + 0.009*"db" + 0.008*"apple" + 0.008*"memory" + 0.007*"modem" + 0.007*"bus"'),
 (6,
  '0.009*"\'\'" + 0.008*"space" + 0.007*"

In [22]:
# Display LDA topic 6 as 30 (word, weight) pairs.
lda.show_topic(6, topn=30)

[("''", 0.008547938),
 ('space', 0.00807677),
 ('``', 0.0071484935),
 ('use', 0.006715425),
 ('medical', 0.0060977456),
 ('health', 0.00551356),
 ('also', 0.0045722793),
 ('launch', 0.0045270007),
 ('research', 0.0044693006),
 ('number', 0.0044362526),
 ('program', 0.003945844),
 ('disease', 0.0038403259),
 ('may', 0.003762252),
 ('drug', 0.0037017886),
 ('years', 0.0036456059),
 ("'s", 0.0034633416),
 ('patients', 0.0033457),
 ('aids', 0.003273301),
 ('1993', 0.0031918862),
 ('rate', 0.0031273786),
 ('study', 0.003089018),
 ('age', 0.003006778),
 ('cancer', 0.0029866612),
 ('reported', 0.0029654019),
 ('data', 0.002962463),
 ('shuttle', 0.0029469475),
 ('nasa', 0.0029422925),
 ('treatment', 0.002883637),
 ('april', 0.0028814469),
 ('low', 0.0028571167)]

### Similarities

In [23]:
from gensim import similarities

# Build a similarity index over the LSI-projected corpus; it is queried
# later with a document's LSI vector.
index = similarities.MatrixSimilarity(corpus_lsi)

  if np.issubdtype(vec.dtype, np.int):


In [24]:
# Use the first corpus document as the query.
doc = docs[0]
# Bag-of-words vector of the query document.
vec_bow = dictionary.doc2bow(doc)
# LSI was trained on TF-IDF input, so the query goes through TF-IDF first.
vec_lsi = lsi[tfidf[vec_bow]]
# LDA was trained on raw counts, so it takes the bag-of-words vector directly.
vec_lda = lda[vec_bow]
# Prints the query's LDA representation as (topic id, weight) pairs.
print(vec_lda)

[(3, 0.5968805), (4, 0.26077282), (7, 0.070963666), (16, 0.039297562), (19, 0.021371132)]


In [25]:
# Query the index with the LSI vector to find similar vectors: the result
# holds one similarity score per indexed document.
sims = index[vec_lsi]
# Print (document position, similarity) pairs in corpus order.
print([(pos, score) for pos, score in enumerate(sims)])

[(0, 1.0), (1, -0.016020186), (2, 0.24476933), (3, -0.13402167), (4, 0.018832482), (5, 0.52422357), (6, 0.30389732), (7, 0.9549281), (8, 0.8806156), (9, 0.0059505217), (10, 0.19897309), (11, -0.016295858), (12, 0.39718354), (13, 0.12901296), (14, -0.0032772087), (15, 0.17627211), (16, -0.018927034), (17, 0.21632572), (18, 0.14980677), (19, 0.5041377), (20, 0.6059541), (21, 0.19476672), (22, 0.4055552), (23, 0.72389776), (24, 0.67794764), (25, 0.4351244), (26, 0.5370468), (27, 0.032690026), (28, 0.07486583), (29, 0.19004422), (30, 0.47575748), (31, 0.14667453), (32, 0.0076875687), (33, 0.86102754), (34, 0.55279696), (35, 0.18732095), (36, 0.38883895), (37, 0.14185339), (38, 0.48589802), (39, 0.428982), (40, 0.66759175), (41, 0.26409405), (42, 0.62091035), (43, -0.06595177), (44, 0.079058126), (45, 0.037893936), (46, 0.11635432), (47, 0.6117934), (48, 0.31287283), (49, -0.092176564), (50, 0.25913638), (51, 0.57340825), (52, 0.0030790581), (53, -0.21169293), (54, 0.5895906), (55, 0.616497

### Опциональное задание

- Подобрать параметры, чтобы получить более интерпретируемую картинку
- Избавиться от мусора в токенах (можно с помощью регулярных выражений)