# Кластеризация текстов

In [70]:
import itertools
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

from IPython.display import Image, SVG

%matplotlib inline


## Выборка

In [77]:
from sklearn.datasets import fetch_20newsgroups
train_all = fetch_20newsgroups(subset='train')
print (train_all.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [78]:
simple_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'soc.religion.christian', 'rec.sport.hockey'])

simple_dataset.data[0]

'From: erik@cheshire.oxy.edu (Erik Adams)\nSubject: HELP!!  My Macintosh "luggable" has lines on its screen!\nOrganization: Occidental College, Los Angeles, CA 90041 USA.\nDistribution: comp\nLines: 20\n\nOkay, I don\'t use it very much, but I would like for it to keep working\ncorrectly, at least as long as Apple continues to make System software\nthat will run on it, if slowly :-)\n\nHere is the problem:  When the screen is tilted too far back, vertical\nlines appear on the screen.  They are every 10 pixels or so, and seem\nto be affected somewhat by opening windows and pulling down menus.\nIt looks to a semi-technical person like there is a loose connection\nbetween the screen and the rest of the computer.\n\nI am open to suggestions that do not involve buying a new computer,\nor taking this one to the shop.  I would also like to not have\nto buy one of Larry Pina\'s books.  I like Larry, but I\'m not sure\nI feel strongly enough about the computer to buy a service manual\nfor it.\n

In [79]:
print(simple_dataset.data[0])

From: erik@cheshire.oxy.edu (Erik Adams)
Subject: HELP!!  My Macintosh "luggable" has lines on its screen!
Organization: Occidental College, Los Angeles, CA 90041 USA.
Distribution: comp
Lines: 20

Okay, I don't use it very much, but I would like for it to keep working
correctly, at least as long as Apple continues to make System software
that will run on it, if slowly :-)

Here is the problem:  When the screen is tilted too far back, vertical
lines appear on the screen.  They are every 10 pixels or so, and seem
to be affected somewhat by opening windows and pulling down menus.
It looks to a semi-technical person like there is a loose connection
between the screen and the rest of the computer.

I am open to suggestions that do not involve buying a new computer,
or taking this one to the shop.  I would also like to not have
to buy one of Larry Pina's books.  I like Larry, but I'm not sure
I feel strongly enough about the computer to buy a service manual
for it.

On a related note:  what

### Признаки

In [80]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=500, min_df=10)
matrix = vectorizer.fit_transform(simple_dataset.data)
matrix.shape

(1777, 3767)

## Аггломеративная кластеризация

In [85]:
from sklearn.cluster.hierarchical import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
preds = model.fit_predict(matrix.toarray())
list(preds)[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [86]:
simple_dataset.target

array([0, 0, 1, ..., 0, 1, 2])

In [87]:
preds

array([0, 0, 0, ..., 0, 2, 1])

In [88]:
# Assessement
mapping = {2 : 1, 1: 2, 0: 0}
mapped_preds = [mapping[pred] for pred in preds]
print (float(sum(mapped_preds != simple_dataset.target)) / len(simple_dataset.target))
print(accuracy_score(mapped_preds, simple_dataset.target))

0.6409679234665167
0.3590320765334834


In [89]:
def validate_with_mappings(preds, target):
    permutations = itertools.permutations([0, 1, 2])
    for a, b, c in permutations:
        mapping = {2 : a, 1: b, 0: c}
        mapped_preds = [mapping[pred] for pred in preds]
#         print (float(sum(mapped_preds != target)) / len(target))
        print(accuracy_score(mapped_preds, target))
    
validate_with_mappings(preds, simple_dataset.target)

0.325267304445695
0.3275182892515476
0.34721440630275746
0.3590320765334834
0.3157006190208216
0.325267304445695


## KMeans

In [90]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (simple_dataset.target)
validate_with_mappings(preds, simple_dataset.target)

[0 0 2 ... 0 2 1]
[0 0 1 ... 0 1 2]
0.029262802476083285
0.32639279684862127
0.34946539110861
0.9527293190770962
0.018007878446820485
0.3241418120427687


In [91]:
# Compare with Linear Regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver = 'lbfgs', multi_class='auto')
print (cross_val_score(clf, matrix, simple_dataset.target, cv=3).mean())

0.9864864479274559


**Вопрос:** очень высокая точность кластеризации текстов, очень близкая к точности Supervised алгоритма. Почему?

## Более сложная выборка

In [92]:
noteasy_dataset = fetch_20newsgroups(
    subset='train', 
    categories=['comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'comp.graphics'])
matrix = vectorizer.fit_transform(noteasy_dataset.data)

In [93]:
model = KMeans(n_clusters=3, random_state=1)
preds = model.fit_predict(matrix.toarray())
print (preds)
print (noteasy_dataset.target)
validate_with_mappings(preds, noteasy_dataset.target)

[0 1 2 ... 0 2 0]
[2 1 1 ... 2 0 2]
0.753565316600114
0.2966343411294923
0.39361095265259555
0.1289218482601255
0.11751283513976041
0.3097547062179121


In [94]:
clf = LogisticRegression(solver = 'lbfgs', multi_class='auto')
print (cross_val_score(clf, matrix, noteasy_dataset.target, cv=3).mean())

0.9178509820019255


## SVD + KMeans

In [95]:
from sklearn.decomposition import TruncatedSVD

model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=1000, random_state=123)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
validate_with_mappings(preds, noteasy_dataset.target)

0.4067313177410154
0.0889903023388477
0.793496862521392
0.29720479178551057
0.29606389047347403
0.11751283513976041


In [96]:
model = KMeans(n_clusters=3, random_state=42)
svd = TruncatedSVD(n_components=200, random_state=321)
features = svd.fit_transform(matrix)
preds = model.fit_predict(features)
validate_with_mappings(preds, noteasy_dataset.target)

0.286936679977182
0.15459212778094694
0.11066742726754136
0.2994865944095836
0.41357672561323444
0.7347404449515117



**Вопрос:** всё равно сумели добиться довольно высокой точности. В чем причина?

# Продвинутые методы кластеризации текстов

Будем использовать библиотеку gensim: https://radimrehurek.com/gensim/ 

In [97]:
import gensim
import nltk
from nltk import word_tokenize
from collections import Counter
import string

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/xenakas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/xenakas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Откроем датасет

In [98]:
twenty = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

### Выполним токенизацию с помощью nltk

In [99]:
docs = []
for text in twenty.data:
    tokens = [w.lower() for w in word_tokenize(text) if not w in string.punctuation]
    docs.append(tokens)

In [100]:
print('Токенизированный текст:')
print(docs[4])

Токенизированный текст:
['1', 'i', 'have', 'an', 'old', 'jasmine', 'drive', 'which', 'i', 'can', 'not', 'use', 'with', 'my', 'new', 'system', 'my', 'understanding', 'is', 'that', 'i', 'have', 'to', 'upsate', 'the', 'driver', 'with', 'a', 'more', 'modern', 'one', 'in', 'order', 'to', 'gain', 'compatability', 'with', 'system', '7.0.1.', 'does', 'anyone', 'know', 'of', 'an', 'inexpensive', 'program', 'to', 'do', 'this', 'i', 'have', 'seen', 'formatters', 'for', '20', 'buit', 'have', 'no', 'idea', 'if', 'they', 'will', 'work', '2', 'i', 'have', 'another', 'ancient', 'device', 'this', 'one', 'a', 'tape', 'drive', 'for', 'which', 'the', 'back', 'utility', 'freezes', 'the', 'system', 'if', 'i', 'try', 'to', 'use', 'it', 'the', 'drive', 'is', 'a', 'jasmine', 'direct', 'tape', 'bought', 'used', 'for', '150', 'w/', '6', 'tapes', 'techmar', 'mechanism', 'essentially', 'i', 'have', 'the', 'same', 'question', 'as', 'above', 'anyone', 'know', 'of', 'an', 'inexpensive', 'beckup', 'utility', 'i', 'can

### Удалим стоп-слова из текстов

Эти слова часто встречаются в текстах, вне зависимости от тематики. Поэтому в данной задаче они нам будут только мешаться.

In [101]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
print(stop)

{'her', 'while', 'o', 'which', 'its', 'having', 'did', 'not', 'what', 'is', 'during', 'on', 'hasn', 'can', "you'd", 'you', 'themselves', "aren't", 'your', 'he', "couldn't", 've', 'weren', 'here', 'a', 'been', 'me', 'there', 'does', 'yourself', 'below', "wouldn't", 'some', 're', "that'll", 'just', "hadn't", "hasn't", 'd', 'himself', 'shan', 'i', 'be', 'whom', 'more', 'we', 'again', 'those', 'she', 'up', 'both', 't', "weren't", "haven't", 'off', 'against', 'mustn', 'through', 'once', 'now', "you'll", 'most', 'his', 'above', 'herself', 's', 'than', 'myself', 'for', "needn't", 'into', 'have', 'out', 'was', 'because', "mustn't", 'such', 'so', 'm', 'our', 'aren', 'isn', 'by', 'am', 'as', 'to', 'has', "don't", 'do', 'nor', "won't", 'from', 'between', 'hadn', 'them', 'll', 'yourselves', 'they', "shan't", 'itself', "mightn't", 'until', 'at', 'this', 'won', 'an', 'their', 'my', 'ma', 'of', "wasn't", "didn't", 'before', 'or', 'ain', 'over', 'too', 'how', 'who', 'with', 'the', 'theirs', 'if', 'wer

In [102]:
new_docs = []
for tokens in docs:
    new_docs.append([token for token in tokens if not token in stop])
docs = new_docs

In [103]:
print(docs[4])

['1', 'old', 'jasmine', 'drive', 'use', 'new', 'system', 'understanding', 'upsate', 'driver', 'modern', 'one', 'order', 'gain', 'compatability', 'system', '7.0.1.', 'anyone', 'know', 'inexpensive', 'program', 'seen', 'formatters', '20', 'buit', 'idea', 'work', '2', 'another', 'ancient', 'device', 'one', 'tape', 'drive', 'back', 'utility', 'freezes', 'system', 'try', 'use', 'drive', 'jasmine', 'direct', 'tape', 'bought', 'used', '150', 'w/', '6', 'tapes', 'techmar', 'mechanism', 'essentially', 'question', 'anyone', 'know', 'inexpensive', 'beckup', 'utility', 'use', 'system', '7.0.1']


### Построим словарь по корпусу

In [104]:
from gensim import corpora

dictionary = corpora.Dictionary(docs)
print(dictionary)

Dictionary(198705 unique tokens: ['actually', 'also', 'anyway', 'bashers', 'beat']...)


In [105]:
# Отфильтруем словарь
dictionary.filter_extremes(no_below=2, no_above=1, keep_n=300000)
print(dictionary)

Dictionary(60494 unique tokens: ['actually', 'also', 'anyway', 'bashers', 'beat']...)


In [106]:
new_doc = "Hello world"
new_vec = dictionary.doc2bow(new_doc.lower().split())

In [107]:
new_vec

[(770, 1), (4686, 1)]

In [108]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [109]:
from gensim import models

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

### LSI

In [110]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

In [111]:
corpus_lsi = lsi[corpus_tfidf]

In [112]:
lsi.print_topics(num_topics=10, num_words=20)

[(0,
  '0.987*"--" + 0.036*"\'\'" + 0.030*"``" + 0.026*"..." + 0.023*"n\'t" + 0.023*"\'s" + 0.019*"1" + 0.017*"would" + 0.016*"one" + 0.016*"0" + 0.014*"people" + 0.014*"get" + 0.014*"know" + 0.014*"2" + 0.014*"like" + 0.013*"\'m" + 0.012*"use" + 0.012*"think" + 0.012*"also" + 0.011*"time"'),
 (1,
  '0.214*"\'\'" + 0.184*"``" + 0.166*"n\'t" + 0.162*"..." + 0.154*"\'s" + -0.151*"--" + 0.140*"would" + 0.119*"one" + 0.112*"people" + 0.103*"like" + 0.100*"know" + 0.096*"get" + 0.095*"think" + 0.094*"god" + 0.090*"\'m" + 0.086*"could" + 0.078*"good" + 0.075*"use" + 0.075*"also" + 0.074*"time"'),
 (2,
  '-0.235*"god" + -0.216*"\'\'" + -0.208*"``" + 0.203*"windows" + 0.172*"thanks" + 0.154*"drive" + 0.152*"card" + 0.114*"please" + 0.108*"dos" + -0.107*"jesus" + -0.106*"people" + 0.106*"anyone" + 0.101*"file" + 0.092*"software" + 0.090*"program" + 0.087*"system" + 0.087*"disk" + 0.086*"advance" + 0.086*"pc" + 0.084*"scsi"'),
 (3,
  '0.656*"..." + -0.262*"god" + -0.178*"\'\'" + 0.155*"game" + -

In [113]:
lsi.show_topic(6, topn=30)

[('god', 0.43112193011738564),
 ('drive', 0.18825411269516154),
 ('1', 0.17475663540550607),
 ('jesus', 0.17305151695602514),
 ("''", -0.17053739499653683),
 ('``', -0.1637040634404436),
 ('key', -0.15851009066026522),
 ('government', -0.15403023540510616),
 ('game', 0.1347051238621458),
 ('2', 0.13256213972135655),
 ('0', 0.12733704522265324),
 ('encryption', -0.12267925407718164),
 ('chip', -0.10754708241057973),
 ('clipper', -0.10479511153088027),
 ('scsi', 0.10284578361146503),
 ('bible', 0.09399045956553387),
 ('card', 0.09387298091858082),
 ('keys', -0.0928926182277786),
 ('games', 0.09131130736808389),
 ('christ', 0.08596529693579624),
 ('x', -0.08106906443393129),
 ('3', 0.08039785380422805),
 ('ide', 0.07844708772380066),
 ('program', -0.07320755043304271),
 ('sin', 0.0729387266982728),
 ('faith', 0.07287550623489923),
 ('file', -0.07245234803682975),
 ('controller', 0.06820121374267508),
 ('algorithm', -0.06727377212382457),
 ('4', 0.06696456212730935)]

### LDA

In [114]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

In [115]:
corpus_lda = lda[corpus]

In [116]:
lda.print_topics(20)

[(0,
  '0.021*"space" + 0.009*"earth" + 0.008*"\'s" + 0.007*"nasa" + 0.007*"launch" + 0.006*"system" + 0.005*"orbit" + 0.005*"moon" + 0.005*"solar" + 0.005*"spacecraft"'),
 (1,
  '0.031*"image" + 0.012*"images" + 0.010*"file" + 0.009*"color" + 0.008*"format" + 0.008*"files" + 0.007*"\'\'" + 0.007*"jpeg" + 0.007*"\'s" + 0.006*"program"'),
 (2,
  '0.013*"\'\'" + 0.010*"1993" + 0.009*"university" + 0.008*"information" + 0.007*"research" + 0.006*"new" + 0.005*"1992" + 0.005*"may" + 0.005*"number" + 0.005*"april"'),
 (3,
  '0.766*"--" + 0.006*"\'\'" + 0.004*"file" + 0.003*"``" + 0.001*"-+" + 0.001*"linux" + 0.001*"..." + 0.001*"**" + 0.001*"banks" + 0.001*"plotting"'),
 (4,
  '0.035*"..." + 0.009*"car" + 0.009*"new" + 0.008*"price" + 0.007*"power" + 0.006*"used" + 0.006*"sale" + 0.006*"one" + 0.005*"bike" + 0.005*"db"'),
 (5,
  '0.029*"\'\'" + 0.026*"``" + 0.010*"people" + 0.006*"israel" + 0.005*"right" + 0.005*"jews" + 0.005*"rights" + 0.005*"government" + 0.005*"state" + 0.004*"would"'),


In [117]:
lda.show_topic(6, topn=30)

[('55.0', 0.011677968),
 ('0.333', 0.006533844),
 ('***', 0.006266878),
 ('undefined', 0.00486619),
 ('doug', 0.0047875685),
 ('canada', 0.004366962),
 ('0.500', 0.004243184),
 ('explosive', 0.0042404816),
 ('sweden', 0.0041691735),
 ('symbol', 0.0040060445),
 ('usa', 0.0039615445),
 ('germany', 0.003913333),
 ('finals', 0.0034833371),
 ('dr', 0.0033989511),
 ('finland', 0.0033640002),
 ('__', 0.0033343998),
 ('norway', 0.0032179386),
 ('oak', 0.0031698304),
 ('pit-6', 0.0029380147),
 ('../.././lib/xmu/libxmu.a', 0.0029265028),
 ('wires', 0.0028735097),
 ('lw', 0.00287048),
 ('=====', 0.0027887423),
 ('morgan', 0.0027773825),
 ('0.273', 0.0027521474),
 ('apr', 0.0027402071),
 ('april', 0.0026895397),
 ('0.200', 0.0026613637),
 ('0.167', 0.002615972),
 ('stat', 0.0026157189)]

### Similarities

In [118]:
from gensim import similarities

index = similarities.MatrixSimilarity(corpus_lsi)

  if np.issubdtype(vec.dtype, np.int):


In [119]:
doc = docs[0]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[tfidf[vec_bow]]
vec_lda = lda[vec_bow]
print(vec_lsi)

[(0, 0.012294209326910215), (1, 0.08788160490868245), (2, -0.01077936796964885), (3, 0.08264324854113354), (4, -0.08521772068146756), (5, -0.03345844047204606), (6, 0.053834666936840504), (7, -0.06268512609714466), (8, -0.055009839315937246), (9, -0.025090565749556138)]


In [120]:
sims = index[vec_lsi] # ищет похожие вектора
print(list(enumerate(sims)))

[(0, 1.0), (1, -0.0286199), (2, 0.26934102), (3, -0.101161495), (4, 0.037116453), (5, 0.5679223), (6, 0.3188602), (7, 0.9566994), (8, 0.8881528), (9, 0.023072377), (10, 0.1722173), (11, -0.003392011), (12, 0.40444642), (13, 0.1500312), (14, -0.003639062), (15, 0.17351604), (16, 0.00012557954), (17, 0.23524696), (18, 0.18189618), (19, 0.5263263), (20, 0.6139276), (21, 0.17436595), (22, 0.40416464), (23, 0.7514841), (24, 0.6830578), (25, 0.45408908), (26, 0.5208161), (27, 0.021018922), (28, 0.07099686), (29, 0.22004646), (30, 0.5151261), (31, 0.21970949), (32, 0.010913655), (33, 0.8440834), (34, 0.5738544), (35, 0.17765981), (36, 0.38165605), (37, 0.098305255), (38, 0.48940593), (39, 0.43779647), (40, 0.66334033), (41, 0.26625776), (42, 0.61184007), (43, -0.05215902), (44, 0.0798256), (45, 0.06362276), (46, 0.1111197), (47, 0.61415786), (48, 0.3046473), (49, -0.08550966), (50, 0.26826575), (51, 0.595222), (52, 0.003801167), (53, -0.20201646), (54, 0.5807935), (55, 0.6323057), (56, 0.2830

### Опциональное задание

- Подобрать параметры, чтобы получить более интерпретируемую картинку
- Избавиться от мусора в токенах (можно с помощью регулярных выражений)

## Word2Vec

Будем обучать word2vec из пакета gensim на корпусе opencorpora.<br/>
Этот корпус небольшой, поэтому выдающихся результатов ждать не стоит, но зато обучение будет происходить быстро.

##### [OpenCorpora](https://github.com/kmike/opencorpora-tools)

In [1]:
import opencorpora

### Откроем корпус

In [2]:
corpus = opencorpora.load('/home/xenakas/Desktop/Git/annot.opcorpora.xml')

In [3]:
corpus = opencorpora.CorpusReader('/home/xenakas/Desktop/Git/annot.opcorpora.xml')

In [10]:
docs = corpus.catalog()

In [8]:
docs

[]

In [5]:
corpus.catalog()

[]

In [9]:
for id, name in docs[100:110]:
    print(id, name)

In [153]:
corpus.parsed_sents(105)

KeyError: '105'

### Посмотрим на контексты слов

In [154]:
from nltk.text import Text

all_tokens = []
for id, name in docs:
    all_tokens.extend(corpus.words(id))

textCorpus = Text(all_tokens)

In [155]:
textCorpus.concordance('король')

no matches


Среди токенов есть пунктуация, которая несет мало контекстной информации.

Так как текстов немного, матрицу term-context можно сделать более плотно заполненной, если использовать нормальзованную форму слов.<br/>
В корпусе эта информация уже есть, но если бы не было мы бы могли воспользоваться pymorphy.

In [None]:
for id, name in docs[103:104]:
    sentences = corpus.parsed_sents(id)
    for sentence in sentences:
        words = []
        for word_info in sentence:
            word = word_info[0]
            word_norm = word_info[1][0][0]
            word_tag = word_info[1][0][1]
            if word_tag != 'PNCT':
                words.append(word_norm)
            #print word, word_norm, word_tag
        print(' '.join(words))

### Обучим word2vec

In [None]:
import codecs, string
import pymorphy2


with codecs.open('opencorpora_for_word2vec.txt', 'w', encoding='utf-8') as f:
    morph = pymorphy2.MorphAnalyzer()
    for id, name in docs:
        sentences = corpus.parsed_sents(id)
        for sentence in sentences:
            #sentence = [w.lower() for w in sentence if not w in string.punctuation]
            #sentence = [morph.parse(w)[0].normal_form for w in sentence if not w in string.punctuation]
            words = []
            for word_info in sentence:
                word = word_info[0]
                word_norm = word_info[1][0][0]
                word_tag = word_info[1][0][1]
                if word_tag != 'PNCT':
                    words.append(word_norm)
            f.write(' '.join(words))
            f.write('\n')


In [None]:
# Посмотрим, что записалось в файл
!head -10 opencorpora_for_word2vec.txt

In [None]:
from gensim.models.word2vec import LineSentence, Word2Vec

sentences = LineSentence('opencorpora_for_word2vec.txt')

In [None]:
import gensim

model = Word2Vec(sentences, size=300, window=5, min_count=5, workers=4, iter=20)
model.init_sims(replace=True)

In [None]:
for w, sim in model.most_similar(positive=['Google']):
    print(w, sim)

In [None]:
for w, sim in model.most_similar(positive=['оператор']):
    print(w, sim)

In [None]:
for w, sim in model.most_similar(positive=['мальчик', 'женщина'], negative=['мужчина']):
    print(w, sim)

In [None]:
for w, sim in model.most_similar(positive=['ходить']):
    print(w, sim)

In [None]:
for w, sim in model.most_similar(positive=[u'брат', u'жена'], negative=[u'муж']):
    print(w, sim)

In [None]:
print(model.doesnt_match("книга журнал машина".split()))

In [None]:
model.similarity('книга', 'телефон')

In [None]:
model['книга']