# 词向量（Word2Vec）

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the complete 20 Newsgroups corpus (subset='all').
news = fetch_20newsgroups(subset='all')

# X holds the raw post texts, y the matching target labels.
X = news.data
y = news.target

from bs4 import BeautifulSoup

In [2]:
import nltk, re


def news_to_sentences(news):
    """Split one raw news post into sentences of lowercase alphabetic tokens.

    Markup is removed with BeautifulSoup (lxml parser), the remaining text is
    cut into sentences by the English Punkt tokenizer loaded from NLTK's data
    files, and each sentence is lowercased, stripped, has every character
    outside a-z/A-Z replaced by a space, and is finally split on whitespace.

    Args:
        news: Raw text of a single post.

    Returns:
        A list with one entry per detected sentence; each entry is a list of
        word strings. A sentence containing no letters yields an empty list.
    """
    plain_text = BeautifulSoup(news, "lxml").get_text()
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')

    return [
        re.sub('[^a-zA-Z]', ' ', raw.lower().strip()).split()
        for raw in punkt.tokenize(plain_text)
    ]

In [3]:
# Flatten the per-post sentence lists into one corpus-wide list of token lists.
sentences = []

for x in X:
    sentences.extend(news_to_sentences(x))

In [4]:
# Total number of tokenized sentences collected from all posts.
len(sentences)

303560

In [5]:
from gensim.models import word2vec

# Hyperparameters for Word2Vec training
num_features = 300  # Word vector dimensionality
min_word_count = 20  # Minimum word count (passed as min_count)
num_workers = 2  # Number of threads to run in parallel
context = 5  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words

# Initialize and train the model (this will take some time).
# NOTE(review): `size=` and `init_sims` are the gensim < 4.0 spellings; gensim 4
# renamed `size` to `vector_size` and deprecated `init_sims` -- confirm the
# installed gensim version before upgrading.
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

In [6]:
# Look up the words whose vectors are most similar to 'morning';
# the result is a list of (word, similarity score) pairs.
model.wv.most_similar('morning')

[('afternoon', 0.8302181959152222),
 ('weekend', 0.7642990350723267),
 ('evening', 0.7547295093536377),
 ('saturday', 0.7247110605239868),
 ('night', 0.7060050964355469),
 ('friday', 0.6802293658256531),
 ('sunday', 0.6414958238601685),
 ('newspaper', 0.6384304761886597),
 ('summer', 0.6323332786560059),
 ('thursday', 0.6183165311813354)]

In [7]:
# Look up the words whose vectors are most similar to 'email';
# the result is a list of (word, similarity score) pairs.
model.wv.most_similar('email')

[('mail', 0.7392966747283936),
 ('contact', 0.6957465410232544),
 ('address', 0.6555604934692383),
 ('replies', 0.6515315771102905),
 ('mailed', 0.6399132013320923),
 ('request', 0.6332578659057617),
 ('send', 0.6203031539916992),
 ('listserv', 0.6189593076705933),
 ('sas', 0.616413950920105),
 ('compuserve', 0.590954065322876)]