In [1]:
import os 
import pathlib

In [2]:
# Notebooks have no __file__, so the project root is taken to be the kernel's
# current working directory; every dataset/model path below is joined onto it.
BASE_PATH = str(pathlib.Path.cwd())
print('根目录: ', BASE_PATH)

根目录:  /root/share/HCLG/project2


## 使用gensim进行词向量训练

In [None]:
# Persist the segmented corpus to dataset/content_cut.txt, the file the
# training cell later reads back through LineSentence.
# NOTE(review): CONTENT_CUT is not defined in any visible cell; it presumably
# comes from an earlier segmentation step -- confirm it survives Restart & Run All.
with open(os.path.join(BASE_PATH, 'dataset', 'content_cut.txt'), 'w', encoding='utf-8') as f:
    # f.write() returns the number of characters written. Do not rebind
    # CONTENT_CUT to it, otherwise the corpus is lost and re-running this cell
    # would overwrite the file with that integer.
    f.write(str(CONTENT_CUT))

In [5]:
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

### 参数说明：

* size - 词向量维度（gensim 4.0 起该参数更名为 vector_size）
* workers - 并行训练使用的线程数
* min_count - 忽略词频小于该值的词

### LineSentence

逐行读取文件：每一行视为一个句子，行内的词需已用空格分隔。

In [None]:
# `cores` was never defined in a visible cell, so a fresh kernel failed with a
# NameError here. Derive it from the machine; os.cpu_count() may return None,
# hence the fallback to a single worker.
cores = os.cpu_count() or 1

# Train 256-dimensional word vectors on the segmented corpus, dropping words
# that occur fewer than 200 times.
model = Word2Vec(
    LineSentence(os.path.join(BASE_PATH, 'dataset', 'content_cut.txt')),
    size=256,
    workers=cores,
    min_count=200,
)

In [None]:
## Save the model
# Export the trained word vectors in word2vec format twice: once as text
# (binary=False) and once as binary (binary=True).
model_dir = os.path.join(BASE_PATH, 'model')
# Create the output directory up front so a missing folder cannot make the
# export fail after the training run has already completed.
os.makedirs(model_dir, exist_ok=True)
model.wv.save_word2vec_format(os.path.join(model_dir, 'news_zhwiki_embedding_256.model'), binary=False)
model.wv.save_word2vec_format(os.path.join(model_dir, 'news_zhwiki_embedding_256.bin'), binary=True)

## 模型测试

In [22]:
# Look up the trained embedding for the word "苹果" and show it as the cell output.
vector_pingguo = model.wv["苹果"]
vector_pingguo

array([ 0.7308604 ,  0.06678491,  1.2588391 ,  0.16481721, -0.63448566,
        0.43856406, -0.0822095 , -0.674922  ,  1.0805345 , -1.9112058 ,
       -0.07400806,  1.3579844 , -2.1278512 , -1.293433  ,  0.3704333 ,
       -0.85180044,  0.96438485,  1.3644834 , -1.247701  ,  0.30079755,
       -0.27265286,  0.23435226, -0.81974405,  0.44991964,  1.0575411 ,
       -0.27984273, -0.6317437 , -0.9596026 ,  0.5738414 , -0.5049172 ,
       -2.062391  ,  0.21115862,  0.10938394,  0.6258525 , -0.39978066,
        0.8156311 ,  0.271734  ,  2.3465235 ,  1.0984751 ,  0.7869562 ,
        0.23546292, -0.14878681, -0.16011983, -1.8387194 ,  1.2124068 ,
        0.5921118 , -0.00590605, -0.27176684,  0.82037085, -2.0929854 ,
       -1.778485  ,  1.415078  , -0.86132956, -1.5505913 ,  0.09460568,
       -0.7897975 , -2.3605618 , -0.31535026, -0.04194674, -2.7027864 ,
       -0.04226318,  0.1827592 , -0.29446995,  1.7024041 ,  0.09945752,
       -1.776401  ,  0.4678805 , -1.6353947 ,  0.17412174, -1.82

In [23]:
# Most similar words to "苹果".
# Query through model.wv: calling most_similar directly on the model is
# deprecated and emits a warning asking for wv.most_similar() instead.
model.wv.most_similar('苹果')

  


[('苹果公司', 0.6948990821838379),
 ('黑莓', 0.6625807881355286),
 ('微软', 0.6201909780502319),
 ('苹果电脑', 0.5971246957778931),
 ('Google', 0.5907295942306519),
 ('iPhone', 0.5902291536331177),
 ('小米', 0.5899524688720703),
 ('手机', 0.5792343616485596),
 ('iPod', 0.5784551501274109),
 ('Apple', 0.5772913694381714)]

In [31]:
# Most similar words to "说". Uses model.wv because most_similar on the model
# itself is deprecated (see the warning this cell used to print).
model.wv.most_similar('说')


Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).



[('说道', 0.746940016746521),
 ('却说', 0.7088226079940796),
 ('问道', 0.6997306942939758),
 ('问', 0.680905818939209),
 ('反问', 0.6686925292015076),
 ('告诉', 0.6685142517089844),
 ('指出', 0.6652007102966309),
 ('写道', 0.6560171842575073),
 ('答道', 0.6529855728149414),
 ('声称', 0.6508760452270508)]

In [25]:
# Find the word that does not belong with the others in the list.
model.wv.doesnt_match(['空气', '水', '地球', '汽车'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'汽车'

## 可视化

In [26]:
%matplotlib inline

In [27]:
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE
import numpy as np 

In [28]:
def reduce_dimensions(model, num_dimensions=2, max_words=500):
    """Project a sample of the model's word vectors to a low-dimensional space with t-SNE.

    Args:
        model: Trained model exposing ``model.wv.vocab`` (iterated to obtain
            words) and ``model.wv[word]`` (the vector for a word).
        num_dimensions: Number of output dimensions requested from t-SNE.
        max_words: Maximum number of vocabulary entries to embed, taken in the
            vocabulary's iteration order.

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: the first and second t-SNE
        coordinates of each embedded word as lists, and the matching words as
        a numpy array. Only the first two coordinates are returned even when
        ``num_dimensions`` is larger than 2.
    """
    vectors = []  # positions in the original vector space
    labels = []   # the word for each vector, kept in the same order

    for i, word in enumerate(model.wv.vocab):
        # Check the cap before appending: the previous `i == 500` test ran
        # after the append and therefore let 501 words through.
        if i >= max_words:
            break
        vectors.append(model.wv[word])
        labels.append(word)

    # Convert both lists to numpy arrays for the reduction step.
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)

    # Fixed random_state so repeated runs produce the same layout.
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels
# Compute the 2-D coordinates and labels once; the plotting cell below reuses them.
x_vals, y_vals, labels = reduce_dimensions(model)

In [29]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Render the points as a plotly text scatter, one label per point.

    Args:
        x_vals: X coordinates of the points.
        y_vals: Y coordinates of the points.
        labels: Text drawn at each point.
        plot_in_notebook: When true, draw inline via ``iplot``; otherwise
            write the figure to 'word-embedding-plot.html' via ``plot``.
    """
    # plotly is imported lazily so it is only needed when this backend is used.
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Draw a static scatter plot and annotate a random subset of the points.

    Args:
        x_vals: X coordinates of the points.
        y_vals: Y coordinates of the points.
        labels: Label for each point, indexed in parallel with the coordinates.
        num_labels: Maximum number of points to annotate. Capped at the number
            of available labels.
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points are annotated on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Label a random subsample of the data points. random.sample raises
    # ValueError when asked for more items than exist, so cap the sample size
    # for inputs with fewer than `num_labels` points.
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, min(num_labels, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Choose the plotting backend: plotly when get_ipython() can be called
# (i.e. the code runs under IPython/Jupyter), matplotlib otherwise.
try:
    get_ipython()
except Exception:
    # get_ipython is unavailable or failed: fall back to the matplotlib plot.
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

In [30]:
# Draw the reduced word vectors with whichever backend was selected above.
plot_function(x_vals, y_vals, labels)