**Purpose**: Fit the tokenized sentences to a word2vec model, then use t-SNE to reduce the word vectors to 2D so they can be plotted in the next step.

In [1]:
import os
import time
import pickle
import multiprocessing
import gensim.models.word2vec as w2v
import sklearn.manifold
from sklearn import decomposition
import numpy as np
#from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib
%matplotlib notebook
import matplotlib.pylab as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns

In [2]:
# Directory that holds the pickled corpus and receives the artifacts written by
# this notebook. Set the NEWS_NLP_DIR environment variable to run on another
# machine; the original author's location remains the default.
base_path = os.environ.get('NEWS_NLP_DIR', 'C:/Users/sunny/Desktop/news_nlp/')

# Publisher to process; the name prefixes every input/output file name.
#publisher='立場報道'
publisher='星島日報'

# Some cells open files by bare name, so they rely on this working directory.
os.chdir(base_path)

In [3]:
# Load the tokenized sentences (presumably written by an earlier preprocessing
# step -- confirm against the producing notebook).
# The path is anchored to base_path, like the t-SNE result path further down,
# instead of depending on the current working directory: os.path.join with a
# single argument simply returns that argument unchanged.
sent_pkl = os.path.join(base_path, publisher + '_sent.pkl')

# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(sent_pkl, "rb") as f:   # Unpickling
    sent = pickle.load(f)

In [4]:
# Add up the length of every item in `sent` to get the corpus size in tokens.
token_count = sum(map(len, sent))
print(f'Token count: {token_count}')

Token count: 4832477


In [5]:
# word2vec documentation: https://radimrehurek.com/gensim/models/word2vec.html
# Define the hyperparameters, then build the vocabulary, train and save the model.

start = time.perf_counter()

# Dimensionality of the resulting word vectors.
# More dimensions cost more to train but can give more general vectors.
num_features = 300

# Minimum word count threshold (passed to Word2Vec as min_count).
min_word_count = 10

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 10

# Downsample setting for frequent words.
# (Original note: useful rates lie between 0 and 1e-5.)
downsampling = 1e-3

# Number of passes over the corpus during training.
num_epochs = 10

# Seed for the RNG.
# NOTE: the seed alone does not make training reproducible. Per the gensim
# documentation a fully deterministic run also needs workers=1 and a fixed
# PYTHONHASHSEED; with num_workers > 1 the vectors can differ between runs.
seed = 1

# Model (no corpus is passed here, so vocabulary building and training are
# done explicitly below).
article_word2vec = w2v.Word2Vec(
    sg=1,  # Training algorithm: 1 for skip-gram; otherwise CBOW.
    seed=seed,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)

# Build the vocabulary from the corpus.
article_word2vec.build_vocab(corpus_iterable=sent)
# Gensim 4: len(model.wv) replaces len(model.wv.vocab); see
# https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
print("Word2Vec vocabulary length:", len(article_word2vec.wv))

# Train the model.
article_word2vec.train(
    corpus_iterable=sent,
    total_examples=article_word2vec.corpus_count,
    epochs=num_epochs,
)

# Save the model (relative path, so it lands in the current working directory).
article_word2vec.save(publisher + '_word2vec.w2v')

finish = time.perf_counter()
# Message text is kept verbatim so it matches previously recorded output.
print(f'Finished fitting data to model in {round(finish-start, 2)} secound(s)')

Word2Vec vocabulary length: 26890
Finished fitting data to model in 110.32 secound(s)


In [8]:
# Reload the saved model; the relative file name resolves against the current
# working directory.
model_file = publisher + '_word2vec.w2v'
model = w2v.Word2Vec.load(model_file)

In [9]:
# Reduce the word2vec vectors to 2D with t-SNE so they can be plotted in the next step.

def reduce_dimensions(model):
    """Project the model's word vectors to 2D with t-SNE.

    Parameters
    ----------
    model
        Object exposing ``wv.vectors`` (the vectors to project) and
        ``wv.index_to_key`` (the tokens, assumed to be in matching order).

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``token``, ``x`` and ``y``.
    """
    n_components = 2  # dimensionality of the projection (2D)

    # Pull the tokens and their vectors out of the model as numpy arrays.
    embeddings = np.asarray(model.wv.vectors)
    tokens = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # t-SNE with a fixed random_state, so the estimator is seeded the same
    # way on every call.
    projector = TSNE(n_components=n_components, random_state=0)
    coords = projector.fit_transform(embeddings)

    # One (token, x, y) record per projected vector.
    records = [
        (token, point[0], point[1])
        for token, point in zip(tokens, coords)
    ]
    return pd.DataFrame(records, columns=['token', 'x', 'y'])

In [10]:
# Where the 2D projection is stored.
tnse_result = os.path.join(base_path, publisher + '_2d_tsne.pkl')

# Time the t-SNE reduction (the recorded run took about seven minutes).
start = time.perf_counter()
df = reduce_dimensions(model)
finish = time.perf_counter()
elapsed = round(finish-start, 2)
print(f'Finished dimension reduction in {elapsed} secound(s)')

# Pickle the resulting DataFrame to disk.
with open(tnse_result, "wb") as out_file:
    pickle.dump(df, out_file)

Finished dimension reduction in 420.75 secound(s)


In [11]:
# Read the pickled 2D projection back from disk (presumably so a later session
# can start here without repeating the t-SNE step).
tnse_result = os.path.join(base_path, publisher + '_2d_tsne.pkl')
with open(tnse_result, "rb") as in_file:
    df = pickle.load(in_file)

In [12]:
# Preview the first eight rows of the projection.
df.head(n=8)

Unnamed: 0,token,x,y
0,，,2.282656,-3.063334
1,,3.583279,-3.037843
2,的,1.290142,-3.62856
3,「,-5.782082,10.856745
4,在,3.714983,-1.637922
5,、,1.937181,-4.611476
6,」,-5.773089,10.871822
7,及,1.829747,-4.265761
