# Word2Vec Unigram Testing

This Python notebook evaluates the Word2Vec unigram model. It is organised into the following sections:

- Find the words most similar to a selected word
- Perform syntactic analysis
- Perform semantic analysis
- Find the word that does not belong in a list of words
- Find the cosine similarity between two words
- Find the frequency count of a word
- Check whether a word is in the model
- Show the feature vector of a given word
- Visualise words in vector space using t-SNE
- Plot a histogram of the distribution of words across clusters

In [None]:
from gensim.models import Word2Vec as w2v

In [None]:
# Load Unigram model
# FILE is a hard-coded absolute Windows path; adjust it to the local machine.
# binary=True reads the vectors from the binary word2vec format.
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format instead - confirm against the installed version.
FILE = "C:/Users/MyPC/Desktop/Vegito/W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)

In [None]:
# Cell to find the words most similar to a query word (topn=20 asks for the top 20)
# Unigram model: query a single word, e.g. dragon, bleach, tottenham
# Bigram model: join the two words with an underscore, e.g. dragon_ball, barack_obama
# (this requires the bigram model to be loaded instead of the unigram one)
model.most_similar("neuropsychopharmacology", topn=20)

In [None]:
# Cell for semantic evaluation via vector arithmetic
# (Ex. king - man + woman is approximately equal to queen)
# The query below computes tokyo - japan + malaysia
model.most_similar(positive=["tokyo","malaysia"], negative=["japan"])

In [None]:
# Cell for syntactic evaluation via vector arithmetic
# (Ex. walking - walk + swim is approximately equal to swimming)
# The query below computes greenish - green + blue
model.most_similar(positive=["greenish","blue"], negative=["green"])

In [None]:
# Cell to find the word that does not belong in a group of words
# The space-separated string is split into a list of words before the call
model.doesnt_match("blue green yellow apple".split())

In [None]:
# Cell to check the similarity between two words
model.similarity("squats","legpress")

In [None]:
# Count the number of times a specific word occurred in the 2015 dataset
# model.vocab maps each word to its vocabulary entry; `count` holds the frequency.
# Display the count itself (not its type) as the cell output.
word = model.vocab['difu']
word.count

In [None]:
# Check whether a word (unigram) is in the model; the cell displays the result.
# The lookup is case-sensitive, so 'Dragon' and 'dragon' are different entries.
'Dragon' in model

In [None]:
# What does each word actually contain?
# Indexing the model with a word displays that word's feature vector
# (presumably 300-dimensional, going by the model file name - confirm).
model['goku']

In [None]:
# Visualisation (Normal) using TSNE and PCA
# Motivation: http://lvdmaaten.github.io/tsne/
# Motivation: https://golog.co/blog/article/Visualising_high-dimensional_datasets_using_PCA_and_tSNE
# Video: https://www.youtube.com/watch?v=RJVL80Gg3lA

# Firstly: Import the libraries
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import mpld3

# Use seaborn's plain "white" style for the plots in this notebook
sns.set_style("white")

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Create function to return list of words and word embeddings
import random
import pickle

def getEmbeddings(cluster_file, N, word):
    """Return the vectors and labels of words sharing a cluster with ``word``.

    Loads the pickled cluster dictionary and word-to-cluster map for the
    ``cluster_file``-cluster run, finds the cluster containing ``word`` and
    returns a leading slice of that cluster's word list together with the
    matching vectors from the notebook-level ``model``.

    Args:
        cluster_file: Number of clusters; used to build the pickle file names
            (``dict_<n>C.pk`` and ``full_<n>C.pk``).
        N: Minimum cut-off index. The slice keeps the first
            ``max(index of word, N) + 10`` words of the cluster, so the
            searched word is always included.
        word: The word to search for.

    Returns:
        A ``(vectors, words)`` tuple of two parallel lists: ``model[w]`` for
        every kept word, and the kept words themselves.

    Raises:
        Lookup errors propagate unchanged when ``word`` is missing from the
        cluster map, from its cluster's word list, or from ``model``.
    """
    # Specify path
    FILE_DICT = "C:/Users/MyPC/Desktop/Vegito/Word Dictionaries/dict_" + str(cluster_file) + "C.pk"
    FILE_CLUS = "C:/Users/MyPC/Desktop/Vegito/K-Means Models/full_" + str(cluster_file) + "C.pk"

    # Load the files using pickle; the context managers close the handles
    # even if unpickling fails.
    # NOTE: pickle can execute arbitrary code on load - only use trusted files.
    with open(FILE_DICT, "rb") as dict_file:
        array_dict_cluster = pickle.load(dict_file)
    with open(FILE_CLUS, "rb") as clus_file:
        word_centroid_map = pickle.load(clus_file)

    # Find the cluster of the word, then load all words of that cluster
    cluster_num = word_centroid_map[word]
    words_list = array_dict_cluster[cluster_num]['word_list']

    # Get index number of searched word
    index_num = words_list.index(word)
    print("INDEX NUMBER: %i" % (index_num))

    # Memory limitations: keep only a leading slice of the cluster, at least
    # N + 10 words and always reaching 10 words past the searched word.
    cutoff = max(index_num, N) + 10
    words = list(words_list[:cutoff])

    # Look up the embedding of every kept word (distinct loop name so the
    # `word` parameter is not shadowed)
    vectors = [model[current_word] for current_word in words]

    return vectors, words

In [None]:
# Display the graph in this cell
import time

# Get the feature vectors and respective words
search_word = 'dickbag'.lower()
get_words = 1200

wv, vocabulary = getEmbeddings(500, get_words, search_word)

print('SEARCHED WORD: %s' % (search_word))
print("TOTAL WORDS: %i " % (len(vocabulary)))

# Initialize PCA model
# PCA cannot keep more components than there are samples or features, so
# clamp the target of 150 for small clusters instead of raising an error.
n_components = min(150, len(wv), len(wv[0]))
pca = PCA(n_components=n_components)

# Reduce dimensionality with PCA first; the result is the input to t-SNE
start = time.time()
pca_result = pca.fit_transform(wv)
end = time.time()

print("TIME TAKEN (PCA): ", end-start)

# Get explained variance ratio (summed over all kept components)
explain_ratio = np.sum(pca.explained_variance_ratio_)
print('EXPLAINED VARIANCED RATIO: ', explain_ratio)

# Initialize TSNE model (2-D output, fixed seed for reproducible layouts)
tsne = TSNE(n_components=2, random_state=0)

# Fit with TSNE
start = time.time()
Y = tsne.fit_transform(pca_result)
end = time.time()

print("TIME TAKEN (TSNE): ", end - start)

# Scatter points on a figure without axis ticks
fig, ax = plt.subplots(figsize=(10, 8),subplot_kw={'xticks': [], 'yticks': []})

# Use Scatterplot
ax.scatter(Y[:, 0], Y[:, 1], color="blue")

# Label every point with its word
for label, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):

    # Highlight the searched word in large red text;
    # all other words are labelled in small black text
    color = 'black'
    fontsize = 10

    if label == search_word:
        color = 'red'
        fontsize = 20

    ax.annotate(label, xy=(x, y), fontsize=fontsize, color=color)

# Display (last expression of the cell, so the notebook renders it)
mpld3.display(fig)

In [None]:
# Function to plot the histogram of word distribution

def plotHistogram(file_cluster):
    """Plot a histogram of how many words each cluster contains.

    Loads the pickled cluster dictionary ``dict_<file_cluster>C.pk``, counts
    the entries in every cluster's ``'word_list'`` and draws the distribution
    of those counts with seaborn. The figure is drawn on the current
    matplotlib axes; showing it is left to the caller.

    Args:
        file_cluster: Number of clusters; used to build the pickle file name
            and the plot title.

    Returns:
        None.
    """
    FILE_DICT = "C:/Users/MyPC/Desktop/Vegito/Word Dictionaries/dict_" + str(file_cluster) + "C.pk"

    # The context manager closes the file handle even if unpickling fails.
    # NOTE: pickle can execute arbitrary code on load - only use trusted files.
    with open(FILE_DICT, "rb") as dict_file:
        array_dict_cluster = pickle.load(dict_file)

    # Total number of words in each cluster
    word_length = [len(cluster['word_list']) for cluster in array_dict_cluster]

    # Plot Histogram
    PADDING = 15

    sns.set(rc={"figure.figsize": (6,6)})
    sns.set_style("white")
    sns.set_style("ticks")
    sns.set_context("notebook", font_scale=1)

    # kde=False draws the raw counts only, without a density curve
    ax = sns.distplot(word_length, kde=False, color='purple')

    ax.grid(False)
    ax.set(title='Words Distribution in '+ str(file_cluster) + ' Clusters')

    plt.xlabel("Total Words", labelpad=PADDING)
    plt.ylabel("Total Clusters", labelpad=PADDING)

In [None]:
# Call the histogram function to plot one histogram per cluster configuration

# NOTE(review): presumably switches off mpld3's notebook rendering so the
# figures below are drawn by plain matplotlib - confirm
mpld3.disable_notebook()
# Array to store number of clusters; each value is passed to plotHistogram,
# which builds the name of the pickle file to load from it
clust_array = [250,500]

# Go one by one
for clusters in clust_array:
    
    # Call histogram function, strip the top/right spines, then render the figure
    plotHistogram(clusters)
    sns.despine()
    plt.show()