# Word2Vec Unigram Testing

This Python notebook evaluates the Word2Vec unigram model. It is organised into the following sections:

- Find most similar words from the selected word
- Perform Syntactic Analysis
- Perform Semantic Analysis
- Find the word that does not belong in a list of words
- Find the cosine similarity between two words
- Find the frequency count of a word
- Check if a word is in the model
- Feature vectors of a certain word
- Visualisation of words in Vector Space using TSNE

In [1]:
from gensim.models import Word2Vec as w2v



In [2]:
# Load the pre-trained unigram Word2Vec vectors from disk.
# FILE is a machine-specific absolute path; point it at wherever the .bin
# file lives on your machine before running this cell.
FILE = "C:/Users/MyPC/Desktop/FYP/W2V Models/w2v_reddit_unigram_300d.bin"
# binary=True tells the loader the file is in the binary word2vec format.
# NOTE(review): newer gensim releases expose this loader on KeyedVectors
# rather than on Word2Vec -- confirm against the installed gensim version.
model = w2v.load_word2vec_format(FILE, binary=True)

In [3]:
# Find the words most similar to a query word.
# Unigram model: pass a single token, e.g. dragon, bleach, tottenham.
# Bigram model: join the two tokens with an underscore, e.g. dragon_ball,
# barack_obama (this requires the bigram model to be loaded instead).
# topn=20 requests the 20 highest-scoring neighbours.
model.most_similar("neuropsychopharmacology", topn=20)

[('biopsychology', 0.740115225315094),
 ('astrochemistry', 0.7391058206558228),
 ('neuroendocrinologist', 0.7296165227890015),
 ('nanoscience', 0.7265405058860779),
 ('neuropharmacology', 0.7247588038444519),
 ('saltzberg', 0.7157706618309021),
 ('ethnomusicology', 0.7156946659088135),
 ('psychobiology', 0.7154250144958496),
 ('nueroscience', 0.7147186994552612),
 ('neuropsychiatry', 0.7140935659408569),
 ('ichthyology', 0.710540235042572),
 ('molbio', 0.7056220769882202),
 ('oenology', 0.7056138515472412),
 ('antropology', 0.7041956186294556),
 ('biopsych', 0.7037904858589172),
 ('neuroengineering', 0.7037561535835266),
 ('nanoengineering', 0.7024978995323181),
 ('psycholinguistics', 0.7002543210983276),
 ('bioanthropology', 0.6995773315429688),
 ('christmann', 0.698868989944458)]

In [4]:
# Semantic analogy test (e.g. king - man + woman is approximately queen).
# Words in `positive` are added and words in `negative` are subtracted,
# so this query asks: tokyo - japan + malaysia = ?
model.most_similar(positive=["tokyo","malaysia"], negative=["japan"])

[('lumpur', 0.6737101674079895),
 ('kuala', 0.6668090224266052),
 ('taipei', 0.6401477456092834),
 ('bangkok', 0.6113026142120361),
 ('penang', 0.5809809565544128),
 ('lampur', 0.5752942562103271),
 ('toyko', 0.5550657510757446),
 ('selangor', 0.5511509776115417),
 ('singapore', 0.5502724647521973),
 ('mumbai', 0.5481346249580383)]

In [5]:
# Syntactic analogy test (e.g. walking - walk + swim is approximately swimming).
# Words in `positive` are added and words in `negative` are subtracted,
# so this query asks: greenish - green + blue = ?
model.most_similar(positive=["greenish","blue"], negative=["green"])

[('blueish', 0.7298511266708374),
 ('greyish', 0.7232707738876343),
 ('bluish', 0.7149738669395447),
 ('pinkish', 0.705883264541626),
 ('purplish', 0.7028074264526367),
 ('brownish', 0.6946163773536682),
 ('grayish', 0.6922476887702942),
 ('reddish', 0.6911346316337585),
 ('yellowish', 0.6770833134651184),
 ('whitish', 0.6669460535049438)]

In [6]:
# Find the odd one out: report the word that does not match the rest of the group.
# .split() turns the space-separated string into a list of words.
model.doesnt_match("blue green yellow apple".split())

'apple'

In [7]:
# Similarity score between two words (a single float; see the output below).
model.similarity("titanic","rose")

0.24046405589195533

In [8]:
# Look up the vocabulary entry for a word; its `count` attribute holds the
# number of times the word occurred in the 2015 dataset.
word = model.vocab['difu']
# NOTE: this displays the *type* of the count (int), not the count itself;
# evaluate `word.count` on its own to see the actual frequency.
type(word.count)

int

In [9]:
# Check whether a word (unigram) is present in the model.
# The lookup is case-sensitive, so capitalisation must match exactly.
'Dragon' in model

False

In [19]:
# Inspect what the model stores for a word: indexing the model with a token
# returns that word's feature vector as an array (see the output below).
model['goku']

array([ 0.10375531,  0.10929207,  0.08830469,  0.01395523, -0.01517171,
        0.00501317, -0.06046121,  0.02743253,  0.02918806,  0.03573068,
       -0.01970761, -0.01276298, -0.02046086,  0.03802621,  0.00047961,
       -0.0017624 , -0.06191385, -0.05035397,  0.08013493, -0.05448005,
       -0.04873396,  0.01874197, -0.06683715, -0.07103707, -0.01683106,
        0.0146693 , -0.06659026, -0.01451557, -0.02147384,  0.00971734,
       -0.03524466, -0.08246868, -0.12978971, -0.04906649, -0.04533923,
        0.13329028,  0.0016056 ,  0.02454429,  0.07700373,  0.03213641,
       -0.03062899,  0.07740682,  0.15522739, -0.00674924, -0.05913823,
       -0.01185566, -0.0377803 , -0.03675865, -0.00894851,  0.06692838,
       -0.04187158,  0.05098331, -0.06255441,  0.03872509, -0.04266021,
        0.08426074,  0.02840355, -0.06466481, -0.04299739, -0.04460176,
        0.01159275, -0.03719337, -0.0315008 , -0.04088448, -0.0624588 ,
       -0.14078555,  0.0181028 ,  0.00725325,  0.13781574,  0.13

In [11]:
# Visualisation (Normal) using TSNE
# Motivation: http://lvdmaaten.github.io/tsne/
# Video: https://www.youtube.com/watch?v=RJVL80Gg3lA

# Firstly: Import the libraries
from sklearn.manifold import TSNE

import seaborn as sns
import matplotlib.pyplot as plt
import mpld3

# Use seaborn's "whitegrid" style for the plots in this notebook
sns.set_style("whitegrid")

# IPython magic: draw matplotlib figures inline in the notebook
%matplotlib inline
# Turn on mpld3's notebook integration for matplotlib figures
mpld3.enable_notebook()

In [32]:
# Create function to return list of words and word embeddings
import random
import pickle

def getEmbeddings(cluster_file, word):
    """Return the vectors and words of the cluster that contains *word*.

    Two pickle files, both selected by ``cluster_file``, are read: a
    word -> cluster-number map and a per-cluster dictionary whose
    ``'word_list'`` entry lists the words belonging to that cluster.
    Every listed word is then looked up in the notebook-level ``model``.

    Args:
        cluster_file: Value interpolated (via ``str``) into the pickle file
            names ``dict_<cluster_file>C.pk`` and ``full_<cluster_file>C.pk``.
        word: Word whose cluster should be returned.

    Returns:
        A ``(vectors, words)`` tuple of two parallel lists, where
        ``vectors[i]`` is ``model[words[i]]``.

    Raises:
        OSError: If either pickle file cannot be opened.
        KeyError: If ``word`` is not in the word -> cluster map (assuming
            the pickled map is a dict -- confirm against the file contents).
    """
    # Specify path
    dict_path = "C:/Users/MyPC/Desktop/FYP/Word Dictionaries/dict_" + str(cluster_file) + "C.pk"
    cluster_path = "C:/Users/MyPC/Desktop/FYP/K-Means Models/full_" + str(cluster_file) + "C.pk"

    # Load the files using pickle. The `with` blocks close each file handle
    # as soon as it has been read instead of leaving it open.
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    with open(dict_path, "rb") as dict_file:
        array_dict_cluster = pickle.load(dict_file)
    with open(cluster_path, "rb") as cluster_file_handle:
        word_centroid_map = pickle.load(cluster_file_handle)

    # Find the cluster number of the word, then load all words in that cluster
    cluster_num = word_centroid_map[word]
    words_list = array_dict_cluster[cluster_num]['word_list']

    # Build the parallel lists; a separate loop name keeps the `word`
    # parameter from being overwritten.
    words = list(words_list)
    vectors = [model[cluster_word] for cluster_word in words]

    return vectors, words

In [33]:
# Display the graph in this cell

# Get the feature vectors and respective words for the cluster containing
# 'douche'; the 500 selects the dict_500C.pk / full_500C.pk pickle files
wv, vocabulary = getEmbeddings(500, 'douche')

# Initialize TSNE model: 2 output components so the points can be drawn on
# a flat plot; random_state=0 fixes the random seed
tsne = TSNE(n_components=2, random_state=0)

# Fit with TSNE; below, column 0 of Y is used as x and column 1 as y
Y = tsne.fit_transform(wv)

# Create a 10x10 figure with a single axes
fig, ax = plt.subplots(figsize=(10, 10))

# Scatter the points with no face or edge colour, i.e. invisible markers.
# NOTE(review): presumably done so the axes limits fit the data while only
# the text labels are shown -- confirm intent.
ax.scatter(Y[:, 0], Y[:, 1], facecolors='none', edgecolors='none')

# Label every point with its word
for label, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
    ax.annotate(label, xy=(x, y), fontsize=12)

# Render the figure with mpld3
mpld3.display(fig)

[array([ 0.05145661, -0.04685003, -0.02879846,  0.05666239, -0.04753609,
        0.1114806 ,  0.04876652,  0.07109757, -0.04892872, -0.04315237,
        0.02755819,  0.05020294,  0.06086378, -0.01833415, -0.00875045,
       -0.00238665,  0.02993001,  0.099438  ,  0.04831645, -0.05579807,
        0.04440744,  0.02080141, -0.09219721, -0.06192939,  0.1180205 ,
        0.10844494,  0.04951482, -0.08195455, -0.00548681,  0.00255469,
        0.03979417,  0.03767009,  0.08018209, -0.02175106,  0.01817918,
       -0.04942589,  0.06746624, -0.0315291 , -0.09475919,  0.02801306,
       -0.05078059, -0.06073819,  0.06431818, -0.06460325,  0.04748455,
        0.05133438, -0.04332846,  0.00708653, -0.07569961, -0.01522716,
       -0.02265458, -0.06032051,  0.01874126,  0.03706908,  0.04416848,
       -0.02276376, -0.05126509, -0.06920522,  0.00936837,  0.08457718,
        0.00056303,  0.01591968, -0.00258052, -0.01365134, -0.08791272,
       -0.01390034, -0.02453973, -0.02285363,  0.04097067, -0.0