# Word2Vec Unigram Testing

This Python notebook evaluates the Word2Vec unigram model. It is organised into the following sections:

- Find the words most similar to a selected word
- Perform syntactic analysis
- Perform semantic analysis
- Find the word that does not belong in a list of words
- Find the cosine similarity between two words
- Find the frequency count of a word
- Check whether a word is in the model
- Preview a list of words in the model
- Others (vector space size)

In [1]:
from gensim.models import Word2Vec as w2v



In [2]:
# Load the unigram model (word vectors stored in the binary word2vec format)
FILE = "C:/Users/MyPC/Desktop/FYP/W2V Models/w2v_reddit_unigram_300d.bin"

# gensim 1.0 moved load_word2vec_format from Word2Vec to KeyedVectors, and
# later releases reject the Word2Vec spelling. Prefer KeyedVectors and only
# fall back to Word2Vec on gensim versions that predate it.
try:
    from gensim.models import KeyedVectors as _vector_loader
except ImportError:
    _vector_loader = w2v

model = _vector_loader.load_word2vec_format(FILE, binary=True)

In [3]:
# Find the words most similar to a query word (topn sets how many are returned)
# Unigram model: query a single word, e.g. dragon, bleach, tottenham
# Bigram model: join the two words with an underscore, e.g. dragon_ball, barack_obama
# (the bigram model must be loaded instead of the unigram one)
model.most_similar("neuropsychopharmacology", topn=20)

[('biopsychology', 0.740115225315094),
 ('astrochemistry', 0.7391058206558228),
 ('neuroendocrinologist', 0.7296165227890015),
 ('nanoscience', 0.7265405058860779),
 ('neuropharmacology', 0.7247588038444519),
 ('saltzberg', 0.7157706618309021),
 ('ethnomusicology', 0.7156946659088135),
 ('psychobiology', 0.7154250144958496),
 ('nueroscience', 0.7147186994552612),
 ('neuropsychiatry', 0.7140935659408569),
 ('ichthyology', 0.710540235042572),
 ('molbio', 0.7056220769882202),
 ('oenology', 0.7056138515472412),
 ('antropology', 0.7041956186294556),
 ('biopsych', 0.7037904858589172),
 ('neuroengineering', 0.7037561535835266),
 ('nanoengineering', 0.7024978995323181),
 ('psycholinguistics', 0.7002543210983276),
 ('bioanthropology', 0.6995773315429688),
 ('christmann', 0.698868989944458)]

In [4]:
# Semantic analogy evaluation (e.g. king - man + woman is approximately equal to queen)
# Words in `positive` pull the result towards them and words in `negative`
# push it away, so this query reads: tokyo - japan + malaysia
model.most_similar(positive=["tokyo","malaysia"], negative=["japan"])

[('lumpur', 0.6737101674079895),
 ('kuala', 0.6668090224266052),
 ('taipei', 0.6401477456092834),
 ('bangkok', 0.6113026142120361),
 ('penang', 0.5809809565544128),
 ('lampur', 0.5752942562103271),
 ('toyko', 0.5550657510757446),
 ('selangor', 0.5511509776115417),
 ('singapore', 0.5502724647521973),
 ('mumbai', 0.5481346249580383)]

In [5]:
# Syntactic analogy evaluation (e.g. walking - walk + swim is approximately equal to swimming)
# This query reads: greenish - green + blue
model.most_similar(positive=["greenish","blue"], negative=["green"])

[('blueish', 0.7298511266708374),
 ('greyish', 0.7232707738876343),
 ('bluish', 0.7149738669395447),
 ('pinkish', 0.705883264541626),
 ('purplish', 0.7028074264526367),
 ('brownish', 0.6946163773536682),
 ('grayish', 0.6922476887702942),
 ('reddish', 0.6911346316337585),
 ('yellowish', 0.6770833134651184),
 ('whitish', 0.6669460535049438)]

In [6]:
# Find the word that does not belong with the others in a list of words
model.doesnt_match("blue green yellow apple".split())

'apple'

In [7]:
# Similarity score between two words (cosine similarity, per the notebook outline)
model.similarity("titanic","rose")

0.24046405589195533

In [8]:
# Look up the vocabulary entry for a word; its `count` attribute is meant to
# hold how many times the word occurred in the 2015 dataset.
# NOTE(review): the last line displays the *type* of the count (int), not the
# count itself - evaluate `word.count` to see the number.
# NOTE(review): the model is loaded with load_word2vec_format and no vocabulary
# file, so gensim may have filled `count` with a placeholder rather than the
# real corpus frequency - confirm before relying on it.
word = model.vocab['difu']
type(word.count)

int

In [9]:
# Check whether a word (unigram) is in the model.
# The lookup is case-sensitive, so 'Dragon' and 'dragon' are different keys.
'Dragon' in model

False

In [10]:
# A brief preview of words in the model: print the position, the word and its
# stored count for the first `count` vocabulary entries.
from itertools import islice

count = 70

# islice yields exactly `count` entries (indices 0 .. count - 1). Breaking on
# `index == count` after the print would show count + 1 words instead.
for index, word in enumerate(islice(model.vocab, count)):
    print(index, word, model.vocab[word].count)

0 cockweasel 708272
1 frakken 449808
2 smirfs 428731
3 wiine 853778
4 rellying 753562
5 medsbuy 753745
6 kirah 642904
7 loltumblr 1027250
8 confutation 86362
9 desarmado 49277
10 whicy 268221
11 weekending 733123
12 mactaggart 193721
13 caprisun 1003736
14 studierte 202535
15 dourtmund 137290
16 namors 626101
17 gingered 700697
18 urzak 346864
19 alck 366608
20 hronis 322693
21 depraved 1120032
22 perichondrium 157468
23 tumakbo 799760
24 ximum 550855
25 bantam 1087697
26 fullon 866146
27 cachete 724355
28 nakakaiyak 347117
29 stoneberry 443653
30 parallelisms 855916
31 fueling 1122857
32 eliminiation 405126
33 budzilla 354658
34 divestitures 729040
35 badmannered 475022
36 celticsblog 720527
37 windtalkers 933967
38 fururama 700825
39 inevara 118029
40 ertragswert 64683
41 giffingtool 112915
42 dingens 93462
43 sommerdamm 674304
44 dickade 55306
45 conceration 23629
46 ujarnya 188641
47 iphy 359343
48 receptiveness 1019798
49 microcube 754417
50 proudmanlet 159038
51 sysinstall 78281


In [11]:
# Looking under the hood of Word2vec
# Use this cell as tips for K-Means Clustering
# Motivation: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors

# Feature vector for each word(s) 
#print(model.syn0[0])

# Shape of the vocabulary
#print(model.syn0.shape)

# Get list of all keys (words)
#model.index2word[2000:3000]

In [68]:
# Preliminary K-Means test: cluster the first WORDS word vectors into
# CLUSTERS clusters (adjust WORDS to try other sizes, e.g. 1000, 2000, 4000)
from sklearn.cluster import KMeans
import time

# Specify the number of words and clusters
WORDS = 5000
CLUSTERS = 250

# Get the word vectors and the words themselves. The two are zipped together
# later, so they are assumed to be index-aligned.
word_vectors = model.syn0[:WORDS]
words = model.index2word[:WORDS]

# Initialize K-Means
k_means = KMeans( n_clusters = CLUSTERS )

# Fit the model, get the cluster number of every word and time the fit.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
idx = k_means.fit_predict(word_vectors)
end = time.perf_counter()

print("TIME TAKEN: ", end-start)

TIME TAKEN:  19.578624963760376


In [69]:
# Create a word -> cluster number dictionary.
# `words` and `idx` are paired position by position, so each clustered word is
# mapped to the cluster number K-Means predicted for it.

word_centroid_map = dict(zip( words, idx ))

# Sanity check: display the type of the mapping
type(word_centroid_map)

dict

In [70]:
# Print the words assigned to each of the first N clusters (0 .. N - 1)
from collections import defaultdict

N = 100

# Group the words by cluster number in a single pass over the map instead of
# rescanning every word once per cluster. The cluster numbers are converted to
# plain ints so they can be looked up with the values produced by range().
cluster_members = defaultdict(list)
for word, cluster_num in word_centroid_map.items():
    cluster_members[int(cluster_num)].append(word)

for cluster in range(N):
    # Deliberately not named `words`: that global holds the vocabulary slice
    # word_centroid_map was built from and must not be overwritten here.
    members = cluster_members.get(cluster, [])

    print("CLUSTER NUMBER: %i" % (cluster))
    print("WORDS: ", members)
    print("\n\n")
        

CLUSTER NUMBER: 0
WORDS:  ['references', 'commentary', 'romantic', 'sarcasm', 'vibe', 'comedy', 'dialogue', 'tone', 'jokes', 'puns', 'joke', 'humor', 'audience', 'entertainment', 'fashion', 'romance']



CLUSTER NUMBER: 1
WORDS:  ['repeatedly', 'proven', 'said', 'confirmed', 'publicly', 'committed', 'cheated', 'lied', 'claimed', 'admitted', 'admit']



CLUSTER NUMBER: 2
WORDS:  ['group', 'army', 'leadership', 'join', 'commander', 'leader', 'crew', 'alliance', 'captain', 'chief', 'squad', 'navy', 'clan', 'gang', 'soldier', 'members', 'member', 'leaders', 'allies', 'joining', 'troops', 'council', 'soldiers']



CLUSTER NUMBER: 3
WORDS:  ['obvious', 'generic', 'practical', 'common', 'universal', 'simple', 'easy', 'genuine', 'plain', 'complex', 'detailed', 'handy', 'pure', 'nicely', 'basic', 'quick', 'clear', 'broad', 'subtle']



CLUSTER NUMBER: 4
WORDS:  ['giving', 'being', 'making', 'becoming', 'returning', 'starting', 'picking', 'getting', 'ing', 'providing', 'taking', 'doing', 'bringi

In [67]:
# Find the cluster number assigned to a specific word.
# word_centroid_map is a plain dict, so a word that was not part of the
# clustered vocabulary slice raises KeyError.
word_centroid_map['motherfucker']

662