In [1]:
!pip install gensim
!pip install nltk
!pip install scipy==1.10.1
!pip install pandas

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("https://raw.githubusercontent.com/weimenglee/NLP-demos/refs/heads/main/IMDB%20Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
import nltk

# download the Punkt Sentence Tokenizer Models from the Natural Language Toolkit (NLTK) library
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/weimenglee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/weimenglee/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Performing Word Vector Embeddings

Word2Vec is a popular technique for learning vector representations of words, developed by Tomas Mikolov and his team at Google in 2013. It converts words into dense numerical vectors that capture semantic relationships between words.

In [4]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# sample corpus
corpus = df['review']

# tokenizing the corpus (word tokenization)
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

model = Word2Vec(
    sentences = tokenized_corpus,  # List of tokenized sentences (each sentence is a list of tokens)
    vector_size = 5,               # The dimension of the word vectors (embedding size)
    window = 2,                    # The maximum distance between the current and predicted word within a sentence; A window of 2 means that the model will look at the 2 words to the left and 2 words to the right of the target word to predict surrounding words. For example, for the sentence "The cat sat on the mat" with a window of 2, the word "sat" will be predicted using "The", "cat", "on", and "the".
    min_count = 1,                 # Ignores all words with a total frequency lower than this
    sg = 1                         # Skip-gram model (if 1) or Continuous Bag of Words (CBOW) model (if 0)
)

model.save('wordembeddings')

In [5]:
tokenized_corpus  # view all the tokens

[['one',
  'of',
  'the',
  'other',
  'reviewers',
  'has',
  'mentioned',
  'that',
  'after',
  'watching',
  'just',
  '1',
  'oz',
  'episode',
  'you',
  "'ll",
  'be',
  'hooked',
  '.',
  'they',
  'are',
  'right',
  ',',
  'as',
  'this',
  'is',
  'exactly',
  'what',
  'happened',
  'with',
  'me.',
  '<',
  'br',
  '/',
  '>',
  '<',
  'br',
  '/',
  '>',
  'the',
  'first',
  'thing',
  'that',
  'struck',
  'me',
  'about',
  'oz',
  'was',
  'its',
  'brutality',
  'and',
  'unflinching',
  'scenes',
  'of',
  'violence',
  ',',
  'which',
  'set',
  'in',
  'right',
  'from',
  'the',
  'word',
  'go',
  '.',
  'trust',
  'me',
  ',',
  'this',
  'is',
  'not',
  'a',
  'show',
  'for',
  'the',
  'faint',
  'hearted',
  'or',
  'timid',
  '.',
  'this',
  'show',
  'pulls',
  'no',
  'punches',
  'with',
  'regards',
  'to',
  'drugs',
  ',',
  'sex',
  'or',
  'violence',
  '.',
  'its',
  'is',
  'hardcore',
  ',',
  'in',
  'the',
  'classic',
  'use',
  'of',
  't

In [6]:
len(model.wv.index_to_key)    # number of tokens

164024

In [7]:
model.wv.index_to_key    # list of tokens

['the',
 ',',
 '.',
 'and',
 'a',
 'of',
 'to',
 'is',
 '/',
 '>',
 '<',
 'br',
 'it',
 'in',
 'i',
 'this',
 'that',
 "'s",
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 ')',
 '(',
 'you',
 'on',
 "''",
 "n't",
 '``',
 'not',
 'are',
 'he',
 'his',
 'have',
 'be',
 'one',
 '!',
 'at',
 'all',
 'they',
 'by',
 'an',
 'who',
 'from',
 'so',
 'like',
 'there',
 'or',
 'just',
 'do',
 'her',
 'about',
 'if',
 'has',
 'out',
 '?',
 'what',
 'some',
 'good',
 'when',
 'more',
 'very',
 'she',
 'would',
 'my',
 'even',
 'no',
 'up',
 'can',
 'time',
 'which',
 'only',
 'really',
 'their',
 'see',
 'had',
 'story',
 'were',
 'we',
 'did',
 'me',
 'does',
 "'",
 '...',
 ':',
 '-',
 'than',
 'well',
 'much',
 'could',
 'been',
 'get',
 'other',
 'will',
 'people',
 'great',
 'also',
 'bad',
 'into',
 'because',
 'how',
 'most',
 'first',
 'him',
 'its',
 'then',
 'them',
 'made',
 'make',
 'way',
 'any',
 'too',
 'after',
 'movies',
 'think',
 'characters',
 '*',
 'watch',
 'chara

# Getting the Embedding Vector for a Word

In [8]:
word_vector = model.wv['world']
print("Vector representation of 'world':", word_vector)

Vector representation of 'world': [-0.49751934  1.7134776   1.1963094  -0.8027173  -0.16114073]


# Calculating the Similarity of Two Words

In [9]:
word1_vector = model.wv['great']
word2_vector = model.wv['bad']

print(word1_vector)
print(word2_vector)

[-0.5035431  1.2375115  1.1941214 -1.586677   0.968147 ]
[-0.36227345  1.4448265   1.4903257  -1.5862831   1.4975581 ]


## Method 1: Cosine Similarity - high values indicates greater similarity

In [10]:
cosine_similarity = np.dot(word1_vector, word2_vector) / \
                    (np.linalg.norm(word1_vector) * 
                     np.linalg.norm(word2_vector))
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.98547834


The cosine similarity for 'great' and 'bad' is 0.985. This high value suggests that, 
in the __context of the IMDB dataset__, these two words are deemed to be very similar, 
which may not align with their typical meanings in sentiment analysis.

Cosine similarity has the following range:
- Max value of 1 - This occurs when two vectors point in the exact same
  direction, indicating they are identical in orientation and, therefore,
  as similar as possible.
- Minimum of -1 - This occurs when two vectors point in completely
  opposite directions in the embedding space, indicating that they are
  contextually dissimilar or even oppositely related in meaning.

![image.png](attachment:68f8dfb6-c8b9-4ce9-b66d-66fd05cefbc5.png)

Cosine similarity is a specific metric used to measure similarity between two vectors. It calculates the cosine of the angle between them:

![image.png](attachment:6e6bbcd8-8834-47a4-bc74-99566a1ba238.png)

## Method 2: Euclidean Distance - lower value indicates close proximity

In [11]:
euclidean_distance = np.linalg.norm(word1_vector - word2_vector)
print("Euclidean Distance:", euclidean_distance)

Euclidean Distance: 0.65646774


Euclidean distance is a measure of the straight-line distance between two points 
in Euclidean space. In the context of vector embeddings, it quantifies how far 
apart two vectors (representing words, for example) are in a multi-dimensional space
The Euclidean distance between 'great' and 'bad' is 0.649 (the smaller the 
number, the "closer" they are to each other).
Maximum value is infinity.

In [12]:
word1_vector = model.wv['really']
word2_vector = model.wv['edutainment']

cosine_similarity = np.dot(word1_vector, word2_vector) / \
                    (np.linalg.norm(word1_vector) * 
                     np.linalg.norm(word2_vector))   

print("Cosine Similarity:", cosine_similarity)

euclidean_distance = np.linalg.norm(word1_vector - word2_vector)
print("Euclidean Distance:", euclidean_distance)

Cosine Similarity: 0.11630334
Euclidean Distance: 3.1296313


# Finding Similar Words

In [13]:
model.wv.most_similar('boring', topn=5)

[('enjoyable', 0.9986045360565186),
 ('unsatisfying', 0.9984149932861328),
 ('entertaining', 0.9976674914360046),
 ('depressing', 0.9973338842391968),
 ('horrible', 0.9972745776176453)]

The high similarity score of 0.9988 for "boring" and "enjoyable" means that these two words are highly semantically related. This might seem counterintuitive because "boring" and "enjoyable" are opposite in meaning, but the model has likely learned that the terms can often appear in similar contexts, such as in phrases where both describe the quality of an experience (e.g., "The movie was boring, but I enjoyed it").

When Word2Vec creates vectors where semantically similar words are close together, it means words with related meanings end up near each other in the mathematical space. For example:

- Words like "dog," "cat," "horse" (animals) cluster together
- Words like "run," "walk," "sprint" (movement verbs) cluster together
- Words like "happy," "sad," "angry" (emotions) cluster in the same region

# Using Gradio

In [14]:
!pip install gradio



In [15]:
def most_similar_words(word):
    words = model.wv.most_similar(word, topn=5)
    return words

import gradio as gr
gr.Interface(
    most_similar_words,
    inputs = "text",
    outputs = "text",    
).launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




# Visualizing the Word Vectors

In [None]:
!pip install matplotlib
!pip install scikit-learn

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import nltk

df = pd.read_csv("https://raw.githubusercontent.com/weimenglee/NLP-demos/refs/heads/main/IMDB%20Dataset.csv")
nltk.download('punkt')

# just use the first 2 reviews
corpus = df['review'][:2]

# tokenizing the corpus
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

# training the Word2Vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=5, window=2, min_count=1, sg=1)
print(len(model.wv.index_to_key))

# get the words from the model's vocabulary
words = list(model.wv.index_to_key)

# extract the word vectors
word_vectors = np.array([model.wv[word] for word in words])

# Step 1: Reduce dimensionality using t-SNE to 3D
tsne = TSNE(n_components=3, random_state=0)
word_vectors_3d = tsne.fit_transform(word_vectors)

# Step 2: Plot the words in 3D space
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(word_vectors_3d[:, 0], word_vectors_3d[:, 1], word_vectors_3d[:, 2])

# annotate each point with the corresponding word
for i, word in enumerate(words):
    ax.text(word_vectors_3d[i, 0], word_vectors_3d[i, 1], word_vectors_3d[i, 2], word, fontsize=9)

# set plot titles and labels
ax.set_title("3D Visualization of Word Embeddings")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.set_zlabel("t-SNE Component 3")
plt.show()

t-SNE (t-distributed Stochastic Neighbor Embedding) is a dimensionality reduction technique commonly used for visualizing high-dimensional data in a lower-dimensional space, typically two or three dimensions.