### Learning Objectives:

At the end of the experiment, you will be able to:

* understand word2vec in action.

In this experiment, we will use the **Mahabharata** as our text corpus.

### Setup Steps

In [None]:
#@title Run this cell to complete the setup for this Notebook
from IPython import get_ipython
import warnings

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings("ignore")

ipython = get_ipython()

notebook = "Demo_Mahabharata"  # name of the notebook

# Download the corpus and the image, then install gensim.
# run_line_magic() replaces the deprecated InteractiveShell.magic() helper;
# the %sx magic runs the shell command and captures its output.
ipython.run_line_magic("sx", "wget https://www.dropbox.com/s/9likrxgsri97lho/MB.txt")
ipython.run_line_magic("sx", "wget https://www.dropbox.com/s/9b82ivoh37b1uz9/word2vec.png")
ipython.run_line_magic("sx", "pip3 install gensim")
print("Setup completed successfully")

In [None]:
# List the contents of the current working directory
# (e.g. to check that the files fetched during setup are there)
!ls

### Import required packages

In [None]:
# Importing nltk package
import nltk

# Downloading the WordNet corpus from NLTK
# NOTE(review): the stemmer created later is nltk.PorterStemmer, which presumably
# does not need WordNet - confirm whether this download is still required
nltk.download('wordnet')

# Python library for Vector space modeling and topic modeling
import gensim

# Regular Expression
import re

# Basic Python Packages
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### Pre-Processing and Develop Word Embeddings

* Load the Mahabharata corpus
* Perform stemming and remove the stop words
* Generate word embeddings

In [None]:
# Fetch NLTK's stop-word corpus and build the set of English stop words
# that is used to filter tokens during pre-processing
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

# Porter stemmer from NLTK, used to reduce each word to its stem
stemmer = nltk.PorterStemmer()

In [None]:
# Show what the stemmer produces for a few sample words
for sample_word in ("policy", "police", "authorized", "using"):
    print(stemmer.stem(sample_word))

In [None]:
# Show the English stop words that are filtered out of the corpus below
print(stopWords)

In [None]:
# One list of stemmed tokens per line of the corpus
MB_words = []

# Runs of lowercase ASCII letters delimited by word boundaries
# (same matches as the earlier '(\b[a-z][a-z]*\b)' pattern, without the redundant group)
word_pattern = re.compile(r'\b[a-z]+\b')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding; undecodable bytes are replaced instead of aborting the load
with open("MB.txt", 'r', encoding="utf-8", errors="replace") as corpus_file:

    # Iterate over the file lazily instead of loading every line into memory first
    for line in corpus_file:
        # Lowercase the line and extract its alphabetic words
        words = word_pattern.findall(line.lower())

        # Drop the stop words, then stem whatever is left
        words = [stemmer.stem(word) for word in words if word not in stopWords]

        # Lines without any usable word still contribute an (empty) token list
        MB_words.append(words)

Get the vocabulary and vectors using gensim package. 

**min_count**: words that appear fewer times than this count are ignored.

In [None]:
# Train the gensim model on the MB_words
# min_count=120: words that appear fewer than 120 times in the corpus are ignored
model = gensim.models.Word2Vec(MB_words, min_count=120) 

In [None]:
# Vocabulary kept by the trained model (word -> index mapping)
vocab_index = model.wv.key_to_index
print("Total number of words in the trained model: ", len(vocab_index))
print(vocab_index)

In [None]:
# The embedding matrix holds one row per word and one column per embedding dimension
word_count, embedding_dim = model.wv.vectors.shape
print("Dimensionality of word embeddings: ", embedding_dim)
print("Number of words: ", word_count)

In [None]:
# Display the embedding vector the model holds for the word "krishna"
print(model.wv["krishna"])

### Construct the word and vector lists by iterating through the vocabulary of the trained word2vec model.

In [None]:
# Vocabulary words, in the order the model stores them
vocab_words = list(model.wv.key_to_index)

# Build both arrays from the same word sequence so that words_list[i] always
# labels vector_list[i]. The earlier version appended the word before looking up
# its vector inside a bare `except: pass`, so a failed lookup would have silently
# left the two arrays misaligned; any lookup error now surfaces immediately.
words_list = np.array(vocab_words)
vector_list = np.array([model.wv[word] for word in vocab_words])

print(words_list.shape)

### Visualization and Plotting the reduced Word2Vec representation

* As the vectors in `vector_list` are high-dimensional, reduce them to 2D using PCA so that they can be plotted

In [None]:
# Check the shape of the vector_list before reducing its dimensions
# Check the shape of the vector_list before reducing its dimensions
# (one row per vocabulary word, one column per embedding dimension)
print("Shape of the vectors_list before reducing the dimensions: ", vector_list.shape)

* Applying PCA to reduce the dimensions of the vectors

In [None]:
# Create a 2-dimensional PCA model of the word vectors using the scikit-learn PCA class
from sklearn.decomposition import PCA

# n_components is the number of dimensions kept after the projection.
# random_state pins the randomized SVD solver that PCA may select for larger
# inputs, so re-running this cell on the same vectors gives the same 2-D layout.
pca = PCA(n_components=2, random_state=42)

# Fit the PCA model on the word vectors and project them onto the 2 components
reduced_vector = pca.fit_transform(vector_list)

In [None]:
# Check the shape of the reduced_vector after reducing its dimensions
# Check the shape of the reduced_vector after reducing its dimensions
# (PCA was configured with n_components=2, so 2 columns are expected)
print("Shape of the vectors_list after reducing the dimensions to 2D: ", reduced_vector.shape)

* Visualize the reduced Word2Vec representation

In [None]:
# Split the 2-D projection into its two coordinate columns
x = reduced_vector[:, 0]
y = reduced_vector[:, 1]

fig, ax = plt.subplots(figsize=(28, 20))

# Draw every point with a single scatter call instead of one call per word
ax.scatter(x, y, color='green')

# Label each point with the word it represents
for word, x_pos, y_pos in zip(words_list, x, y):
    ax.annotate(word, xy=(x_pos, y_pos))

# Title and axis labels so the figure can be read on its own
ax.set_title("Word2Vec embeddings projected to 2D with PCA")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
plt.show()

### Choose a few characters from the Mahabharata and find the characters most similar to them

* Find the location of the chosen characters in `words_list`


 

In [None]:
MB_characters = ['krishna', 'arjuna', 'pandu', 'bhima', 'sakuni', 'duryodhana', 'bhishma', 'kunti', 'karna', 'madri', 'nakula', 'sahadeva', 'draupadi']

# A name is absent from words_list when it was filtered out of the vocabulary
# (for example by min_count). Check up front so the failure names the culprits
# instead of surfacing as an opaque "index 0 is out of bounds" error.
known_words = set(words_list.tolist())
missing_characters = [name for name in MB_characters if name not in known_words]
if missing_characters:
    raise IndexError(
        "Characters not found in the model vocabulary "
        f"(check the spelling or lower min_count): {missing_characters}"
    )

# Get the location of MB_characters from the words_list
# (converted to plain ints so the printed list stays readable)
locs = [int(np.where(words_list == name)[0][0]) for name in MB_characters]

print("The location of the selected characters \n", locs)

* Visualization of the chosen characters in the Mahabharata

In [None]:
# Project each chosen character's embedding with the already-fitted PCA
# and mark it on a 2-D plane

fig, ax = plt.subplots(figsize=(14,6))
for character, pos in zip(MB_characters, locs):
    # Embedding of the current character as a single-row 2-D array,
    # because the PCA transform works on 2-D input
    char_matrix = model.wv[character].reshape(1, -1)
    # Row 0 of the transformed output is this character's 2-D coordinate
    value = pca.transform(char_matrix)[0]
    ax.plot(value[0], value[1],  "r*")
    # Offset the label slightly so it does not sit on top of the marker
    ax.annotate(words_list[pos], xy=value, xytext=value+0.01)

plt.show()
# Save the figure to disk
fig.savefig('word2vec.png')

* Find the top-10 most similar words for each of the selected characters

In [None]:
# Number of nearest neighbours to list for every character
top_n = 10

# For each character keep only the neighbouring words, discarding the similarity scores
names = [
    [neighbour for neighbour, _score in model.wv.most_similar(character, topn=top_n)]
    for character in MB_characters
]

# Column headers Similarity_1 ... Similarity_10, one per neighbour rank
similarity_columns = [f'Similarity_{rank}' for rank in range(1, top_n + 1)]

pd.DataFrame(names, columns=similarity_columns, index=MB_characters)