### Load Gensim Library

In [None]:
!pip install gensim --quiet

In [None]:
import gensim

In [None]:
import logging
# Emit INFO-level log records with a timestamp and level, so that progress messages
# sent through the logging module are shown in the notebook
# (presumably including gensim's training progress — confirm).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [None]:
# Only needed on Google Colab when the data has been uploaded to Google Drive:
# mounts the Drive at /gdrive so the file path used below can be read.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
import pandas as pd

# Change the file path to point to where you have stored the zip file.
# The file is tab-separated with a header row; quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the reviews are kept as literal text.
df = pd.read_csv(
    '/gdrive/My Drive/AI-ML/unlabeledTrainData.tsv.zip',
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
# df.shape is a (rows, columns) tuple; the number of examples is the row count.
print('Number of examples in Dataset: ', len(df))
df.head()

In [None]:
# Show the raw text of the review at row label 0, before any cleaning.
df.loc[0, 'review']

### Function to Clean up data

In [None]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization.

  Removes URLs, replaces every character that is not an ASCII letter with a
  space, lower-cases the text and collapses runs of whitespace.

  Args:
    string: Raw text. Any non-str value (for example a missing value read
      as NaN) is treated as empty text.

  Returns:
    The cleaned words joined by single spaces, or "" when nothing is left
    or the input is not a string.
  """
  # Explicit type check instead of a bare `except:`, which would also hide
  # unrelated errors (including KeyboardInterrupt).
  if not isinstance(string, str):
    return ""
  # Drop URLs first; otherwise the letter filter below would turn them into
  # tokens such as "http", "www" and "com".
  string = re.sub(r'https?://\S+', ' ', string)
  # Keep ASCII letters only; digits and punctuation become separators.
  string = re.sub(r"[^A-Za-z]", " ", string)
  # split() with no argument already discards empty tokens.
  return " ".join(string.lower().split())

### Clean the Data using routine above

In [None]:
# Run every raw review through clean_str and store the result in a new column.
df['clean_review'] = df['review'].map(clean_str)
df.head()

### Convert Review to a Word List

In [None]:
# One token list per cleaned review.
# split() with no argument (rather than split(' ')) yields [] for an empty
# review instead of [''], so no empty-string "word" is passed on to the model.
documents = [doc.split() for doc in df['clean_review']]

In [None]:
# Number of tokenised reviews (one entry was added per value of df['clean_review']).
print(len(documents))

In [None]:
# Token list built from the first cleaned review.
print(documents[0])

In [None]:
# Raw text of the review at row label 0, for comparison with documents[0].
df.loc[0, 'review']

### Build the Model

In [None]:
# Train word embeddings on the tokenised reviews.
model = gensim.models.Word2Vec(
    sentences=documents,  # one list of tokens per review
    vector_size=50,       # dimensionality of each word vector
    window=5,             # maximum distance between the current and the predicted word
    min_count=10,         # ignore words whose total frequency is lower than this
    workers=4,            # number of worker threads used for training
    epochs=10,            # number of passes over the corpus
)

In [None]:
#documents[0]

# Exploring the model

### How many words in the model

In [None]:
# Shape of the embedding matrix; expected to be (vocabulary size, vector_size) — confirm against the output.
model.wv.vectors.shape

In [None]:
# Vocabulary of the model.
# key_to_index maps each word to its index; echoing the whole dict floods the
# cell output, so only the first 20 (word, index) pairs are shown.
list(model.wv.key_to_index.items())[:20]

### Get an embedding for a word

In [None]:
# Look up the learned embedding vector for the word 'great'.
model.wv['great']

### Finding Words which have similar meaning

In [None]:
# The 15 words whose vectors are most similar to 'great'.
model.wv.most_similar('great', topn=15)

In [None]:
# The 15 words whose vectors are most similar to 'man'.
model.wv.most_similar('man', topn=15)

### Find the word which is not like others

In [None]:
# Ask the model which of the listed words fits least well with the others.
model.wv.doesnt_match(["man", "woman", "kitchen", "child"])

### Saving the model

In [None]:
# Persist the trained model under a relative path (resolved against the current working directory).
model.save('word2vec-movie-50')

In [None]:
# List the working directory to confirm the model file was written.
!ls -l

In [None]:
# Load the saved model back from disk
model = gensim.models.Word2Vec.load('word2vec-movie-50')

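1. Word-analogy equation: king + man = queen + ? (i.e. ? ≈ king + man − queen)
2. In this case there may not be enough data for this equation to give a reliable answer. The cell below applies the same form to `actor` / `actress`.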

In [None]:
# Vector arithmetic: words most similar to actor + man - actress.
model.wv.most_similar(positive=['actor','man'], negative=['actress'])

#### Loading a Pre-trained model

In [None]:
import gensim.downloader as api

In [None]:
# Load the pre-trained "fasttext-wiki-news-subwords-300" vectors through gensim's downloader.
# NOTE: this rebinds `model`, replacing the Word2Vec model trained on the reviews.
# The loaded object is presumably a KeyedVectors instance (the later cells read
# .vectors and .key_to_index on it directly) — confirm.
model = api.load("fasttext-wiki-news-subwords-300")

In [None]:
# Shape of the pre-trained embedding matrix; expected to be (number of words, embedding dimension) — confirm.
model.vectors.shape

In [None]:
# key_to_index maps every word of the pre-trained vocabulary to its index.
# Echoing the whole dict of a large pre-trained model floods the cell output,
# so only the first 20 (word, index) pairs are shown.
list(model.key_to_index.items())[:20]

In [None]:
# `model` now holds the pre-trained vectors returned by api.load(), which are
# queried directly (like model.vectors and model.key_to_index), not through
# a `.wv` attribute; `model.wv` raises AttributeError here.
model.most_similar('great', topn=15)