In [7]:
# importing libraries
import numpy as np
import pandas as pd
import string
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from gensim.models import word2vec

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [48]:
# importing dataset
data = pd.read_csv(r"/content/drive/MyDrive/NLP Projects/Word2Vec/friends_episodes_v2.csv")
# data.head()
data = data['Summary'][0:5]  # considering only summary of each episode for analysis
lemmatizer = WordNetLemmatizer()
corpus = []

In [49]:
# preprocessing/cleaning textual data
# using text preprocessing steps lowercasing, stopwords removal, stemming/lemmatization, removing punctuations
def remove_punctuation(sentence):  # function to remove punctuations
    return sentence.translate(str.maketrans('', '', string.punctuation))


for i, text in enumerate(data):
    line = text.lower()
    line = remove_punctuation(line)
    line = " ".join([lemmatizer.lemmatize(word) for word in line.split()])
    filtered_line = " ".join([w for w in line.split() if not w in set(stopwords.words('english'))])
    corpus.append(filtered_line)

In [50]:
# creating vectors in 3 different types usign Bag-Of-Words, TF-IDF and Hashing.
# BOW model
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
print(len(vectorizer.vocabulary_))
print(len(data))
bow_vectors = vectorizer.transform(corpus)  # it is 235*1433 sparse matrix

77
5


In [51]:
x = bow_vectors.toarray()
print(x) #here you can understand the document term matrix generated.

[[0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 2 0 0 0 0 1 0 0
  0 0 0 1 0]
 [1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 0
  0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0
  0 1 0 0 0]
 [0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 2
  0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 0 1 0 0 0 1
  1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0
  0 0 1 0 1]
 [0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1
  0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0
  1 0 0 0 0]]


In [53]:
feature_names = vectorizer.get_feature_names()

['7000',
 'al',
 'anniversary',
 'apartment',
 'barry',
 'become',
 'becomes',
 'boyfriend',
 'break',
 'butt',
 'carol',
 'chandler',
 'clean',
 'come',
 'depressed',
 'doe',
 'dont',
 'double',
 'eager',
 'engagement',
 'enjoys',
 'everyone',
 'exwife',
 'film',
 'find',
 'first',
 'fun',
 'game',
 'girl',
 'girlfriend',
 'given',
 'ha',
 'habit',
 'hockey',
 'irritated',
 'joey',
 'join',
 'land',
 'laundromat',
 'like',
 'meanwhile',
 'mind',
 'monica',
 'monicas',
 'new',
 'none',
 'obsessiveness',
 'pacinos',
 'parent',
 'phoebe',
 'plan',
 'pose',
 'pregnant',
 'pretend',
 'put',
 'rachel',
 'ratinfested',
 'realize',
 'relationship',
 'responsibility',
 'resume',
 'return',
 'ring',
 'role',
 'ross',
 'slept',
 'smoking',
 'soda',
 'spend',
 'stressed',
 'struggle',
 'take',
 'test',
 'thumb',
 'time',
 'visit',
 'washroom']

In [56]:
# TF-IDF model
tfvectorizer = TfidfVectorizer()
tfvectorizer.fit(corpus)
print(len(vectorizer.vocabulary_))
print(len(data))
tfidf_vectors = tfvectorizer.transform(corpus)  # it is same 235*1433 sparse matrix but the values are not integers anymore

77
5


In [62]:
x = tfidf_vectors.toarray()
print(x) #here it is the same document matrix but instead of frequency counts we get a certain weight for each word denoting the importance of the word.

[[0.         0.         0.         0.         0.2673289  0.
  0.21567927 0.         0.         0.         0.         0.
  0.         0.2673289  0.         0.         0.         0.
  0.         0.2673289  0.         0.         0.2673289  0.
  0.21567927 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.17903328 0.         0.         0.         0.         0.
  0.2673289  0.         0.         0.         0.2673289  0.
  0.         0.17903328 0.         0.         0.         0.
  0.         0.2673289  0.2673289  0.         0.35806656 0.
  0.         0.         0.         0.2673289  0.         0.
  0.         0.         0.         0.2673289  0.        ]
 [0.24947241 0.         0.         0.         0.         0.
  0.20127277 0.24947241 0.         0.         0.         0.1405484
  0.         0.         0.         0.24947241 0.         0.
  0.         0.         0.         

In [66]:
# Hashing model
hashvectorizer = HashingVectorizer(n_features=20) #the advantage is we can convert the vectors dimension into a fixed length rather having long sparse representation but the disadvantage is 
                                     #that we cannot convert the vectors back to words.
hash_vectors = hashvectorizer.fit_transform(corpus)

In [67]:
hash_vectors.toarray()

array([[ 0.        ,  0.        ,  0.5       , -0.25      , -0.25      ,
         0.25      ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.5       , -0.25      ,  0.        ,  0.        ,  0.25      ,
        -0.25      , -0.25      ,  0.        ,  0.        , -0.25      ],
       [ 0.        ,  0.        ,  0.2236068 ,  0.4472136 , -0.2236068 ,
         0.        ,  0.        ,  0.4472136 ,  0.        ,  0.        ,
         0.2236068 ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.2236068 , -0.4472136 ,  0.        ,  0.        ,  0.4472136 ],
       [ 0.        ,  0.        ,  0.2773501 ,  0.        ,  0.        ,
         0.        ,  0.        , -0.2773501 ,  0.        , -0.5547002 ,
         0.        , -0.2773501 , -0.2773501 ,  0.        , -0.5547002 ,
         0.        ,  0.        ,  0.        ,  0.        , -0.2773501 ],
       [ 0.        ,  0.        ,  0.26726124,  0.26726124,  0.26726124,
        -0.26726124,  0.        ,  0.26726124,  