In [15]:
reviews =["I love this series. I bought 1 and 2 on DVD",
         "The Lion King 1 1/2 is a very cute story to go",
         "Now this is the sort of film we used to get"]

In [16]:
reviews

['I love this series. I bought 1 and 2 on DVD',
 'The Lion King 1 1/2 is a very cute story to go',
 'Now this is the sort of film we used to get']

In [17]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()

# For Stemming
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', str(reviews[i]))
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
corpus

['love seri bought dvd', 'lion king cute stori go', 'sort film use get']

In [5]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500) #Max feature means top features in the corpus.
X_bow = cv.fit_transform(corpus).toarray()

In [12]:
X_bow # Why 13 values in each sentence because in BOW words the unique words after stopwords in placed at the column level as the corpus has 13 words

array([[1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1]])

In [9]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X_tfidf = tv.fit_transform(corpus).toarray()

In [13]:
X_tfidf # Why 13 values in each sentence because in BOW words the unique words after stopwords in placed at the column level as the corpus has 13 words

array([[0.5      , 0.       , 0.5      , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.5      , 0.5      , 0.       , 0.       ,
        0.       ],
       [0.       , 0.4472136, 0.       , 0.       , 0.       , 0.4472136,
        0.4472136, 0.4472136, 0.       , 0.       , 0.       , 0.4472136,
        0.       ],
       [0.       , 0.       , 0.       , 0.5      , 0.5      , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.5      , 0.       ,
        0.5      ]])

In [18]:
# Cretaing the Continious Bag of Words
import gensim.downloader as api
import gensim
wv = api.load('word2vec-google-news-300') # google news data has 300 dimension
model=gensim.models.Word2Vec(corpus,window=5,min_count=2,vector_size=10)



In [19]:
model.corpus_count

3

In [20]:
# Convert each document into a fixed-size vector
X_cbow = [
    sum(model.wv[word] for word in doc if word in model.wv) / len(doc) if len(doc) > 0 else [0.0] * 100
    for doc in corpus
]

In [None]:
X_cbow

[array([-0.00294894,  0.00111289,  0.00503267,  0.00527408,  0.00091203,
        -0.0025593 ,  0.01455606,  0.02898431, -0.02579695,  0.00405259],
       dtype=float32),
 array([-0.01151155,  0.01597452,  0.02001062,  0.01496016, -0.00323439,
        -0.02527533,  0.02799498,  0.01465593, -0.03285246, -0.01870072],
       dtype=float32),
 array([-0.00673193,  0.02054774,  0.00682282,  0.00149946, -0.00512805,
        -0.01443213,  0.01214427,  0.02612625, -0.02498194,  0.00209347],
       dtype=float32)]

In [21]:
# Creating the Avgword2vec
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [22]:
from tqdm import tqdm
import numpy as np
#apply for the entire sentences
X_avg=[]
for i in tqdm(range(len(corpus))):
    X_avg.append(avg_word2vec(corpus[i]))

100%|██████████| 3/3 [00:00<00:00, 1785.06it/s]


In [23]:
X_avg

[array([-0.00327661,  0.00123655,  0.00559185,  0.00586009,  0.00101337,
        -0.00284367,  0.0161734 ,  0.03220479, -0.02866328,  0.00450287],
       dtype=float32),
 array([-0.01260788,  0.0174959 ,  0.0219164 ,  0.01638494, -0.00354243,
        -0.02768251,  0.03066117,  0.01605173, -0.03598127, -0.02048174],
       dtype=float32),
 array([-0.00762952,  0.02328744,  0.00773253,  0.00169938, -0.00581179,
        -0.01635641,  0.0137635 ,  0.02960975, -0.02831287,  0.0023726 ],
       dtype=float32)]