In [10]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
import numpy as np

# pip install gensim nltk
# Tutorial from NLTK: https://www.nltk.org/data.html
# train the word2vec model that learns the word meaning in the bubble tea reviews

In [3]:
# nltk.download()  # not used: called with no arguments, this never finished here
#                  # (presumably it starts NLTK's interactive downloader -- TODO confirm)
# Download only the two named tokenizer resources instead.
# NOTE(review): these look like the data word_tokenize relies on -- confirm against the NLTK docs.
nltk.download('punkt') 
nltk.download('punkt_tab')
# punkt explains: https://www.askpython.com/python-modules/nltk-punkt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Read the review export once. `df` keeps every column of the file;
# `reviews` is just the review-text column that gets tokenized next.
REVIEWS_CSV = "dataset/All Reviews.csv"
df = pd.read_csv(REVIEWS_CSV)
reviews = df["Reviews"]


In [21]:
def preprocess(text):
    """Lower-case and tokenize one review, keeping only alphabetic tokens.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            ``None``, or the float NaN pandas produces for an empty CSV cell)
            is treated as an empty review.

    Returns:
        list[str]: The lower-cased tokens for which ``str.isalpha()`` is true,
        so numbers, punctuation and mixed tokens such as ``"2nd"`` are
        dropped. An empty list is returned for non-string input.
    """
    # A missing review has no .lower(); return no tokens instead of raising so
    # the tokenized output stays aligned one-to-one with the input rows.
    if not isinstance(text, str):
        return []
    return [token for token in word_tokenize(text.lower()) if token.isalpha()]

# Tokenize every review; the result keeps the same order as `reviews`.
tokenized_reviews = [preprocess(review) for review in reviews]

# tokenized_reviews
# Example structure (tokens are lower-cased by preprocess):
# [['babo', 'tea'], ['i', 'love', 'this'], ...]

    

Word2Vec

In [41]:
# Train Word2Vec on the tokenized reviews.
#   vector_size=100 -> every word is embedded as a 100-dimensional vector
#   window=5        -> context window size passed to gensim
#   min_count=2     -> min_count threshold passed to gensim for vocabulary building
#   seed=1          -> gensim's default seed, written out so it is visible
#   workers=1       -> single training thread: gensim documents that multiple
#                      workers introduce thread-ordering jitter, so the vectors
#                      would differ from run to run
# NOTE(review): gensim also mentions PYTHONHASHSEED for reproducibility across
# interpreter launches -- confirm whether that applies to the installed version.
w2v_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,
    window=5,
    min_count=2,
    seed=1,
    workers=1,
)

In [15]:
# w2v_model.wv.key_to_index
# {'the': 0, 'and': 1, 'i': 2,...}

In [None]:
# tutorial: https://radimrehurek.com/gensim/models/word2vec.html

In [42]:
def get_review_vector(tokens, model):
    """Turn one tokenized review into a single vector by averaging word vectors.

    Args:
        tokens: Iterable of word strings for one review.
        model: Trained model exposing ``wv`` (with ``key_to_index`` and lookup
            by word) and ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of the tokens found in
        the model vocabulary, or a zero vector of length ``model.vector_size``
        when none of the tokens are in the vocabulary.
    """
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index

    # Words outside the model vocabulary have no embedding, so skip them.
    known_vectors = [keyed_vectors[word] for word in tokens if word in vocabulary]

    # Nothing recognised in this review: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

In [43]:
# One averaged vector per review, in the same order as `tokenized_reviews`.
review_vectors = [get_review_vector(tokens, w2v_model) for tokens in tokenized_reviews]

In [44]:
# Convert the list of per-review vectors into one NumPy array and show its shape.
review_vectors = np.asarray(review_vectors)
print(review_vectors.shape)
# Recorded run: 607 reviews, each represented as a 100-D vector.

(607, 100)


Normalize ratings & reshape dimensions: divide the 1-5 ratings by 5 so they fall within the 0-1 range (0.2-1.0)

In [45]:
MAX_RATING = 5  # divisor used to scale the ratings

ratings = df["Rating"]
normalized_ratings = np.array(ratings) / MAX_RATING

# Add a trailing axis so the ratings form a single column,
# (607, 1) in the recorded run.
reshaped_ratings = normalized_ratings[:, np.newaxis]
reshaped_ratings.shape

(607, 1)

Combine normalized ratings with review vectors

In [48]:
# Append the rating column to the right of the embedding columns, so each row
# is [review vector..., normalized rating].
final_review_vectors = np.concatenate([review_vectors, reshaped_ratings], axis=1)
final_review_vectors.shape

(607, 101)