In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
import numpy as np
import torch

# pip install gensim nltk
# Tutorial from NLTK: https://www.nltk.org/data.html
# train the word2vec model that learns the word meaning in the bubble tea reviews

In [4]:
# nltk.download() # this one runs endless
nltk.download('punkt') 
nltk.download('punkt_tab')
# punkt explains: https://www.askpython.com/python-modules/nltk-punkt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
df = pd.read_csv("dataset/All Reviews.csv")
reviews = df["Reviews"]

### Word2Vec
- tutorial: https://radimrehurek.com/gensim/models/word2vec.html

In [7]:
def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha()] # if word is alphabet letters

tokenized_reviews = []
# tokenize each review
for review in reviews:
    tokenized_reviews.append(preprocess(review))

# tokenized_reviews
# [['babo', 'tea'],['I','love', 'this']... ]    

In [8]:
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, workers=4)

In [9]:
# This func transfers review tokens into vectors
def get_review_vector(tokens, model):
    
    # only keep the word in W2V model's vocabulary
    recognized_word_in_review= []
    word_vector = []

    for w in tokens:
        if w in model.wv.key_to_index: # {'the': 0, 'and': 1, 'i': 2,...}
            recognized_word_in_review.append(w)

    # if a review contains no model vocabuary, set its vector 0
    if len(recognized_word_in_review) == 0:
        return np.zeros(model.vector_size)

    # if has value, get the word vector and calculate the avg
    for w in recognized_word_in_review:
        word_vector.append(model.wv[w])

    review_vector = np.mean(word_vector, axis = 0)
    return review_vector

In [10]:
review_vectors = []  
for tokens in tokenized_reviews:
    review_vector = get_review_vector(tokens, w2v_model)
    review_vectors.append(review_vector)

In [11]:
review_vectors = np.array(review_vectors) #convert it into a np array
print(review_vectors.shape)
# 607 reviews, each review is represented as 100-D vector

(607, 100)


Normalize rating & Reshape dimentions: convert 1-5 scale into 0~1

In [16]:
ratings = df["Rating"]
print(ratings)
normalized_ratings = np.array(ratings) / 5

# reshape normalized ratings into (607, 1) 
reshaped_ratings = normalized_ratings.reshape(-1,1)
reshaped_ratings.shape

0      5
1      5
2      5
3      5
4      5
      ..
602    5
603    5
604    4
605    4
606    5
Name: Rating, Length: 607, dtype: int64


(607, 1)

Combine normalized ratings with review vectors

In [13]:
w2v_rating_combined_vectors = np.hstack((review_vectors, reshaped_ratings))
w2v_rating_combined_vectors.shape

(607, 101)

### TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=300, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(reviews)
tfidf_vectors = tfidf_matrix.toarray()

tfidf_rating_combined_vectors = np.hstack((review_vectors, reshaped_ratings))
tfidf_rating_combined_vectors.shape


(607, 101)

### Llama 2-7b-chat-hf
- meta download: https://www.llama.com/llama-downloads/
- Huggingface: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
- [Github tutorial](https://github.com/meta-llama/llama-models?fbclid=IwZXh0bgNhZW0CMTAAAR5flYgamnz7bViaAtMQsPvPWGc7jCd69MsAmom7zGFl6Mb9ckvqDYBeACJirg_aem_vjalf1yaPdUasJh9lNKd_g)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
llama_model.config.output_hidden_states = True
# model.eval()

Generate vectors by using Transformers

In [8]:
def get_llama_review_vector(review, tokenizer, model):
    
    input = tokenizer(review, return_tensors='pt', truncation=True)

    with torch.no_grad():
        outputs = model(**input)

    hidden_states = outputs.hidden_states[-1]  #get hidden states from the last layer

    vector = hidden_states.mean(dim=1).squeeze().numpy()  #average all tokens

    return vector


In [None]:
all_llama_vectors = []

for review in reviews:
    vector = get_review_vector(review, llama_tokenizer, llama_model)
    all_llama_vectors.append(vector)


### Falcon-7b
- hf: https://huggingface.co/tiiuae/falcon-7b

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
falcon_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
falcon_model.config.output_hidden_states = True

In [None]:
all_falcon_vectors = []
# only use Indianapolis data - 81 entries

for review in reviews:
    vector = get_review_vector(review, falcon_tokenizer, falcon_model)
    all_falcon_vectors.append(vector)


### K-means Clustering