<a href="https://colab.research.google.com/github/yishanhsieh/llm-segmentation/blob/main/get_review_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Embedding Reviews
- Run on A100

In [None]:
# Hugging Face Transformers: supplies the AutoTokenizer / AutoModel* loaders used in the model sections below.
!pip install transformers

In [None]:
# Install (or upgrade to the latest) sentence-transformers package.
# NOTE(review): no `sentence_transformers` import is visible in this notebook — the
# checkpoints are loaded through `transformers` directly; confirm this install is still needed.
!pip install -U sentence-transformers

In [None]:
import transformers
import torch

In [None]:
# Transformers together with its optional `sentencepiece` extra.
# NOTE(review): presumably needed by one of the tokenizers loaded later — confirm.
!pip install transformers[sentencepiece]

In [None]:
# Upgrade the Hugging Face Hub client that provides `login` (imported in the next cell).
!pip install --upgrade huggingface_hub

In [None]:
from huggingface_hub import login

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably required to download the meta-llama checkpoint used later — confirm.
login()

In [None]:
# Pin pandas and constrain numpy to the 2.x series.
# The numpy specifier MUST be quoted: unquoted, the shell parses `<3.0.0,` as an input
# redirection and `>=2.0.0` as an output redirection (to a file named `=2.0.0`), so the
# version constraint never reaches pip.
!pip install pandas==2.2.2
!pip install "numpy<3.0.0,>=2.0.0"

In [None]:
# gensim provides Word2Vec and nltk provides word_tokenize (both imported in a later cell).
!pip install gensim nltk

In [None]:
# Force a fresh reinstall of gensim even though it was installed in the previous cell.
# NOTE(review): presumably a workaround for a binary mismatch with the numpy version chosen above — confirm.
!pip install --force-reinstall gensim

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from gensim.models import Word2Vec
import gensim

# Tutorial from NLTK: https://www.nltk.org/data.html
# train the word2vec model that learns the word meaning in the bubble tea reviews

In [None]:
# Download only the Punkt tokenizer resources instead of opening the full downloader.
# nltk.download() # with no argument this call never finished here
nltk.download('punkt')
nltk.download('punkt_tab')
# Background on punkt: https://www.askpython.com/python-modules/nltk-punkt

## Preprocess Data

In [None]:
import pandas as pd
# Load the review export; the path is relative, so the CSV must be in the working directory.
df = pd.read_csv("All Reviews.csv")

# Disabled cleaning steps, kept for reference:
#   - keep only the numeric part of the 'Rating' column
#   - drop rows whose review is the placeholder 'No review text available'
# df['Rating'] = df['Rating'].str.extract(r"(\d+)")
# for i in range(len(df)):
#     if df['Reviews'][i] == 'No review text available':
#         df = df.drop(i)

# Review texts as a Series; the tokenization and embedding cells below iterate over it.
reviews = df["Reviews"]
df.info()


In [None]:
# Plain Python list of the review texts (the MiniLM tokenizer call below is given this list).
review_list = list(reviews)

## Word2Vec
- tutorial: https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
def preprocess(text):
    """Lower-case and tokenize one review, keeping only alphabetic tokens.

    Args:
        text: Raw review text. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for empty CSV cells) are treated as
            an empty review instead of raising ``AttributeError``.

    Returns:
        list[str]: Lower-cased tokens for which ``str.isalpha()`` is true;
        an empty list when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None, which have no ``.lower()``.
    if not isinstance(text, str):
        return []
    return [token for token in word_tokenize(text.lower()) if token.isalpha()]

# Tokenize every review up front. The result is a list of token lists, e.g.
# [['babo', 'tea'], ['i', 'love', 'this'], ...]
tokenized_reviews = [preprocess(review) for review in reviews]

In [None]:
# Train Word2Vec on the tokenized reviews: 300-dimensional vectors, context window of 5,
# min_count=2 (words occurring only once are left out of the vocabulary), 4 worker threads.
# NOTE(review): no `seed` is passed, so the learned vectors presumably vary between runs — confirm
# whether reproducibility matters here.
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=300, window=5, min_count=2, workers=4)

In [None]:
# This func transfers review tokens into vectors
def get_w2v_review_vector(tokens, model):
    """Average the Word2Vec embeddings of a review's in-vocabulary tokens.

    Args:
        tokens: Iterable of word strings for one review.
        model: Object exposing ``wv.key_to_index`` (vocabulary membership),
            ``wv[word]`` (embedding lookup) and ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the embeddings of the known tokens,
        or an all-zero vector of length ``model.vector_size`` when the review
        contains no known token.
    """
    vocabulary = model.wv.key_to_index  # e.g. {'the': 0, 'and': 1, 'i': 2, ...}
    known_vectors = [model.wv[token] for token in tokens if token in vocabulary]

    # A review without any in-vocabulary word is represented by the zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

In [None]:
# One averaged Word2Vec vector per review, in the same order as `tokenized_reviews`.
w2v_review_vectors = [
    get_w2v_review_vector(tokens, w2v_model) for tokens in tokenized_reviews
]

In [None]:
w2v_review_vectors = np.array(w2v_review_vectors)  # stack the per-review vectors into one 2-D array
print(w2v_review_vectors.shape)
# Expected shape: (n_reviews, 300) — one 300-D vector per review, matching vector_size=300
# in the Word2Vec training cell (the original note reported 607 reviews).

Normalize ratings and reshape dimensions: divide the 1–5 ratings by 5 so they fall in the range 0.2–1.0, then reshape them into a single column.

In [None]:
ratings = df["Rating"]
# Cast to float and divide by 5, the top of the rating scale.
# NOTE(review): assumes the "Rating" column is already numeric — confirm.
normalized_ratings = np.array(ratings, dtype=float) / 5

# Column vector of shape (n_reviews, 1) so it can be stacked next to the embedding matrices
reshaped_ratings = normalized_ratings.reshape(-1,1)
reshaped_ratings.shape

Combine normalized ratings with review vectors

In [None]:
# Append the normalized rating as one extra feature column after the Word2Vec dimensions.
w2v_rating_combined_vectors = np.hstack((w2v_review_vectors, reshaped_ratings))
w2v_rating_combined_vectors.shape

In [None]:
# NOTE(review): same assignment as the earlier `review_list = list(reviews)` cell; redundant
# when the notebook is run top to bottom.
review_list  = list(reviews)

## Sentence BERT mini
- Usage (Sentence-Transformers)
- https://huggingface.co/sentence-transformers
- https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, counting only unmasked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding axis, so positions with value 0 contribute nothing.

    Returns:
        torch.Tensor: Per-sequence sum of the masked embeddings divided by the
        mask total (clamped to at least 1e-9 to avoid division by zero).
    """
    embeddings = model_output[0]  # first element contains all token embeddings
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    masked_sum = torch.sum(embeddings * mask, 1)
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_count

In [None]:
# Load the all-MiniLM-L6-v2 tokenizer and encoder from the Hugging Face Hub.
# NOTE: the generic name `tokenizer` belongs to MiniLM only; the later sections
# define their own `bert_tokenizer`, `llama_tokenizer` and `falcon_tokenizer`.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
bert_mini_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Move the MiniLM weights to that device; the encoded inputs are moved there as well before the forward pass.
bert_mini_model.to(device)

In [None]:
# Embed the reviews in mini-batches so the padded inputs and activations of the whole
# corpus never have to fit in device memory at once.
SBERT_BATCH_SIZE = 64

bert_mini_batches = []
for start in range(0, len(review_list), SBERT_BATCH_SIZE):
    batch = review_list[start:start + SBERT_BATCH_SIZE]

    # Tokenize sentences (padded to the longest review of this batch)
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

    # Compute token embeddings
    with torch.no_grad():
        model_output = bert_mini_model(**encoded_input)

    # Perform mask-aware mean pooling, so padding tokens do not affect the result
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings to unit L2 length and park them on the CPU
    bert_mini_batches.append(F.normalize(sentence_embeddings, p=2, dim=1).cpu())

# Shape is always (n_reviews, hidden_size). No squeeze(): it would collapse a
# single-review corpus to 1-D and break the hstack with the ratings column.
bert_mini_embeddings = torch.cat(bert_mini_batches, dim=0).numpy()

In [None]:
# Sanity check: expect one row per review.
bert_mini_embeddings.shape

In [None]:
# Append the normalized rating as one extra feature column after the MiniLM dimensions.
bert_mini_rating_combined_vectors = np.hstack((bert_mini_embeddings, reshaped_ratings))
bert_mini_rating_combined_vectors.shape

## BERT (all-mpnet-base-v2)

In [None]:
# Load the bare all-mpnet-base-v2 encoder.
# Only the encoder's hidden states are pooled below, so the masked-LM head that
# AutoModelForMaskedLM would attach is unnecessary (the import is kept in case other
# cells rely on the name). NOTE(review): the sentence-transformers checkpoint presumably
# ships no weights for that head, so it would be randomly initialised — confirm.
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
bert_model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

In [None]:
# Have the model return every layer's hidden states; get_bert_vector reads the last entry.
bert_model.config.output_hidden_states = True

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Move the mpnet weights to that device.
bert_model.to(device)

In [None]:
def get_bert_vector(review, tokenizer, model):
    """Embed one review as the unweighted mean of the model's last-layer hidden states.

    Args:
        review: Review text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(review, return_tensors='pt', truncation=True)``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        model: Callable model that has a ``device`` attribute and whose output
            exposes ``hidden_states`` (so ``output_hidden_states`` must be
            enabled on it).

    Returns:
        numpy.ndarray: The token-averaged last hidden state, with the leading
        batch axis of size 1 removed.
    """
    # Send the inputs to wherever this model's weights live instead of reading the
    # notebook-global `device`, which is reassigned in several cells. The local is
    # named `encoded` so the built-in `input` is not shadowed.
    encoded = tokenizer(review, return_tensors='pt', truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**encoded)

    last_hidden = outputs.hidden_states[-1]  # hidden states of the last layer

    # Average over the token axis, then drop the batch axis of the single review.
    return last_hidden.mean(dim=1).squeeze(0).cpu().numpy()

In [None]:
# One mpnet vector per review, in the order of `reviews`.
all_bert_vectors = [
    get_bert_vector(review, bert_tokenizer, bert_model) for review in reviews
]

In [None]:
import numpy as np
# Append the normalized rating as one extra feature column after the mpnet dimensions.
# `all_bert_vectors` is a Python list of per-review vectors; np.hstack converts it to an array first.
bert_rating_combined_vectors = np.hstack((all_bert_vectors, reshaped_ratings))
bert_rating_combined_vectors.shape

## Llama 2-7b-chat-hf
- meta download: https://www.llama.com/llama-downloads/
- Huggingface: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
- [Github tutorial](https://github.com/meta-llama/llama-models?fbclid=IwZXh0bgNhZW0CMTAAAR5flYgamnz7bViaAtMQsPvPWGc7jCd69MsAmom7zGFl6Mb9ckvqDYBeACJirg_aem_vjalf1yaPdUasJh9lNKd_g)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Llama-2-7b-chat tokenizer and causal-LM weights from the Hugging Face Hub.
# NOTE(review): no dtype is specified, so the weights presumably load in full precision — confirm
# this fits the available GPU memory.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# Return every layer's hidden states so get_llama_vector can pool the last one.
llama_model.config.output_hidden_states = True

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Move the Llama weights to that device.
llama_model.to(device)

In [None]:
def get_llama_vector(review, tokenizer, model):
    """Embed one review as the unweighted mean of the model's last-layer hidden states.

    Args:
        review: Review text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(review, return_tensors='pt', truncation=True)``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        model: Callable model that has a ``device`` attribute and whose output
            exposes ``hidden_states`` (so ``output_hidden_states`` must be
            enabled on it).

    Returns:
        numpy.ndarray: The token-averaged last hidden state, with the leading
        batch axis of size 1 removed.
    """
    # Send the inputs to wherever this model's weights live instead of reading the
    # notebook-global `device`, which is reassigned in several cells. The local is
    # named `encoded` so the built-in `input` is not shadowed.
    encoded = tokenizer(review, return_tensors='pt', truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**encoded)

    last_hidden = outputs.hidden_states[-1]  # hidden states of the last layer

    # Average over the token axis, then drop the batch axis of the single review.
    return last_hidden.mean(dim=1).squeeze(0).cpu().numpy()

In [None]:
# One Llama vector per review, in the order of `reviews`.
llama_vectors = [
    get_llama_vector(review, llama_tokenizer, llama_model) for review in reviews
]


In [None]:
# Append the normalized rating as one extra feature column after the Llama dimensions.
llama_rating_combined_vectors = np.hstack((llama_vectors, reshaped_ratings))
llama_rating_combined_vectors.shape

## Falcon

In [None]:
# Load the Falcon-7B tokenizer and causal-LM weights from the Hugging Face Hub.
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): trust_remote_code=True permits running code that ships with the model
# repository — confirm it is still required and acceptable for this checkpoint.
falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
falcon_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# Return every layer's hidden states so get_falcon_vector can pool the last one.
falcon_model.config.output_hidden_states = True

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Move the Falcon weights to that device.
# NOTE(review): if the Llama model from the previous section is still on the GPU, both
# models occupy device memory at the same time — confirm there is enough room.
falcon_model.to(device)

Generate vectors by using Transformers

In [None]:
def get_falcon_vector(review, tokenizer, model):
    """Embed one review as the unweighted mean of the model's last-layer hidden states.

    Args:
        review: Review text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(review, return_tensors='pt', truncation=True)``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        model: Callable model that has a ``device`` attribute and whose output
            exposes ``hidden_states`` (so ``output_hidden_states`` must be
            enabled on it).

    Returns:
        numpy.ndarray: The token-averaged last hidden state, with the leading
        batch axis of size 1 removed.
    """
    # Send the inputs to wherever this model's weights live instead of reading the
    # notebook-global `device`, which is reassigned in several cells. The local is
    # named `encoded` so the built-in `input` is not shadowed.
    encoded = tokenizer(review, return_tensors='pt', truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**encoded)

    last_hidden = outputs.hidden_states[-1]  # hidden states of the last layer

    # Average over the token axis, then drop the batch axis of the single review.
    return last_hidden.mean(dim=1).squeeze(0).cpu().numpy()

In [None]:
# One Falcon vector per review, in the order of `reviews`.
all_falcon_vectors = [
    get_falcon_vector(review, falcon_tokenizer, falcon_model) for review in reviews
]


In [None]:
# Append the normalized rating as one extra feature column after the Falcon dimensions.
falcon_rating_combined_vectors = np.hstack((all_falcon_vectors, reshaped_ratings))
falcon_rating_combined_vectors.shape

# K-Means clustering and evaluation (PCA plots, silhouette score)

In [None]:
# function that calculates Silhouette score and shows PCA plot
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def get_pca_score(model_name, vector):
    """Run K-Means for K = 2..5 and show one 2-D PCA scatter plot per K.

    Each plot colours the points by cluster label, marks the projected cluster
    centres with black crosses, and carries the cosine silhouette score in its
    title.

    Args:
        model_name: Label used in the plot titles.
        vector: 2-D feature matrix with one row per review.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    # Neither the cosine distance matrix nor the PCA projection depends on K,
    # so both are computed once instead of once per K.
    cosine_dist = pairwise_distances(vector, metric='cosine')
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(vector)

    for k in range(2, 6):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(vector)
        labels = kmeans.labels_

        # The default (Euclidean) silhouette values were extremely small, so the
        # score is computed on the precomputed cosine distances instead.
        score = silhouette_score(cosine_dist, labels, metric='precomputed')

        # Project the cluster centres into the same 2-D PCA space as the points.
        centers_reduced = pca.transform(kmeans.cluster_centers_)

        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='rainbow')
        plt.scatter(centers_reduced[:, 0], centers_reduced[:, 1],
                    marker='X', s=100, c='black')
        plt.title(f"{model_name} + K-Means Clustering (K={k}, Sil_score={score:.3f})")
        plt.xlabel("PCA 1")
        plt.ylabel("PCA 2")
        plt.colorbar(scatter, label='Cluster')

        plt.show()


In [None]:
def score_barchart(model_name, vector):
    """Plot the cosine silhouette score of K-Means clusterings for K = 2..5 as a bar chart.

    Args:
        model_name: Label used in the chart title.
        vector: 2-D feature matrix with one row per review.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    k_list = [2, 3, 4, 5]  # x axis

    # The cosine distance matrix does not depend on K, so compute it once.
    cosine_dist = pairwise_distances(vector, metric='cosine')

    sil_score_list = []  # y axis
    for k in k_list:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(vector)
        labels = kmeans.labels_
        score = silhouette_score(cosine_dist, labels, metric='precomputed')
        sil_score_list.append(score)

    plt.bar(k_list, sil_score_list, width=0.6)
    plt.xlabel("K")
    plt.xticks(k_list, [str(k) for k in k_list])
    plt.ylabel("Silhouette Score")
    plt.title(f"{model_name} + K-Means Clustering")
    plt.show()


### w2v PCA

In [None]:
# Silhouette score per K (2..5) for the Word2Vec + rating vectors.
score_barchart("w2v", w2v_rating_combined_vectors)

In [None]:
# save the best k value
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(w2v_rating_combined_vectors)
df['w2v_cluster'] = clusters

df.to_csv('All Reviews_cluster.csv', index=False)

In [None]:
# PCA scatter plots (one per K) for the Word2Vec + rating vectors.
get_pca_score("w2v", w2v_rating_combined_vectors)

### Sentence Bert mini PCA

In [None]:
# Silhouette score per K (2..5) for the MiniLM + rating vectors.
score_barchart("Sentence Bert mini", bert_mini_rating_combined_vectors)

In [None]:
# PCA scatter plots (one per K) for the MiniLM + rating vectors.
# NOTE(review): the title label differs from the "Sentence Bert mini" label used in the bar chart cell.
get_pca_score("BERT-mini", bert_mini_rating_combined_vectors)

### BERT mpnet PCA

In [None]:
# Silhouette score per K (2..5) for the mpnet + rating vectors.
score_barchart("BERT-mpnet", bert_rating_combined_vectors)

In [None]:
# PCA scatter plots (one per K) for the mpnet + rating vectors.
# NOTE(review): the title label differs from the "BERT-mpnet" label used in the bar chart cell.
get_pca_score("BERT", bert_rating_combined_vectors)

### falcon PCA

In [None]:
# Silhouette score per K (2..5) for the Falcon + rating vectors.
score_barchart("falcon", falcon_rating_combined_vectors)

In [None]:
# save the best k value
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(falcon_rating_combined_vectors)
df['falcon_cluster'] = clusters

df.to_csv('All Reviews_cluster.csv', index=False)

In [None]:
# PCA scatter plots (one per K) for the Falcon + rating vectors.
get_pca_score("falcon", falcon_rating_combined_vectors)

### llama PCA

In [None]:
# Silhouette score per K (2..5) for the Llama + rating vectors.
score_barchart("llama", llama_rating_combined_vectors)

In [None]:
# save the best k value
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(llama_rating_combined_vectors)
df['llama_cluster'] = clusters

df.to_csv('All Reviews_cluster_llama.csv', index=False)

In [None]:
# PCA scatter plots (one per K) for the Llama + rating vectors.
get_pca_score("llama", llama_rating_combined_vectors)

## Analyze cluster ratings

In [None]:
# df.groupby("cluster")["rating"].mean()

In [None]:
# Reload the cluster assignments written by the save cells above.
df_cluster = pd.read_csv("All Reviews_cluster.csv")

# Separate file written by the Llama save cell.
df_llama_cluster = pd.read_csv("All Reviews_cluster_llama.csv")



In [None]:
# Preview the first rows of the reloaded cluster file.
df_cluster.head()

In [None]:
# Preview the first rows of the reloaded Llama cluster file.
df_llama_cluster.head()

In [None]:
# Mean rating within each Word2Vec cluster.
df_cluster.groupby('w2v_cluster')['Rating'].mean()

In [None]:
# Number of reviews in each Word2Vec cluster, ordered by cluster label.
df_cluster['w2v_cluster'].value_counts().sort_index()

In [None]:
# Mean rating within each Falcon cluster.
df_cluster.groupby('falcon_cluster')['Rating'].mean()

In [None]:
# Number of reviews in each Falcon cluster, ordered by cluster label.
df_cluster['falcon_cluster'].value_counts().sort_index()

In [None]:
# Mean rating within each Llama cluster.
df_llama_cluster.groupby("llama_cluster")["Rating"].mean()

In [None]:
# Number of reviews in each Llama cluster, ordered by cluster label.
df_llama_cluster['llama_cluster'].value_counts().sort_index()