In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
import numpy as np
import torch

# pip install gensim nltk
# Tutorial from NLTK: https://www.nltk.org/data.html
# train the word2vec model that learns the word meaning in the bubble tea reviews

In [4]:
# Fetch only the tokenizer data packages this notebook requests.
# A bare nltk.download() is left disabled: per the author's note it never finishes here.
nltk.download('punkt') 
nltk.download('punkt_tab')
# Background reading on punkt: https://www.askpython.com/python-modules/nltk-punkt
# NOTE(review): presumably these are the resources word_tokenize needs - confirm against the NLTK docs.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Load the review table from a path relative to the notebook's working directory.
df = pd.read_csv("dataset/All Reviews.csv")
# Keep the "Reviews" column as its own Series; every embedding method below reads from it.
reviews = df["Reviews"]

### Word2Vec
- tutorial: https://radimrehurek.com/gensim/models/word2vec.html

In [30]:
def preprocess(text):
    """Lower-case and tokenize one review, keeping only alphabetic tokens.

    Args:
        text: Raw review text. Any non-string value (for example ``None`` or
            the float NaN that stands in for an empty CSV cell) is treated
            as an empty review instead of raising.

    Returns:
        list[str]: Lower-cased tokens for which ``str.isalpha()`` is true,
        so numbers, punctuation and mixed tokens are dropped. Empty list for
        non-string input.
    """
    # Missing values have no .lower(); return no tokens so the caller can
    # fall back to its "no recognised words" handling.
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha()]

# One token list per review, in the same order as `reviews`.
tokenized_reviews = [preprocess(review) for review in reviews]

# Shape of the result (tokens are lower-cased by preprocess):
# [['babo', 'tea'], ['i', 'love', 'this'], ...]

In [39]:
# Train Word2Vec on the tokenized reviews: 300-dimensional vectors, a context
# window of 5, and words seen fewer than 2 times are left out of the vocabulary.
# NOTE(review): no seed is passed and workers=4, so the learned vectors (and the
# clusters built on them) will presumably differ between runs - confirm against
# the gensim documentation if reproducibility matters.
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=300, window=5, min_count=2, workers=4)

In [40]:
# This func transfers review tokens into vectors
def get_review_vector(tokens, model):
    """Turn one tokenized review into a single vector by averaging word vectors.

    Args:
        tokens: Iterable of word strings for one review.
        model: Trained Word2Vec-style model exposing ``wv.key_to_index``,
            ``wv[word]`` and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        model vocabulary, or an all-zero vector of length
        ``model.vector_size`` when none of the tokens are in the vocabulary.
    """
    # Drop every token the model never learned, e.g. {'the': 0, 'and': 1, ...}
    known_words = [tok for tok in tokens if tok in model.wv.key_to_index]

    # Nothing recognisable in this review: represent it with zeros.
    if not known_words:
        return np.zeros(model.vector_size)

    # Average the per-word vectors column by column.
    return np.mean([model.wv[tok] for tok in known_words], axis=0)

In [41]:
# Embed every tokenized review with the trained Word2Vec model, keeping review order.
review_vectors = [get_review_vector(tokens, w2v_model) for tokens in tokenized_reviews]

In [42]:
review_vectors = np.array(review_vectors) # stack the per-review vectors into one 2-D array
print(review_vectors.shape)
# 607 reviews, each represented as a 300-D vector (matches vector_size=300 used for Word2Vec)

(607, 300)


Normalize rating & reshape dimensions: divide the 1-5 rating by 5 so it falls in the range 0.2-1.0

In [43]:
# Star rating of each review, shown for a quick sanity check.
ratings = df["Rating"]
print(ratings)

# Scale the ratings by the maximum score of 5.
normalized_ratings = np.asarray(ratings) / 5

# Turn the flat array into a single column, i.e. (607, 1), so it can be
# placed next to the embedding matrices.
reshaped_ratings = normalized_ratings[:, np.newaxis]
reshaped_ratings.shape

0      5
1      5
2      5
3      5
4      5
      ..
602    5
603    5
604    4
605    4
606    5
Name: Rating, Length: 607, dtype: int64


(607, 1)

Combine normalized ratings with review vectors

In [44]:
# Append the normalized rating as one extra column after the 300 Word2Vec
# dimensions, giving one 301-wide feature row per review.
# NOTE(review): the rating column and the embedding columns are not rescaled to a
# common range here - presumably worth checking how much the rating alone drives
# the distance-based clustering below.
w2v_rating_combined_vectors = np.hstack((review_vectors, reshaped_ratings))
w2v_rating_combined_vectors.shape

(607, 301)

### TF-IDF

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the raw review strings, capped at 300 terms with English
# stop words removed, so the width matches the Word2Vec representation.
vectorizer = TfidfVectorizer(max_features=300, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(reviews)
# Convert the fit_transform result to a dense array so it can be stacked with numpy.
tfidf_vectors = tfidf_matrix.toarray()

# Append the normalized rating column, mirroring the Word2Vec feature matrix.
tfidf_rating_combined_vectors = np.hstack((tfidf_vectors, reshaped_ratings))
tfidf_rating_combined_vectors.shape


(607, 301)

### Llama 2-7b-chat-hf
- meta download: https://www.llama.com/llama-downloads/
- Huggingface: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
- [Github tutorial](https://github.com/meta-llama/llama-models?fbclid=IwZXh0bgNhZW0CMTAAAR5flYgamnz7bViaAtMQsPvPWGc7jCd69MsAmom7zGFl6Mb9ckvqDYBeACJirg_aem_vjalf1yaPdUasJh9lNKd_g)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the Llama 2 chat tokenizer and causal-LM weights by Hugging Face model id.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# Ask the model to return per-layer hidden states; the embedding helper below reads outputs.hidden_states.
llama_model.config.output_hidden_states = True
# llama_model.eval()
# NOTE(review): the eval() call is left disabled (the original comment referred to
# an undefined name `model`) - confirm whether the loaded model is already in
# inference mode before relying on these embeddings.

Generate vectors by using Transformers

In [8]:
def get_llama_review_vector(review, tokenizer, model):
    """Embed one review as the mean of a causal LM's last-layer hidden states.

    Despite the name, nothing here is Llama-specific: any tokenizer/model pair
    with the same call convention (e.g. the Falcon pair) can be passed in.

    Args:
        review: Raw review text.
        tokenizer: Callable that turns text into a mapping of PyTorch tensors
            when called with ``return_tensors='pt', truncation=True``.
        model: Callable model whose output exposes ``hidden_states``.

    Returns:
        numpy.ndarray: float32 vector of length ``hidden_size`` obtained by
        averaging the last layer's hidden states over all tokens.
    """
    # `encoded` instead of `input` so the Python builtin is not shadowed.
    encoded = tokenizer(review, return_tensors='pt', truncation=True)

    # If the model reports a device, run the inputs on that same device.
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Request hidden states explicitly so the function does not depend on a
    # config flag having been set in some earlier cell.
    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True)

    last_layer = outputs.hidden_states[-1]  # (batch=1, tokens, hidden)

    # Average over the token axis, drop only the batch axis, and convert to a
    # CPU float32 array: .numpy() alone fails for CUDA or bfloat16 tensors.
    return last_layer.mean(dim=1).squeeze(0).cpu().float().numpy()


In [None]:
all_llama_vectors = []

# Embed each raw review string with the Llama tokenizer/model pair.
# This must call get_llama_review_vector: get_review_vector is the Word2Vec
# helper, takes only (tokens, model), and raises TypeError when given three
# arguments.
for review in reviews:
    vector = get_llama_review_vector(review, llama_tokenizer, llama_model)
    all_llama_vectors.append(vector)


### Falcon-7b
- hf: https://huggingface.co/tiiuae/falcon-7b

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the Falcon-7B tokenizer and causal-LM weights by Hugging Face model id.
# NOTE(review): trust_remote_code=True presumably lets code shipped in the model
# repository run locally - confirm it is still required and that the source is trusted.
falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
falcon_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# Ask the model to return per-layer hidden states, as done for the Llama model.
falcon_model.config.output_hidden_states = True

In [None]:
all_falcon_vectors = []
# NOTE(review): an earlier note here said "only use Indianapolis data - 81 entries",
# but this loop iterates over every entry in `reviews` - confirm the intended subset.

# get_llama_review_vector only uses the tokenizer/model it is handed, so it is
# reused for Falcon. The Word2Vec helper get_review_vector takes (tokens, model)
# and raises TypeError when called with three arguments.
for review in reviews:
    vector = get_llama_review_vector(review, falcon_tokenizer, falcon_model)
    all_falcon_vectors.append(vector)


### K-means Clustering

PCA & Silhouette Score

- [PCA tutorial from GeeksforGeeks](https://www.geeksforgeeks.org/kmeans-clustering-and-pca-on-wine-dataset/)
- [Silhouette score from scikit-learn.org](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html)

In [None]:
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis of KMeans on the Word2Vec + rating features.
# The code is adapted from the scikit-learn silhouette example, but the data is
# NOT generated with make_blobs: X is the combined feature matrix built above
# (one row per review, 300 embedding columns followed by the normalized rating).
X =  np.array(w2v_rating_combined_vectors)

# Candidate numbers of clusters; one figure is produced per value.
range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # Create a figure with 1 row and 2 columns: silhouette plot + scatter plot
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # Silhouette coefficients can range from -1 to 1; the x-axis is limited to
    # [-0.1, 1], so any value below -0.1 would fall outside the visible range.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a fixed random_state
    # of 10 so the KMeans initialisation is repeatable for a given X.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # Running vertical offset: each cluster's band is drawn above the previous one.
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # One colour per cluster index, spread across the nipy_spectral colormap.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # leave a 10-row gap between cluster bands

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot: scatter of the clustered reviews.
    # NOTE(review): this plots only columns 0 and 1 of X, i.e. the first two
    # Word2Vec dimensions, not a PCA projection of the 301 features - confirm
    # whether a PCA step was intended before plotting.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers (again using only the first two columns)
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # Write each cluster's index inside its white circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(
        "Silhouette analysis for KMeans + W2V clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

# Called once after the loop, so all figures created above are shown together.
plt.show()

In [65]:
# Load the reviews together with their saved cluster assignments.
# NOTE(review): this file is read from the working directory (not "dataset/") and
# the cell that writes it is not shown in this notebook - confirm where it comes from.
df_cluster = pd.read_csv("All Reviews_cluster.csv")

In [67]:
# Mean star rating within each Word2Vec-based cluster.
df_cluster.groupby('w2v_cluster')['Rating'].mean()

w2v_cluster
0    4.731156
1    4.538462
2    1.656051
Name: Rating, dtype: float64

In [69]:
# Number of reviews in each Word2Vec-based cluster, ordered by cluster id.
df_cluster['w2v_cluster'].value_counts().sort_index()

w2v_cluster
0    398
1     52
2    157
Name: count, dtype: int64

In [68]:
# Mean star rating within each Falcon-based cluster.
df_cluster.groupby('falcon_cluster')['Rating'].mean()

falcon_cluster
0    3.843678
1    4.110465
Name: Rating, dtype: float64

In [70]:
# Number of reviews in each Falcon-based cluster, ordered by cluster id.
df_cluster['falcon_cluster'].value_counts().sort_index()

falcon_cluster
0    435
1    172
Name: count, dtype: int64