# Title

## Introduction

In [1]:
'''
All the imports for the project
'''

# Basic imports 
import json     # parse json objects
import re       # text clean up
import os       # system checks

# Data processing
import pandas as pd
import numpy as np

# Embeddings
from sentence_transformers import SentenceTransformer       # text -> vector embeddings

# Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import normalized_mutual_info_score        # cluster accuracy evaluation
from sklearn.feature_extraction.text import TfidfVectorizer     # find keywords

# Visuals
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE       # reduce dimensionality visually

In [2]:
# Read data
path = "Electronics.jsonl"
reviews = []
max_rows = 100000       # take first 100k reviews

# Open jsonl file and add each line (review) to reviews list
with open(path, "r") as f:
    for i, line in enumerate(f):
        if i >= max_rows:
            break
        reviews.append(json.loads(line))

# Convert to DF
df = pd.DataFrame(reviews)

# Display info
df.shape, df.info(), df[["rating", "title", "text"]].head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   rating             100000 non-null  float64
 1   title              100000 non-null  object 
 2   text               100000 non-null  object 
 3   images             100000 non-null  object 
 4   asin               100000 non-null  object 
 5   parent_asin        100000 non-null  object 
 6   user_id            100000 non-null  object 
 7   timestamp          100000 non-null  int64  
 8   helpful_vote       100000 non-null  int64  
 9   verified_purchase  100000 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 7.0+ MB


((100000, 10),
 None,
    rating                                        title  \
 0     3.0            Smells like gasoline! Going back!   
 1     1.0      Didn’t work at all lenses loose/broken.   
 2     5.0                                   Excellent!   
 3     5.0                       Great laptop backpack!   
 4     5.0  Best Headphones in the Fifties price range!   
 5     5.0                        Great Fan! I’m a FAN!   
 6     5.0                    solid sound for the price   
 7     5.0            Love the headphones - great range   
 8     5.0                                   Five Stars   
 9     5.0                               BUY THIS THANG   
 
                                                 text  
 0  First & most offensive: they reek of gasoline ...  
 1  These didn’t work. Idk if they were damaged in...  
 2  I love these. They even come with a carry case...  
 3  I was searching for a sturdy backpack for scho...  
 4  I've bought these headphones three times be

In [3]:
# Process data

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()                         # lower case
    text = re.sub(r"[^a-z0-9\s]", " ", text)    # keep letters, numbers, and spaces
    text = re.sub(r"\s+", " ", text).strip()    # remove extra spaces
    return text

# Create new column with cleaned text
df["cleaned_text"] = df["text"].apply(clean_text)

# Remove faulty text after cleaning
df = df[df["cleaned_text"].str.len() > 0].reset_index(drop=True)

df[["text", "cleaned_text"]].head(10)

Unnamed: 0,text,cleaned_text
0,First & most offensive: they reek of gasoline ...,first most offensive they reek of gasoline so ...
1,These didn’t work. Idk if they were damaged in...,these didn t work idk if they were damaged in ...
2,I love these. They even come with a carry case...,i love these they even come with a carry case ...
3,I was searching for a sturdy backpack for scho...,i was searching for a sturdy backpack for scho...
4,I've bought these headphones three times becau...,i ve bought these headphones three times becau...
5,"Light weight, quiet and totally awesome!!! It ...",light weight quiet and totally awesome it does...
6,Update 2-they sent a new warranty replacement....,update 2 they sent a new warranty replacement ...
7,These are fantastic headphones and I love that...,these are fantastic headphones and i love that...
8,pretty good for the price.,pretty good for the price
9,yes.. so good. just buy it. my favorite featu...,yes so good just buy it my favorite feature is...


In [4]:
# Create vector embeddings from cleaned text

# Use pretrained light model
model = SentenceTransformer("all-MiniLM-L6-v2")

texts = df["cleaned_text"].tolist()
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/1562 [00:00<?, ?it/s]

In [None]:
# Cluster using k means

# Expected 3 groups: positive, neutral, and negative
k = 3

# Run k means and store labels (MBK is a lot more efficient than normal kmeans)
mbk = MiniBatchKMeans(n_clusters=k, batch_size=1000,random_state=42)

batch_size = 1000
n_samples = embeddings.shape[0]

for i in tqdm(range(0, n_samples, batch_size)):
    batch = embeddings[i : i + batch_size]
    mbk.partial_fit(batch)

# Puts each review into a cluster (arbitrary cluster names: 0, 1, 2)
labels = mbk.predict(embeddings)
df["cluster"] = labels
df[["rating", "cleaned_text", "cluster"]].head(10)

  0%|          | 0/100 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 100/100 [00:00<00:00, 358.39it/s]


Unnamed: 0,rating,cleaned_text,cluster
0,3.0,first most offensive they reek of gasoline so ...,2
1,1.0,these didn t work idk if they were damaged in ...,0
2,5.0,i love these they even come with a carry case ...,1
3,5.0,i was searching for a sturdy backpack for scho...,1
4,5.0,i ve bought these headphones three times becau...,0
5,5.0,light weight quiet and totally awesome it does...,1
6,5.0,update 2 they sent a new warranty replacement ...,0
7,5.0,these are fantastic headphones and i love that...,0
8,5.0,pretty good for the price,0
9,5.0,yes so good just buy it my favorite feature is...,2


In [None]:
# Evaluate clusters with ratings

