### Jupyter Notebook used to preprocess the data and generate the embeddings file
This notebook preprocesses the Instagram review data and generates the embeddings file using the all-mpnet-base-v2 model. The embeddings file contains the embeddings of the review text chunks. It is loaded by the RAG pipeline when the user queries the system with a specific prompt.

In [1]:
import pandas as pd

In [2]:
# Load Instagram Review Data
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv("./instagram-play-store-reviews/instagram.csv")
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,review_description,rating,review_date
0,"The app is good for connecting with friends, f...",3,2023-07-11 23:57:07
1,"Used to be my favorite social media app, but ""...",2,2023-07-22 21:37:09
2,Instagram is the best of all the social media....,5,2023-07-25 03:24:58
3,"I love this app.. but as of late, I have been ...",2,2023-07-09 04:49:57
4,Used to be a great app but there are so many m...,3,2023-07-17 16:47:04


In [3]:
# Preprocess Reviews and Ratings
def preprocess_reviews(df: pd.DataFrame) -> list:
    """Turn the raw review frame into a list of per-review records.

    Rows are skipped when the review is missing, is not a string, is empty or
    whitespace-only, or when the rating is missing or falsy (e.g. 0).

    Args:
        df: Frame with "review_description", "rating" and "review_date" columns.

    Returns:
        A list of dicts, one per kept row, with the keys "word_count",
        "review_token_count", "review_char_count", "rating", "review" and
        "review_date".
    """
    rating_and_reviews = []
    # Iterate the three columns directly instead of iterrows(): it avoids
    # building a Series per row and keeps each column's own dtype.
    for review, rating, review_date in zip(
        df["review_description"], df["rating"], df["review_date"]
    ):
        # Handle Missing Values
        # pandas represents missing cells as NaN, which is truthy, so a plain
        # `not value` test lets them through; check for them explicitly.
        if not isinstance(review, str):
            continue
        if pd.isna(rating) or not rating:
            continue

        # Handle Empty Reviews
        if review == "" or review.isspace():
            continue

        char_count = len(review)
        rating_and_reviews.append({
            "word_count": len(review.split(" ")),
            # Rough token estimate: character count divided by 4.
            "review_token_count": char_count / 4,
            "review_char_count": char_count,
            "rating": rating,
            "review": review,
            "review_date": review_date
        })

    return rating_and_reviews

# Run the preprocessing over the loaded frame and print the first record as a sanity check.
rating_and_reviews = preprocess_reviews(df)
print(rating_and_reviews[0])

{'word_count': 90, 'review_token_count': 124.0, 'review_char_count': 496, 'rating': 3, 'review': "The app is good for connecting with friends, family and even potential business partners. However as of recently I've experienced some problems with the messages portion of the app (ex: themes aren't showing up on my end but are present on other person's end). Idk if it has to do with a bug but it happened all of sudden out of nowhere on both of my pages (one private the other public). But besides the occasional bugs and sometimes the app/website being down randomly, I say it's a decent app.", 'review_date': '2023-07-11 23:57:07'}


In [None]:
# Rebuild `df` from the preprocessed records for inspection.
# NOTE: this rebinds `df`, so the raw CSV frame is no longer reachable under that name.
df = pd.DataFrame(rating_and_reviews)
df.head()

Further text processing (splitting reviews into sentences)

In [6]:
from tqdm import tqdm
from spacy.lang.en import English

# Blank English pipeline with only the "sentencizer" pipe added, used to split reviews into sentences.
nlp = English()
nlp.add_pipe("sentencizer")

for review_record in tqdm(rating_and_reviews):
    parsed_review = nlp(review_record["review"])
    # Store plain strings rather than spaCy span objects.
    sentence_texts = [str(sentence_span) for sentence_span in parsed_review.sents]
    review_record["sentences"] = sentence_texts
    review_record["sentence_count_spacy"] = len(sentence_texts)

# Confirm the stored sentences are plain strings.
print(type(rating_and_reviews[0]["sentences"][0]))

100%|██████████| 210542/210542 [00:41<00:00, 5129.28it/s]

<class 'str'>





In [None]:
# Rebuild `df` from the records (which the sentence-splitting cell extends with
# `sentences` / `sentence_count_spacy`) and show summary statistics rounded to 2 decimals.
df = pd.DataFrame(rating_and_reviews)
df.describe().round(2)

In [None]:
# Chunking sentences
# Long reviews are broken into groups of at most `num_sentence_chunk_size` sentences (10 here),
# so that each group can be handled as its own chunk of text.
num_sentence_chunk_size = 10


def chunk_sentences(sentences: list, chunk_size: int) -> list:
    """Split `sentences` into consecutive sub-lists of at most `chunk_size` items.

    The final sub-list holds whatever is left over and may be shorter.
    An empty input yields an empty list.
    """
    return [
        sentences[start:start + chunk_size]
        for start in range(0, len(sentences), chunk_size)
    ]


# Attach the sentence groups and their count to every review record.
for review_record in tqdm(rating_and_reviews):
    grouped_sentences = chunk_sentences(review_record["sentences"], num_sentence_chunk_size)
    review_record["sentence_chunks"] = grouped_sentences
    review_record["num_chunks"] = len(grouped_sentences)

In [None]:
# Rebuild `df` from the records (which the chunking cell extends with
# `sentence_chunks` / `num_chunks`) and show summary statistics rounded to 2 decimals.
df = pd.DataFrame(rating_and_reviews)
df.describe().round(2)

In [None]:
# Splitting each chunk into its own item
import re

rating_and_review_chunks = []
for item in tqdm(rating_and_reviews):
    for chunk in item["sentence_chunks"]:
        # Join with a single space: the stored sentence strings carry no trailing
        # whitespace, so joining with "" would fuse neighbouring sentences
        # (e.g. "great!love it"). Doubled spaces are collapsed afterwards.
        joined_sentence_chunk = " ".join(chunk).replace("  ", " ").strip()
        # Also separate ".X" sequences that were already fused in the original review text.
        joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk)

        rating_and_review_chunks.append({
            "rating": item["rating"],
            "review": item["review"],
            "review_date": item["review_date"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            # Rough token estimate: character count divided by 4.
            "chunk_token_count": len(joined_sentence_chunk) / 4
        })

In [None]:
# Build a frame with one row per sentence chunk; the filtering cells below read this `df`.
df = pd.DataFrame(rating_and_review_chunks)
df.head()

In [None]:
# To create more meaningful chunks, we can filter out chunks that are too short.
# Very short chunks tend to be spammy reviews that do not provide any meaningful content.
# `chunk_token_count` is estimated as characters / 4, so the minimum of 20 tokens is about 80 characters.
min_token_length = 20
short_chunks = df[df["chunk_token_count"] < min_token_length]
# Cap the sample size so the preview does not raise when fewer than 10 short chunks exist.
for _, row in short_chunks.sample(min(10, len(short_chunks))).iterrows():
    print(f"Chunk token count: {row['chunk_token_count']} | Text: {row['sentence_chunk']}")

In [47]:
# Keep only the chunks whose estimated token count is at least `min_token_length`,
# and convert the rows back into a list of dicts (one dict per row).
review_and_rating_chunks_over_min_token_length = df[df["chunk_token_count"] >= min_token_length].to_dict(orient="records")

In [None]:
# Load all-mpnet-base-v2 Sentence Transformer Model
from sentence_transformers import SentenceTransformer

# `embedding_model` is the object the embedding cell moves to the device and calls `.encode` on.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

In [None]:
%%time

import torch

# Use the GPU when one is available, otherwise fall back to the CPU so the cell
# still runs (more slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model.to(device)
text_chunks = [item["sentence_chunk"] for item in review_and_rating_chunks_over_min_token_length]
batch_size = 32

# Embedding each chunk of text using the Sentence Transformer Model and storing the embeddings
for i in tqdm(range(0, len(text_chunks), batch_size)):
    batch = text_chunks[i:i + batch_size]
    embeddings = embedding_model.encode(batch)

    # `text_chunks` was built in record order, so the same slice of records lines up
    # one-to-one with the embeddings returned for this batch.
    batch_records = review_and_rating_chunks_over_min_token_length[i:i + batch_size]
    for record, embedding in zip(batch_records, embeddings):
        record["chunk_embedding"] = embedding

In [None]:
# Save embeddings to a file
from pathlib import Path

embeddings_csv_path = Path("embeddings/text_chunks_embeddings-20token.csv")
# Create the output directory first; to_csv does not create missing parent directories.
embeddings_csv_path.parent.mkdir(parents=True, exist_ok=True)

text_chunks_embeddings_df = pd.DataFrame(review_and_rating_chunks_over_min_token_length)
text_chunks_embeddings_df.to_csv(embeddings_csv_path, index=False)