## Load Review Data

In [16]:
import pandas as pd
import json

In [17]:
# Load the review dump: the file is read as JSON lines (one JSON object per line).
reviews_filename = '../data/reviews.json'
# JSON text is UTF-8 by specification, so do not rely on the platform's
# default encoding when opening the file.
with open(reviews_filename, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline at end of file); json.loads
    # would otherwise raise on them.
    data = [json.loads(line) for line in f if line.strip()]
# Rename via assignment instead of inplace=True so the step reads as a
# plain transformation of the freshly built frame.
df = pd.DataFrame(data).rename(columns={'txt': 'review'})
# Keep only long reviews: at least 3000 characters of text.
df = df[df['review'].str.len() >= 3000]
df.head()

Unnamed: 0,item_id,review
48,41335427,I think we can all agree that the basics are a...
158,41335427,I think this may become my second favorite in ...
320,41335427,When I began rereading this series back in Feb...
406,41335427,Vua doc xong lai cuon Harry Potter va Hoang Tu...
443,41335427,Old review I wrote on my blog (http://douglasc...


In [19]:
# Draw a fixed-seed random sample so the counts printed below (and every
# downstream cell) are reproducible after a kernel restart.
# df.sample raises when n exceeds the number of rows (sampling without
# replacement), so cap the sample size at the size of the filtered frame.
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
print('Number of unique items:' , df_sample['item_id'].nunique())
print('Number of rows:', df_sample.shape[0])

Number of unique items: 4280
Number of rows: 10000


## Chunk and Embed Reviews

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration, gathered in one place so it is easy to tune:
# chunk size 500 and overlap 20, both measured with the built-in `len`;
# separators are treated as literal strings rather than regexes.
splitter_kwargs = {
    'chunk_size': 500,
    'chunk_overlap': 20,
    'length_function': len,
    'is_separator_regex': False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)

In [27]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Supabase/gte-small")

# Build one record per review chunk: the source item, the chunk text and
# the chunk's embedding as a plain Python list.
vectors = []
# NOTE(review): .head() limits this to the first 5 sampled reviews --
# presumably a quick trial run; confirm before embedding the full sample.
reviews_head = df_sample.head()
for item_id, review in zip(reviews_head['item_id'], reviews_head['review']):
    chunk_texts = [
        doc.page_content for doc in text_splitter.create_documents([review])
    ]
    if not chunk_texts:
        # Nothing to embed for this review.
        continue
    # Encode all chunks of a review in a single call instead of one call
    # per chunk; encode() accepts a list and returns one vector per input.
    chunk_embeddings = model.encode(chunk_texts)
    for text, embedding in zip(chunk_texts, chunk_embeddings):
        vectors.append({
            'item_id': item_id,
            'chunk': text,
            'embedding': embedding.tolist()
        })

df_chunks = pd.DataFrame(vectors)

No sentence-transformers model found with name Supabase/gte-small. Creating a new one with MEAN pooling.


In [30]:
# Preview the first five chunk records (item id, chunk text, embedding).
df_chunks.head(5)

Unnamed: 0,item_id,chunk,embedding
0,16441531,This book reminds me of my own relationship it...,"[-0.3568374216556549, 0.06629440188407898, 0.2..."
1,16441531,I do want to say to all the people who think T...,"[-0.10588225722312927, 0.3744885325431824, 0.2..."
2,16441531,"If a woman ENJOYS a man like that, it's just l...","[-0.10144024342298508, 0.40626809000968933, 0...."
3,16441531,The entire first half of my book is FILLED wit...,"[-0.5674555897712708, 0.29009532928466797, 0.2..."
4,16441531,And Abby was a total badass in the first half ...,"[-0.6340095400810242, 0.1991787552833557, 0.33..."
