# Encode the Reviews of the Dataset

In [None]:
import pandas as pd

# Load the gzip-compressed review dump; lines=True reads one JSON record per line.
reviews_path = 'Software_5.json.gz'
df = pd.read_json(reviews_path, lines=True, compression='gzip')

# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding checkpoint used for the reviews.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [None]:
# Encode every review text into an embedding tensor.
# Missing reviews (NaN) are replaced with an empty string and all values are
# coerced to str, so only real strings reach the encoder while the list keeps
# exactly one entry per row of `df` (needed to map embeddings back to reviews).
review_texts = df['reviewText'].fillna('').astype(str).tolist()
review_embeddings = model.encode(review_texts, convert_to_tensor=True)

# Save the Encoded Reviews for Later Use

In [None]:
# Persist the SentenceTransformer model, the computed embeddings, and the
# source dataframe to disk.
# Run this once after `model`, `review_embeddings`, and `df` exist in the kernel.

import numpy as np

# 1) Save a local copy of the model (prevents re-downloading/re-instantiating)
model.save('models/all-MiniLM-L6-v2')

# 2) Convert the embeddings to a NumPy array and save them.
# Tensor-like objects expose .cpu() (review_embeddings is a torch tensor when
# encode() is called with convert_to_tensor=True); anything else is handed to
# NumPy directly. The explicit check replaces a blanket `except Exception`,
# which would also have hidden genuine conversion errors.
if hasattr(review_embeddings, 'cpu'):
    # detach() drops any autograd tracking so .numpy() cannot fail on it
    emb_numpy = review_embeddings.detach().cpu().numpy()
else:
    emb_numpy = np.asarray(review_embeddings)

# Row i of the embeddings must describe row i of `df`; otherwise the two saved
# files cannot be mapped back to each other.
assert len(emb_numpy) == len(df), (
    f'embeddings ({len(emb_numpy)}) and dataframe ({len(df)}) are out of sync'
)

np.save('review_embeddings.npy', emb_numpy)

# 3) Save the dataframe (so you can map embeddings back to reviews).
# This saves the entire dataframe; change to a subset if you prefer.
# NOTE: pickle files can execute code when loaded; only load this file from a
# trusted location.
df.to_pickle('reviews_df.pkl')

print('Saved model -> models/all-MiniLM-L6-v2')
print('Saved embeddings -> review_embeddings.npy')
print('Saved dataframe -> reviews_df.pkl')

Saved model -> models/all-MiniLM-L6-v2
Saved embeddings -> review_embeddings.npy
Saved dataframe -> reviews_df.pkl
