In this section, we do feature engineering: we derive a sentiment score (1 to 5) from the text of each customer review.

In [None]:
import pandas as pd
import os
from transformers import pipeline
from tqdm import tqdm

# --- Paths -------------------------------------------------------------
# The processed data set is read from ../data/processed (relative path).
PROCESSED_PATH = os.path.join("..", "data", "processed")
INPUT_FILE = os.path.join(PROCESSED_PATH, "merged_transactions.parquet")

# --- Load --------------------------------------------------------------
df = pd.read_parquet(INPUT_FILE)
row_count = len(df)
print(f"Data Loaded: {row_count} rows")

# --- Missing reviews ---------------------------------------------------
# Business rule: a user who did not write a review gets the neutral
# placeholder text instead of NaN.
review_text = df['review_comment_message']
df['review_comment_message'] = review_text.fillna("neutral")

# --- Model -------------------------------------------------------------
# Multilingual BERT model fine-tuned for sentiment.
print("Loading AI Model...")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Batch processing logic: the texts are scored in chunks so that a progress
# bar can be shown while the model runs.
def get_sentiment_scores(texts, batch_size=32):
    """
    Score each text with the module-level ``sentiment_pipeline``.

    Input: texts      - list (or any iterable) of text strings
           batch_size - number of texts handed to the model per call;
                        only affects how often the progress bar advances
    Output: List of integer scores (1 to 5), one per text, in input order.
            An empty input returns an empty list without calling the model.
    Raises: ValueError if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialize once so the input can be sliced and measured.
    texts = list(texts)
    if not texts:
        return []

    scores = []
    chunk_starts = range(0, len(texts), batch_size)
    for start in tqdm(chunk_starts, desc="Scoring reviews", unit="batch"):
        chunk = texts[start:start + batch_size]
        results = sentiment_pipeline(chunk, truncation=True, max_length=512)
        # The model returns labels like '5 stars', '4 stars'. We want just the integer.
        scores.extend(int(res['label'].split()[0]) for res in results)
    return scores

# WARNING: Scoring every row can take a long time (roughly 30-60 mins on CPU).
# For testing, only the first SAMPLE_SIZE rows are scored to make sure the
# whole flow works. Once it works, set SAMPLE_SIZE = None to run on all data.
SAMPLE_SIZE = 1000

if SAMPLE_SIZE is None:
    print(f"Starting Sentiment Analysis on full data ({len(df)} rows)...")
    sample_df = df.copy()
else:
    print(f"Starting Sentiment Analysis on sample (first {SAMPLE_SIZE} rows)...")
    # Copy so that adding the score column does not touch the original frame.
    sample_df = df.head(SAMPLE_SIZE).copy()

# We convert the text column to a list
texts = sample_df['review_comment_message'].tolist()

# Run the model
scores = get_sentiment_scores(texts)

# Save the scores back to the dataframe
sample_df['sentiment_score'] = scores

print("Preview of Results:")
print(sample_df[['review_comment_message', 'review_score', 'sentiment_score']].head(10))

# Save (partial): uncomment to write the scored rows into the processed-data
# folder, next to the input file.
# sample_df.to_parquet(os.path.join(PROCESSED_PATH, "transactions_with_sentiment.parquet"))

Data Loaded: 110840 rows
Loading AI Model...


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Starting Sentiment Analysis on sample (first 1000 rows)...
