In this section, we are going to do feature engineering.

In [3]:
import pandas as pd
import os
from transformers import pipeline
from tqdm import tqdm

#We start with setting up paths
PROCESSED_PATH = os.path.join("..", "data", "processed")
INPUT_FILE = os.path.join(PROCESSED_PATH, "merged_transactions.parquet")

#We now want to load the data.
df= pd.read_parquet(INPUT_FILE)
print(f"Data Loaded: {len(df)} rows")

# We now have to handle the missing reviews
#This is the business logic, if a user did not write a review, we fill NaNs with a neutral placeholder text
df['review_comment_message'] = df['review_comment_message'].fillna("neutral")

#We now initialize the AI model, we use Multilingual BERT Model fine-tuned for sentiment
print("Loading AI Model...")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model= "nlptown/bert-base-multilingual-uncased-sentiment"
)

#The Batch processing logic: we are processing this in chunks so that we can see a progress bar
def get_sentiment_scores(texts):
    """
    Input: List of text strings
    Output: List of scores (1 to 5)
    """
    # The model returns labels like '5 stars', '4 stars'. We want just the integer.
    results = sentiment_pipeline(texts, truncation=True, max_length=512)
    return [int(res['label'].split()[0]) for res in results]

# WARNING: This step can take time (30-60 mins on CPU). 
# For testing now, let's run it on the first 1,000 rows to make sure it works.
# Once it works, you can remove the [:1000] slice to run on all data.

print("Starting Sentiment Analysis on sample (first 1000 rows)...")
sample_df = df.head(1000).copy() # <--- CHANGE THIS later to run on full data

# We convert the text column to a list
texts = sample_df['review_comment_message'].tolist()

# Run the model
scores = get_sentiment_scores(texts)

# Save the scores back to the dataframe
sample_df['sentiment_score'] = scores

print("Preview of Results:")
print(sample_df[['review_comment_message', 'review_score', 'sentiment_score']].head(10))

# 6. Save (Partial)
# sample_df.to_parquet(os.path.join(PROCESSED_PATH, "transactions_with_sentiment.parquet"))

Data Loaded: 110840 rows
Loading AI Model...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Starting Sentiment Analysis on sample (first 1000 rows)...
Preview of Results:
                              review_comment_message  review_score  \
0  Não testei o produto ainda, mas ele veio corre...           4.0   
1                               Muito bom o produto.           4.0   
2                                            neutral           5.0   
3  O produto foi exatamente o que eu esperava e e...           5.0   
4                                            neutral           5.0   
5                                            neutral           4.0   
6                                            neutral           5.0   
7                                            neutral           1.0   
8                                            neutral           5.0   
9                         Aguardando retorno da loja           1.0   

   sentiment_score  
0                3  
1                5  
2                3  
3                5  
4                3  
5                3  
6  

since we ran the above code on 1000 rows, we now need to run it on the entire dataset of 110k rows.

In [4]:
# --- PRODUCTION RUN ---

# 1. Choose your size
# Set this to None to run ALL data (Best for final project)
# Set to 20000 if you want to finish in ~15 mins for testing
SAMPLE_SIZE = None 

if SAMPLE_SIZE:
    print(f"Running Fast Track on {SAMPLE_SIZE} rows...")
    df_to_process = df.head(SAMPLE_SIZE).copy()
else:
    print("Running Full Production on ALL rows (This may take a while)...")
    df_to_process = df.copy()

# 2. Optimized Processing Function
def get_sentiment_batches(text_list, batch_size=32):
    # We send text to the model in groups of 32 to speed up CPU processing
    results = []
    # tqdm makes the progress bar look nice
    for i in tqdm(range(0, len(text_list), batch_size)):
        batch = text_list[i : i + batch_size]
        predictions = sentiment_pipeline(batch, truncation=True, max_length=512)
        # Extract just the star rating (integer)
        scores = [int(p['label'].split()[0]) for p in predictions]
        results.extend(scores)
    return results

# 3. Run It
texts = df_to_process['review_comment_message'].tolist()
print("Starting batch processing...")

# This is where the magic happens
df_to_process['sentiment_score'] = get_sentiment_batches(texts)

# 4. Save Final Output
output_file = os.path.join(PROCESSED_PATH, "transactions_with_sentiment.parquet")
df_to_process.to_parquet(output_file, index=False)

print(f"DONE! Processed data saved to: {output_file}")
print(df_to_process[['review_score', 'sentiment_score']].head())

Running Full Production on ALL rows (This may take a while)...
Starting batch processing...


100%|██| 3464/3464 [3:39:29<00:00,  3.80s/it]


DONE! Processed data saved to: ..\data\processed\transactions_with_sentiment.parquet
   review_score  sentiment_score
0           4.0                3
1           4.0                5
2           5.0                3
3           5.0                5
4           5.0                3


We now want to do Transformation or aggregation. We will do a time series and aggregate this data by day.