# **Sentiment Analysis of Olist Customer Reviews**

Data Loading → Data Cleaning → Handling Missing Values → Text Preprocessing (spaCy) → Sentiment Label Creation → Feature Extraction (TF-IDF) → Train-Test Split → Model Training (Logistic Regression) → Model Evaluation → New Review Prediction → Model Saving & Deployment Script

# Library Installation and Imports

In [21]:
# !python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import pandas as pd
import numpy as np

import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import pickle

# Data Loading and Initial Exploration

In [2]:
data = pd.read_csv('olist_order_reviews_dataset.csv')

In [3]:
data.shape

(99224, 7)

In [4]:
data.columns

Index(['review_id', 'order_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp'],
      dtype='object')

In [5]:
pd.set_option('display.max_columns', None)

data.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [6]:
data.tail()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
99219,574ed12dd733e5fa530cfd4bbf39d7c9,2a8c23fee101d4d5662fa670396eb8da,5,,,2018-07-07 00:00:00,2018-07-14 17:18:30
99220,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,,,2017-12-09 00:00:00,2017-12-11 20:06:42
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43
99222,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,,,2018-07-01 00:00:00,2018-07-02 12:59:13
99223,efe49f1d6f951dd88b51e6ccd4cc548f,90531360ecb1eec2a1fbb265a0db0508,1,,"meu produto chegou e ja tenho que devolver, po...",2017-07-03 00:00:00,2017-07-03 21:01:49


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


# Data Copy and Missing Value Check

In [8]:
# create copy of original data before making changes
data1 = data

In [9]:
data1.isnull().sum()

# There are missing values in the columns of 'review_comment_title', 'review_comment_message'. We will see later on to handle this.

Unnamed: 0,0
review_id,0
order_id,0
review_score,0
review_comment_title,87656
review_comment_message,58247
review_creation_date,0
review_answer_timestamp,0


# Column Selection for Sentiment Analysis

*Kept:*
* review_score → Target variable (sentiment label)
* review_comment_message → Main text feature for sentiment

*Dropped:*
* review_id, order_id → Identifiers (not useful)
* review_comment_title → Redundant, sentiment already in message
* review_creation_date, review_answer_timestamp → Metadata, not relevant

# Data Cleaning and Preprocessing (Step 1)

In [10]:
# Remove irrelevant columns
data1 = data1.drop(['review_id', 'order_id', 'review_comment_title', 'review_creation_date', 'review_answer_timestamp'], axis=1)


In [11]:
for col in data1.columns:
    print(f"\n\n================ Details of '{col}' ================")

    # Unique values
    print(f"\nUnique values of '{col}':\n\n{data1[col].unique()}")

    # Numeric columns → min & max
    if data1[col].dtype != 'O':   # 'O' means object (non-numeric)
        print(f"\nMinimum value in '{col}': {data1[col].min()}")
        print(f"Maximum value in '{col}': {data1[col].max()}")

    # Value counts and Percentages
    vc = data1[col].value_counts()
    perc = ((vc / len(data1)) * 100).round(2)  # rounded percentages

    if vc.shape[0] > 30:  # More than 30 unique values
        print(f"\nValue counts of '{col}' (Top 10):\n\n{vc.head(10)}")
        print(f"\nValue counts of '{col}' (Bottom 10):\n\n{vc.tail(10)}")

        print(f"\nPercentage distribution of '{col}' (Top 10):\n\n{perc.head(10)}")
        print(f"\nPercentage distribution of '{col}' (Bottom 10):\n\n{perc.tail(10)}")

    else:   # Less than 30 unique values
        print(f"\nValue counts of '{col}':\n\n{vc}")
        print(f"\nPercentage distribution of '{col}':\n\n{perc}")




Unique values of 'review_score':

[4 5 1 3 2]

Minimum value in 'review_score': 1
Maximum value in 'review_score': 5

Value counts of 'review_score':

review_score
5    57328
4    19142
1    11424
3     8179
2     3151
Name: count, dtype: int64

Percentage distribution of 'review_score':

review_score
5    57.78
4    19.29
1    11.51
3     8.24
2     3.18
Name: count, dtype: float64



Unique values of 'review_comment_message':

[nan 'Recebi bem antes do prazo estipulado.'
 'Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa'
 ...
 'O produto não foi enviado com NF, não existe venda sem NF, com certeza fico no aguardo do envio da NF podendo ser por e-mail.'
 'Excelente mochila, entrega super rápida. Super recomendo essa loja!'
 'meu produto chegou e ja tenho que devolver, pois está com defeito , não segurar carga']

Value counts of 'review_comment_message' (Top 10):

review_comment_message
Muito bom    230
Bom          189
muito bom 

# Handling Missing Reviews

In [12]:
# We are imputing blank reviews (missing or empty review_comment_message)
# with default text labels in Portuguese according to their review_score
# Define mapping from score to default text (Portuguese labels)

score_to_text = {
    1: "muito ruim",   # very bad
    2: "ruim",         # bad
    3: "neutro",       # neutral
    4: "bom",          # good
    5: "muito bom"     # very good
}

# Impute missing or empty review messages with default text based on review_score
data1['review_comment_message'] = data1.apply(
    lambda row: score_to_text[row['review_score']] if pd.isna(row['review_comment_message']) or str(row['review_comment_message']).strip() == ""
    else row['review_comment_message'],axis=1)

# Check again
print(data1.isnull().sum())

review_score              0
review_comment_message    0
dtype: int64


# Text Preprocessing with spaCy

In [14]:
# Load Portuguese language model in spaCy
import spacy

# Disable parser and ner for speed (we only need tokenization + lemmatization + stopwords)
nlp = spacy.load("pt_core_news_sm", disable=["parser","ner"])

# Preprocessing function..
def preprocess(text):
    text = str(text).lower()                      # convert to string and lowercase
    text = re.sub(r"http\S+|www\S+", " ", text)   # remove URLs
    text = re.sub(r"<.*?>", " ", text)            # remove HTML tags

    # Keep only letters, accented Portuguese characters and spaces (remove digits/punctuation)
    text = re.sub(r"[^a-zà-ú\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()     # remove multiple spaces

    # Process text with spaCy (tokenization, lemmatization, stopwords removal)
    doc = nlp(text)
    tokens = [
        token.lemma_   # Lemmatization → convert word to its root form
        for token in doc
        if not token.is_stop and token.is_alpha   # remove stopwords and non-alphabetic tokens
    ]

    # Return preprocessed text as a single string
    return " ".join(tokens)

# Apply preprocessing to all reviews
data1["clean_text"] = data1["review_comment_message"].apply(preprocess)

# Show a few examples
data1[["review_comment_message", "clean_text"]].head()

Unnamed: 0,review_comment_message,clean_text
0,bom,
1,muito bom,
2,muito bom,
3,Recebi bem antes do prazo estipulado.,recebi prazo estipular
4,Parabéns lojas lannister adorei comprar pela I...,parabéns loja lannister adorar comprar Interne...


In [None]:
# checking head again..

data1.head()

Unnamed: 0,review_score,review_comment_message,clean_text
0,4,bom,
1,5,muito bom,
2,5,muito bom,
3,5,Recebi bem antes do prazo estipulado.,recebi prazo estipular
4,5,Parabéns lojas lannister adorei comprar pela I...,parabéns loja lannister adorar comprar Interne...


In [None]:
# check number of missing values in each column

data1.isnull().sum()

Unnamed: 0,0
review_score,0
review_comment_message,0
clean_text,0


# Sentiment Label Creation

In [18]:
# Define target labels (sentiment) based on review_score

# Mapping:
# 1–2 stars → 'negative'
# 3 stars → 'neutral'
# 4–5 stars → 'positive'

def map_sentiment(score):
    if score in [1, 2]:
        return "negative"
    elif score == 3:
        return "neutral"
    else:
        return "positive"

# Apply mapping to create a new column 'sentiment'
data1["sentiment"] = data1["review_score"].apply(map_sentiment)

# Check distribution of sentiment labels
print(data1["sentiment"].value_counts())


sentiment
positive    76470
negative    14575
neutral      8179
Name: count, dtype: int64


In [19]:
# Check first 5 rows again
data1.head()

Unnamed: 0,review_score,review_comment_message,clean_text,sentiment
0,4,bom,,positive
1,5,muito bom,,positive
2,5,muito bom,,positive
3,5,Recebi bem antes do prazo estipulado.,recebi prazo estipular,positive
4,5,Parabéns lojas lannister adorei comprar pela I...,parabéns loja lannister adorar comprar Interne...,positive


# Feature Extraction with TF-IDF

In [20]:
# Convert clean_text into numerical features using TF-IDF
# max_features=5000: limit vocabulary size
# ngram_range=(1,2): consider single words and two-word phrases

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

# Fit TF-IDF on clean_text and transform into feature matrix
X_tfidf = tfidf.fit_transform(data1["clean_text"])

# Target variable
Y = data1["sentiment"]

# Check shape of feature matrix
print(f"TF-IDF feature matrix shape: {X_tfidf.shape}")

TF-IDF feature matrix shape: (99224, 5000)


# Train-Test Split

In [21]:
# X_tfidf → features (numerical representation of clean_text)
# Y → target variable (sentiment labels)
# test_size=0.2 → 20% of data for testing, 80% for training
# random_state=42 → ensures reproducibility
# stratify=Y → maintain the same proportion of sentiment classes in train and test sets

X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf, Y, test_size=0.2, random_state=42, stratify=Y)

# Check the size of train and test sets
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}, Y_test shape: {Y_test.shape}")

X_train shape: (79379, 5000), X_test shape: (19845, 5000)
Y_train shape: (79379,), Y_test shape: (19845,)


# Model Training and Evaluation

In [23]:
# Train Logistic Regression model
# Logistic Regression is a baseline classifier suitable for multi-class sentiment analysis
# max_iter=200 ensures the model converges

model = LogisticRegression(max_iter=200)
model.fit(X_train, Y_train)

# Step: Predict sentiment on the test set
Y_pred = model.predict(X_test)

# Evaluate model performance
# classification_report → shows precision, recall, F1-score for each sentiment class
# confusion_matrix → shows actual vs predicted counts for each class

print("Classification Report:\n")
print(classification_report(Y_test, Y_pred))

print("Confusion Matrix:\n")
cm = confusion_matrix(Y_test, Y_pred)
print(cm)

Classification Report:

              precision    recall  f1-score   support

    negative       0.82      0.84      0.83      2915
     neutral       0.92      0.57      0.71      1636
    positive       0.95      0.98      0.96     15294

    accuracy                           0.93     19845
   macro avg       0.89      0.80      0.83     19845
weighted avg       0.92      0.93      0.92     19845

Confusion Matrix:

[[ 2437    31   447]
 [  284   939   413]
 [  260    50 14984]]


# Prediction on New Review

In [28]:
# Example new review (Portuguese)
new_review = ["O produto chegou rápido e em ótimo estado, recomendo!"]

# Preprocess review
new_review_clean = [preprocess(new_review[0])]

# Convert to TF-IDF features
new_review_tfidf = tfidf.transform(new_review_clean)

# Predict sentiment
predicted_sentiment = model.predict(new_review_tfidf)[0]
print("Predicted Sentiment:", predicted_sentiment)


Predicted Sentiment: positive


# Model Saving

In [29]:
# Save Logistic Regression model
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save TF-IDF vectorizer
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("Model and TF-IDF vectorizer saved successfully!")

Model and TF-IDF vectorizer saved successfully!


# Deployment Script

In [34]:
code = '''

# sentiment_predictor.py

import pickle
import re
import spacy

# Load spaCy Portuguese model

nlp = spacy.load("pt_core_news_sm", disable=["parser","ner"])

# Preprocessing function

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", " ", text)      # Remove URLs
    text = re.sub(r"<.*?>", " ", text)               # Remove HTML tags
    text = re.sub(r"[^a-zà-ú\s]", " ", text)         # Keep only letters and Portuguese characters
    text = re.sub(r"\s+", " ", text).strip()         # Remove extra spaces

    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)


# Load saved model and TF-IDF vectorizer

with open("sentiment_model.pkl", "rb") as f:
    model = pickle.load(f)
with open("tfidf_vectorizer.pkl", "rb") as f:
    tfidf = pickle.load(f)

# Take user input and predict sentiment

if __name__ == "__main__":
    review_input = "Péssima qualidade, não recomendo"
    clean_review = [preprocess(review_input)]
    X_input = tfidf.transform(clean_review)
    prediction = model.predict(X_input)
    print(f"Predicted Sentiment: {prediction[0]}")

'''

# Define the filename where the Python script will be saved
filename = 'sentiment_predictor.py'

# Write the code to the .py file
with open(filename, 'w') as file:
    file.write(code)

print(f"Script saved as {filename}")


Script saved as sentiment_predictor.py


  text = re.sub(r"[^a-zà-ú\s]", " ", text)         # Keep only letters and Portuguese characters
