# Imports

In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
#from nltk.sentiment import SentimentIntensityAnalyzer
from LeIA import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

# Functions

In [2]:
# Define custom transformer for sentiment analysis using LeIA
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    """Transformer that converts review texts into LeIA sentiment features.

    For every input text it stores the four values returned by
    ``SentimentIntensityAnalyzer.polarity_scores`` (``compound``, ``neg``,
    ``pos``, ``neu``) plus a ``polarity`` flag derived from the sign of the
    compound score.
    """

    def __init__(self):
        # The analyzer is created once and reused for every transform call.
        self.analyzer = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Score every review in ``X`` and return the sentiment features.

        Parameters
        ----------
        X : iterable
            One review text per element (for example the 1-D column that a
            ``ColumnTransformer`` hands over when the column is selected by a
            single string name). Elements that are not ``str`` (``None``,
            ``NaN``, ...) are scored as an empty string instead of crashing
            the analyzer.

        Returns
        -------
        pandas.DataFrame
            One row per input element with the columns ``comp_score``,
            ``neg``, ``pos``, ``neu`` and ``polarity``. ``polarity`` is 1
            when the compound score is above zero, -1 when it is below zero
            and 0 otherwise.
        """
        compound_scores = []
        neg_scores = []
        pos_scores = []
        neu_scores = []

        for review in X:
            # Missing reviews reach this point as None/NaN; the analyzer only
            # understands text, so anything that is not a string is treated
            # as an empty review.
            text = review if isinstance(review, str) else ''
            s = self.analyzer.polarity_scores(text)
            compound_scores.append(s['compound'])
            neg_scores.append(s['neg'])
            pos_scores.append(s['pos'])
            neu_scores.append(s['neu'])

        # Build the array once and reuse it for all three sign tests.
        compound = np.asarray(compound_scores, dtype=float)
        polarity = np.select(
            [compound > 0.0, compound < 0.0, compound == 0.0],
            [1, -1, 0]
        )

        return pd.DataFrame({
            'comp_score': compound_scores,
            'neg': neg_scores,
            'pos': pos_scores,
            'neu': neu_scores,
            'polarity': polarity
        })

def create_pipeline():
    """Assemble the unfitted preprocessing + random-forest pipeline.

    The review text column is routed through ``SentimentAnalyzer``; the
    numeric columns are mean-imputed and standardised. The combined features
    feed a ``RandomForestClassifier`` with 100 trees and a fixed seed of 42.

    Returns
    -------
    Pipeline
        Two steps: ``'preprocessor'`` (the column router) and ``'clf'``.
    """
    # Columns that go through imputation + scaling.
    numeric_columns = [
        'payment_installments', 'payment_value', 'product_name_length',
        'product_description_length', 'product_photos_qty', 'product_weight_g',
        'freight_value_factor', 'actual_est_delivery_diff',
        'delivery_duration', 'delivery_time_diff'
    ]

    # Branch for the free-text review column.
    sentiment_branch = Pipeline([
        ('sentiment', SentimentAnalyzer()),
    ])

    # Branch for the numeric columns: fill gaps with the mean, then scale.
    imputer_step = ('imputer', SimpleImputer(strategy='mean'))
    scaler_step = ('scaler', StandardScaler())
    numeric_branch = Pipeline([imputer_step, scaler_step])

    # Route each group of columns to its branch.
    column_router = ColumnTransformer(
        transformers=[
            ('text', sentiment_branch, 'review_comment_message'),
            ('num', numeric_branch, numeric_columns)
        ]
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    return Pipeline([
        ('preprocessor', column_router),
        ('clf', forest)
    ])

# Load the dataset from disk
def load_data(file_path):
    """Read the parquet file at ``file_path`` and return the resulting DataFrame."""
    frame = pd.read_parquet(file_path)
    return frame

# Main

In [3]:
# Download the 'punkt' and 'stopwords' NLTK packages (a no-op when already up to date).
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Filter 1 // 2, 3 and 4 // 5
# NOTE(review): presumably describes how review scores were grouped when this
# file was produced — confirm against the notebook that writes score_df_3.
file_path6 = '../data/outputs/score_df_3.parquet'
df = load_data(file_path6)

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'review_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'product_category_name',
       'product_name_length', 'product_description_length',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'comp_score', 'neg', 'pos', 'neu',
       'polarity', 'review_score_factor', 'price_fa

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mater\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mater\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Add derived features
# Binary flag: 1 when the row's freight value is at or above the dataset median, else 0.
df['freight_value_factor'] = np.where(df['freight_value'] >= df['freight_value'].median(), 1, 0)

# Prepare training data
df_train = df.copy()

# Fail fast when the target column is missing. The previous fallback silently
# replaced it with an all-zeros Series, which trains a model that can only
# ever predict 0.
if 'review_score' not in df_train.columns:
    raise KeyError("'review_score' column is required to train the model")

# Features: everything except the target; 'price_factor' is removed too when present.
X = df_train.drop(columns=['review_score', 'price_factor'], errors='ignore')
y = df_train['review_score']

# Create and train model
model_rf = create_pipeline()
model_rf.fit(X, y)

# Function to predict rating based on user input
def predict_rating(review_text, payment_installments, payment_value, product_name_length,
                   product_description_length, product_photos_qty, product_weight_g,
                   actual_est_delivery_diff, delivery_duration, delivery_time_diff,
                   freight_value=None):
    """Predict the review score for a single order with the trained ``model_rf``.

    Parameters
    ----------
    review_text : str
        Review comment; stored in the ``review_comment_message`` column.
    payment_installments, payment_value, product_name_length,
    product_description_length, product_photos_qty, product_weight_g,
    actual_est_delivery_diff, delivery_duration, delivery_time_diff
        Values for the feature columns of the same names.
    freight_value : float, optional
        Freight value of the order. It is compared with the median of
        ``df['freight_value']`` to build ``freight_value_factor`` the same
        way the training cell does (1 when at or above the median, else 0).
        When omitted the factor is left as NaN, so the mean imputer in the
        pipeline built by ``create_pipeline`` fills it in.

    Returns
    -------
    The first (and only) element of ``model_rf.predict`` for the one-row frame.

    Notes
    -----
    Reads the module-level ``df`` and ``model_rf``. Earlier versions derived
    ``freight_value_factor`` from ``product_weight_g``, which is a different
    quantity from the freight value the model was trained on.
    """
    if freight_value is None:
        # Unknown freight: hand NaN to the pipeline instead of inventing a flag.
        freight_value_factor = np.nan
    else:
        # Same rule as at training time: at/above the median freight -> 1, below -> 0.
        freight_value_factor = int(freight_value >= df['freight_value'].median())

    # Create DataFrame for new input
    new_data = pd.DataFrame({
        'review_comment_message': [review_text],
        'payment_installments': [payment_installments],
        'payment_value': [payment_value],
        'product_name_length': [product_name_length],
        'product_description_length': [product_description_length],
        'product_photos_qty': [product_photos_qty],
        'product_weight_g': [product_weight_g],
        'freight_value_factor': [freight_value_factor],
        'actual_est_delivery_diff': [actual_est_delivery_diff],
        'delivery_duration': [delivery_duration],
        'delivery_time_diff': [delivery_time_diff]
    })

    # Make prediction
    prediction = model_rf.predict(new_data)

    return prediction[0]


In [26]:

# Example usage: score a single hand-written review
review_text = "odiei a compra"  # Portuguese for "I hated the purchase"

# Payment
payment_installments = 10
payment_value = 1.0

# Product listing
product_name_length = 20
product_description_length = 10
product_photos_qty = 1
product_weight_g = 5000

# Delivery
actual_est_delivery_diff = 1
delivery_duration = 100
delivery_time_diff = 1

# Keyword arguments make it explicit which value feeds which feature.
predicted_rating = predict_rating(
    review_text=review_text,
    payment_installments=payment_installments,
    payment_value=payment_value,
    product_name_length=product_name_length,
    product_description_length=product_description_length,
    product_photos_qty=product_photos_qty,
    product_weight_g=product_weight_g,
    actual_est_delivery_diff=actual_est_delivery_diff,
    delivery_duration=delivery_duration,
    delivery_time_diff=delivery_time_diff,
)

print(f"The predicted rating for the review is: {predicted_rating}")

The predicted rating for the review is: 1
