In [10]:
import pandas as pd
import numpy as np
from LeIA import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [11]:
# Definir transformador de sentimentos
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        compound_scores = []
        neg_scores = []
        pos_scores = []
        neu_scores = []

        for review in X:
            s = self.analyzer.polarity_scores(review)
            compound_scores.append(s['compound'])
            neg_scores.append(s['neg'])
            pos_scores.append(s['pos'])
            neu_scores.append(s['neu'])
        
        polarity = np.select(
            [np.array(compound_scores) > 0.0, np.array(compound_scores) < 0.0, np.array(compound_scores) == 0.0],
            [1, -1, 0]
        )
        
        return pd.DataFrame({
            'comp_score': compound_scores,
            'neg': neg_scores,
            'pos': pos_scores,
            'neu': neu_scores,
            'polarity': polarity
        })

def create_pipeline(text_scale_factor=1.0):
    text_features = Pipeline([
        ('sentiment', SentimentAnalyzer()),
        ('scaler', FeatureScaler(scale_factor=text_scale_factor))
    ])
    
    numeric_features = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_features, 'review_comment_message'),
            ('num', numeric_features, [
                'payment_installments', 'payment_value', 'product_name_length',
                'product_description_length', 'product_photos_qty', 'product_weight_g',
                'freight_value_factor', 'actual_est_delivery_diff',
                'delivery_duration', 'delivery_time_diff'
            ])
        ]
    )
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    return pipeline

# Carregar dados
def load_data(file_path):
    return pd.read_parquet(file_path)

In [12]:
# Load data
file_path6 = '../data/outputs/score_df_3.parquet'  # Path to your parquet file
df = load_data(file_path6)

# Adicionar características derivadas
df['freight_value_factor'] = np.where(df['freight_value'] >= df['freight_value'].median(), 1, 0)

# Preparar dados para treino
df_train = df.copy()
X = df_train.drop('review_score', axis=1, errors='ignore')
y = df_train['review_score']

# Aplicar SMOTE para balancear os dados
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Criar e treinar modelo com dados balanceados
text_scale_factor = 2.0
model_rf = create_pipeline(text_scale_factor=text_scale_factor)
model_rf.fit(X_resampled, y_resampled)

# Avaliar o modelo
y_pred = model_rf.predict(X)
print(classification_report(y, y_pred))

# Função para prever a nota
def predict_rating(review_text, payment_installments, payment_value, product_name_length, 
                   product_description_length, product_photos_qty, product_weight_g, 
                   actual_est_delivery_diff, delivery_duration, delivery_time_diff):
    new_data = pd.DataFrame({
        'review_comment_message': [review_text],
        'payment_installments': [payment_installments],
        'payment_value': [payment_value],
        'product_name_length': [product_name_length],
        'product_description_length': [product_description_length],
        'product_photos_qty': [product_photos_qty],
        'product_weight_g': [product_weight_g],
        'freight_value_factor': [np.where(product_weight_g >= df['freight_value'].median(), 1, 0)],
        'actual_est_delivery_diff': [actual_est_delivery_diff],
        'delivery_duration': [delivery_duration],
        'delivery_time_diff': [delivery_time_diff]
    })

    prediction = model_rf.predict(new_data)
    
    return prediction[0]

# Testar função de previsão
review_text = "This product is excellent and arrived on time."
predicted_rating = predict_rating(review_text, 3, 150.0, 20, 100, 3, 500, 2, 5, 1)
print(f"The predicted rating for the review is: {predicted_rating}")

ValueError: could not convert string to float: 'e481f51cbdc54678b7cc49136f2d6af7'

In [8]:
# Example usage
review_text = "o produto é satisfatorio mas eu nunca mais compraria nessa loja"
payment_installments = 3
payment_value = 15.0
product_name_length = 20
product_description_length = 100
product_photos_qty = 3
product_weight_g = 500
actual_est_delivery_diff = 2
delivery_duration = 5
delivery_time_diff = 1

predicted_rating = predict_rating(review_text, payment_installments, payment_value, 
                                   product_name_length, product_description_length, 
                                   product_photos_qty, product_weight_g, 
                                   actual_est_delivery_diff, delivery_duration, 
                                   delivery_time_diff)

print(f"The predicted rating for the review is: {predicted_rating}")

The predicted rating for the review is: 5
