In [1]:
import pandas as pd
import numpy as np
from LeIA import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Define custom transformer for sentiment analysis using LeIA
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        compound_scores = []
        neg_scores = []
        pos_scores = []
        neu_scores = []

        for review in X:
            s = self.analyzer.polarity_scores(review)
            compound_scores.append(s['compound'])
            neg_scores.append(s['neg'])
            pos_scores.append(s['pos'])
            neu_scores.append(s['neu'])
        
        polarity = np.select(
            [np.array(compound_scores) > 0.0, np.array(compound_scores) < 0.0, np.array(compound_scores) == 0.0],
            [1, -1, 0]
        )
        
        return pd.DataFrame({
            'comp_score': compound_scores,
            'neg': neg_scores,
            'pos': pos_scores,
            'neu': neu_scores,
            'polarity': polarity
        })

def create_pipeline():
    # Define text processing pipeline
    text_features = Pipeline([
        ('sentiment', SentimentAnalyzer()),
    ])
    
    # Full pipeline
    pipeline = Pipeline([
        ('preprocessor', text_features),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    return pipeline

# Load and prepare the data
def load_data(file_path):
    return pd.read_parquet(file_path)


In [3]:
file_path6 = '../data/outputs/score_df_3.parquet'  # Atualize o caminho conforme necessário
df = load_data(file_path6)

# Prepare training data
df_train = df.copy()
X = df_train['review_comment_message']  # Usar apenas a coluna de texto
y = df_train['review_score']

# Create and train model
model_rf = create_pipeline()
model_rf.fit(X, y)

# Function to predict rating based on user input
def predict_rating(review_text):
    # Create DataFrame for new input
    new_data = pd.DataFrame({
        'review_comment_message': [review_text]
    })

    # Make prediction
    prediction = model_rf.predict(new_data)
    
    return prediction[0]


In [4]:

# Example usage
review_text = "odiei a compra"
predicted_rating = predict_rating(review_text)

print(f"The predicted rating for the review is: {predicted_rating}")

The predicted rating for the review is: 5
