In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from LeIA import SentimentIntensityAnalyzer
import pickle

In [5]:
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns texts into sentiment-score features.

    Each input text is scored with a ``SentimentIntensityAnalyzer`` (imported
    from ``LeIA``) and the ``compound``, ``neg``, ``pos`` and ``neu`` entries
    of the result are exposed as numeric feature columns.
    """

    def __init__(self):
        # The analyzer is created once and reused for every transform call.
        self.analyzer = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the estimator API."""
        return self

    def transform(self, X):
        """Score every text in ``X`` and return the scores as a DataFrame.

        Parameters
        ----------
        X : iterable
            Review texts; each element is passed to
            ``self.analyzer.polarity_scores``.

        Returns
        -------
        pandas.DataFrame
            One row per element of ``X`` (in iteration order), with columns
            ``comp_score``, ``neg``, ``pos`` and ``neu``.
        """
        # Output column name -> key in the dict returned by polarity_scores.
        column_to_key = {
            'comp_score': 'compound',
            'neg': 'neg',
            'pos': 'pos',
            'neu': 'neu',
        }

        # Score every review once; the columns are then sliced out of the results.
        all_scores = [self.analyzer.polarity_scores(text) for text in X]

        return pd.DataFrame({
            column: [scores[key] for scores in all_scores]
            for column, key in column_to_key.items()
        })

# Pass-through transform function (used in place of a lambda)
def identity(X):
    """Return ``X`` unchanged.

    Defined as a named module-level function rather than a lambda;
    presumably so that objects holding a reference to it can be pickled,
    which the standard ``pickle`` module cannot do for lambdas.
    """
    return X

def create_pipeline(max_features=1000, n_estimators=100, random_state=42):
    """Build the (unfitted) review-classification pipeline.

    The pipeline concatenates two feature blocks computed from the raw text
    and feeds them to a random forest:

    * ``sentiment``: the texts pass through a no-op ``extract`` step and then
      through ``SentimentAnalyzer``, which yields sentiment-score columns.
    * ``tfidf``: a ``TfidfVectorizer`` over the same texts.

    Parameters
    ----------
    max_features : int or None, default=1000
        Forwarded to ``TfidfVectorizer(max_features=...)``.
    n_estimators : int, default=100
        Forwarded to ``RandomForestClassifier(n_estimators=...)``.
    random_state : int or None, default=42
        Forwarded to ``RandomForestClassifier(random_state=...)``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``features`` (a ``FeatureUnion``) and ``clf``.
        Calling the function with no arguments gives the same configuration
        as before these parameters were introduced.
    """
    sentiment_branch = Pipeline([
        # `identity` returns its input unchanged; the step is kept so existing
        # step names (e.g. for set_params) stay valid. It is a module-level
        # function, not a lambda.
        ('extract', FunctionTransformer(identity, validate=False)),
        ('analyzer', SentimentAnalyzer()),
    ])

    features = FeatureUnion([
        ('sentiment', sentiment_branch),
        ('tfidf', TfidfVectorizer(max_features=max_features)),
    ])

    pipeline = Pipeline([
        ('features', features),
        ('clf', RandomForestClassifier(n_estimators=n_estimators,
                                       random_state=random_state)),
    ])

    return pipeline

def load_data(file_path):
    """Read the Parquet file at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_parquet(file_path)
    return frame

In [6]:
file_path6 = '../data/outputs/score_df_3.parquet'  # Adjust the path as needed
df = load_data(file_path6)

# Train only on rows that have both a review text and a score: missing values
# are not valid input for the text vectorizer or as classifier labels.
# dropna returns a new frame, so `df` itself is left untouched; when no values
# are missing, df_train has the same rows as df.
df_train = df.dropna(subset=['review_comment_message', 'review_score'])
X = df_train['review_comment_message']  # Features (text reviews)
y = df_train['review_score']  # Labels (scores)

model_rf = create_pipeline()
model_rf.fit(X, y)

# Save the trained model using pickle.
# NOTE: pickle stores classes and functions by reference, so SentimentAnalyzer
# and identity must be defined/importable under the same names wherever
# 'model_rf.pkl' is loaded.
with open('model_rf.pkl', 'wb') as f:
    pickle.dump(model_rf, f)