In [1]:
import re
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
import nltk
import json
import os
import datetime
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from scipy.stats import loguniform
import random

# Configuracion
warnings.filterwarnings('ignore')
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)


In [8]:
# Parámetros
input_file = r"C:\Users\Vicky\Desktop\All_Beauty.jsonl"
output_file = r"C:\Users\Vicky\Documents\ML_Amazon_Reviews\data_sample\raw/dataset_sample_final.jsonl"
sample_fraction = 0.01
random_seed = 42

# Contar líneas totales y extraer muestra
with open(input_file, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)
sample_size = max(1, int(total_lines * sample_fraction))
selected_lines = sorted(random.sample(range(total_lines), sample_size))

with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    for i, line in enumerate(f_in):
        if i in selected_lines:
            f_out.write(line)

print(f"Muestra guardada en {output_file}, tamaño: {sample_size} filas")

# Cargar el JSONL en un DataFrame
df_sample = pd.read_json(output_file, lines=True)
print(f"DataFrame creado con {len(df_sample)} filas.")


Muestra guardada en C:\Users\Vicky\Documents\ML_Amazon_Reviews\data_sample\raw/dataset_sample_final.jsonl, tamaño: 7015 filas
DataFrame creado con 7015 filas.


In [9]:
# Limpieza y preprocesamiento
df_reviews_clean = df_sample.copy().dropna().drop(columns=['images', 'asin', 'parent_asin', 'user_id', 'timestamp']).drop_duplicates()
df_reviews_clean['text'] = df_reviews_clean['text'].apply(str) + ' ' + df_reviews_clean['title'].apply(str)
df_reviews_clean['text'] = df_reviews_clean['text'].str.lower()

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def limpiar_texto(texto):
    if not isinstance(texto, str):
        return ""
    texto_limpio = re.sub(r'[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ\s]', '', texto)
    palabras = texto_limpio.split()
    palabras_filtradas = [palabra for palabra in palabras if palabra not in stop_words]
    return ' '.join(palabras_filtradas)

df_reviews_clean['text_limpio'] = df_reviews_clean['text'].apply(limpiar_texto)
df_reviews_clean = df_reviews_clean.drop(columns=['text'])

# Clasificación del sentimiento
df_reviews_clean['sentimiento'] = pd.cut(df_reviews_clean['rating'], bins=[0, 2, 3, 5], labels=['negativo', 'neutral', 'positivo'], right=True)

nltk.download('punkt')
df_reviews_clean['tokens'] = df_reviews_clean['text_limpio'].apply(lambda x: word_tokenize(str(x).lower()))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vicky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vicky\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# División de datos
train_df, test_df = train_test_split(df_reviews_clean, test_size=0.2, random_state=42, stratify=df_reviews_clean['sentimiento'])

# Vectorización TF-IDF
vectorizer = TfidfVectorizer(max_features=10_000)
X_train_tfidf = vectorizer.fit_transform(train_df['text_limpio']).astype('float32')
X_test_tfidf = vectorizer.transform(test_df['text_limpio'])
y_train = train_df['rating']
y_test = test_df['rating']

In [11]:
# Ruta de guardado de modelos
model_path = r'C:\Users\Vicky\Documents\ML_Amazon_Reviews\models'

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Definir el modelo base
lr_model = LogisticRegression(max_iter=500)

# Definir los hiperparámetros a probar
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Control de regularización
    'penalty': ['l1', 'l2'],  # Tipos de regularización
    'solver': ['liblinear', 'saga']  # Solvers compatibles con l1 y l2
}

# Configurar GridSearchCV
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Ajustar el modelo con los datos de entrenamiento
grid_search.fit(X_train_tfidf, y_train)

# Obtener el mejor modelo y hacer predicciones
best_lr_model_grid = grid_search.best_estimator_
y_pred_lr_grid = best_lr_model_grid.predict(X_test_tfidf)

# Evaluar el modelo optimizado
print("🔹 Mejor parámetro encontrado (Grid Search):", grid_search.best_params_)
print("🔹 Regresión Logística Optimizada (Grid Search) - Precisión en Test:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr_grid))
print(classification_report(y_test, y_pred_lr_grid))


🔹 Mejor parámetro encontrado (Grid Search): {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
🔹 Regresión Logística Optimizada (Grid Search) - Precisión en Test:
Accuracy: 0.7317073170731707
              precision    recall  f1-score   support

           1       0.67      0.69      0.68       204
           2       0.41      0.15      0.22        88
           3       0.47      0.29      0.36       106
           4       0.46      0.25      0.32       164
           5       0.80      0.96      0.87       832

    accuracy                           0.73      1394
   macro avg       0.56      0.47      0.49      1394
weighted avg       0.69      0.73      0.70      1394

