In [None]:
import re
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
import nltk
import json
import os
import datetime
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from scipy.stats import loguniform
import random

# Configuration
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Global matplotlib font sizes, applied in a single rcParams update.
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})


In [None]:
# Parameters
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data directory.
input_file = r"C:\Users\Vicky\Desktop\All_Beauty.jsonl"
output_file = r"C:\Users\Vicky\Documents\ML_Amazon_Reviews_Sentiment_Analysis\data_sample\raw/dataset_sample_final.jsonl"
sample_fraction = 0.01
random_seed = 42

# First pass: count the total number of lines in the input file.
with open(input_file, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)
if total_lines == 0:
    raise ValueError(f"Input file is empty, nothing to sample: {input_file}")

# At least one line, never more than the file contains.
sample_size = min(total_lines, max(1, int(total_lines * sample_fraction)))

# Use a dedicated, seeded generator so the same lines are drawn on every run
# without touching the global `random` state.
rng = random.Random(random_seed)
selected_lines = sorted(rng.sample(range(total_lines), sample_size))
# Set for O(1) membership checks while streaming through the file.
selected_set = set(selected_lines)

# Make sure the destination directory exists before writing the sample.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Second pass: copy only the selected lines to the output file.
with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    for i, line in enumerate(f_in):
        if i in selected_set:
            f_out.write(line)

print(f"Muestra guardada en {output_file}, tamaño: {sample_size} filas")

# Load the sampled JSONL file into a DataFrame
df_sample = pd.read_json(output_file, lines=True)
print(f"DataFrame creado con {len(df_sample)} filas.")


In [9]:
# Cleaning and preprocessing
# Work on a copy of the sample: drop rows with missing values, then the
# listed columns, then duplicated rows.
columnas_descartadas = ['images', 'asin', 'parent_asin', 'user_id', 'timestamp']
df_reviews_clean = (
    df_sample.copy()
    .dropna()
    .drop(columns=columnas_descartadas)
    .drop_duplicates()
)

# Join review body and title into a single lower-cased text field.
texto_con_titulo = df_reviews_clean['text'].map(str) + ' ' + df_reviews_clean['title'].map(str)
df_reviews_clean['text'] = texto_con_titulo.str.lower()

# English stop words from NLTK, kept as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def limpiar_texto(texto, palabras_vacias=None):
    """Strip non-letter characters and stop words from a piece of text.

    Every character that is not an ASCII letter, one of the accented
    letters listed in the pattern, or whitespace is replaced by a space,
    so words separated only by punctuation stay separate ("good,but"
    becomes "good but" rather than "goodbut"). The text is then split on
    whitespace and the stop words are removed.

    Args:
        texto: Text to clean. Any non-string value yields "".
        palabras_vacias: Optional collection of words to remove. When
            omitted, the module-level ``stop_words`` set is used.
            Matching is case-sensitive.

    Returns:
        The remaining words joined by single spaces.
    """
    if not isinstance(texto, str):
        return ""
    if palabras_vacias is None:
        palabras_vacias = stop_words
    # Replace (rather than delete) unwanted characters to preserve word boundaries.
    texto_limpio = re.sub(r'[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ\s]', ' ', texto)
    return ' '.join(
        palabra for palabra in texto_limpio.split() if palabra not in palabras_vacias
    )

# Replace the raw text column with its cleaned version.
df_reviews_clean['text_limpio'] = df_reviews_clean['text'].apply(limpiar_texto)
df_reviews_clean = df_reviews_clean.drop(columns=['text'])

# Sentiment label derived from the rating with right-closed bins:
# (0, 2] -> negativo, (2, 3] -> neutral, (3, 5] -> positivo.
df_reviews_clean['sentimiento'] = pd.cut(df_reviews_clean['rating'], bins=[0, 2, 3, 5], labels=['negativo', 'neutral', 'positivo'], right=True)

# Tokenizer resources: recent NLTK releases load 'punkt_tab' for
# word_tokenize while older ones use 'punkt', so request both. A resource
# unknown to the installed version is only reported, not raised.
for recurso in ('punkt', 'punkt_tab'):
    nltk.download(recurso)
df_reviews_clean['tokens'] = df_reviews_clean['text_limpio'].apply(lambda x: word_tokenize(str(x).lower()))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vicky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vicky\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Train/test split (80/20), stratified on the sentiment label
train_df, test_df = train_test_split(df_reviews_clean, test_size=0.2, random_state=42, stratify=df_reviews_clean['sentimiento'])

# TF-IDF vectorization: the vocabulary is learned on the training split only
# and then applied to the test split.
vectorizer = TfidfVectorizer(max_features=10_000)
X_train_tfidf = vectorizer.fit_transform(train_df['text_limpio']).astype('float32')
# Cast the test matrix as well so both splits share the same dtype.
X_test_tfidf = vectorizer.transform(test_df['text_limpio']).astype('float32')

# NOTE(review): the target is the raw 'rating' column, while the split is
# stratified on 'sentimiento' — confirm which of the two is the intended label.
y_train = train_df['rating']
y_test = test_df['rating']

In [30]:
# Label encoder fitted on the training targets (fit returns the encoder itself)
le = LabelEncoder().fit(y_train)

# Directory where trained models are saved
model_path = r'C:\Users\Vicky\Documents\ML_Amazon_Reviews_Sentiment_Analysis\models'

In [None]:
# Logistic regression tuned with randomized search
# The liblinear and saga solvers shuffle the data, so the estimator is seeded
# as well; the search's own random_state only controls parameter sampling.
lr_model = LogisticRegression(max_iter=500, random_state=42)

# Search space: regularization strength (log-uniform), penalty type,
# solver and class weighting.
param_dist_lr = {
    'C': loguniform(1e-5, 100),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

# 100 sampled configurations, 5-fold cross-validation, scored by macro F1.
random_search_lr = RandomizedSearchCV(
    lr_model, param_distributions=param_dist_lr, n_iter=100, cv=5, scoring='f1_macro', n_jobs=-1, random_state=42
)
random_search_lr.fit(X_train_tfidf, y_train)

# Best estimator found by the search and its predictions on the test split.
best_lr_model = random_search_lr.best_estimator_
y_pred_lr = best_lr_model.predict(X_test_tfidf)

In [None]:
# Results report: best hyper-parameters, test accuracy and per-class metrics
precision_test_lr = accuracy_score(y_test, y_pred_lr)
informe_lr = classification_report(y_test, y_pred_lr)

print("Mejores parámetros encontrados:", random_search_lr.best_params_)
print("Regresión Logística Optimizado - Precisión en Test:")
print("Accuracy:", precision_test_lr)
print(informe_lr)

In [None]:
# Save the optimized model
# The save routine runs once; the pickle is written a single time.
# Compute the shared values once so the metadata, the file name and the
# printed summary all agree.
test_accuracy_lr = accuracy_score(y_test, y_pred_lr)
saved_at_lr = datetime.datetime.now()

# Everything stored in the pickle: the fitted model, the full search object,
# the vectorizer, the label encoder and descriptive metadata.
model_components_lr = {
    'model': best_lr_model,
    'random_search': random_search_lr,
    'vectorizer': vectorizer,
    'label_encoder': le,
    'metadata': {
        'model_type': 'Optimized Logistic Regression (RandomizedSearchCV)',
        'best_parameters': random_search_lr.best_params_,
        'best_score': random_search_lr.best_score_,
        'test_accuracy': test_accuracy_lr,
        'classification_report': classification_report(y_test, y_pred_lr, output_dict=True),
        'training_date': saved_at_lr.strftime("%Y-%m-%d %H:%M:%S"),
        'features': f"{X_train_tfidf.shape[1]} features TF-IDF",
        'classes': list(le.classes_),
        'cv_results': random_search_lr.cv_results_
    }
}

# File name carries the save date (YYYYMMDD).
filename_lr = f"optimized_logistic_regression_random_final{saved_at_lr.strftime('%Y%m%d')}.pkl"
full_path_lr = os.path.join(model_path, filename_lr)

# Make sure the target directory exists before writing.
os.makedirs(model_path, exist_ok=True)
with open(full_path_lr, 'wb') as file_lr:
    pickle.dump(model_components_lr, file_lr)

print(f"✅ Modelo de Regresión Logística optimizado guardado exitosamente en:\n{full_path_lr}")
print(f"\n Mejores parámetros: {random_search_lr.best_params_}")
print(f" Mejor score (CV): {random_search_lr.best_score_:.4f}")
print(f" Accuracy (test): {test_accuracy_lr:.4f}")









