<a href="https://colab.research.google.com/github/surabhi-2404/product_recommendation_witn_webscrape/blob/main/DL_final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /electronics.zip

Archive:  /electronics.zip
   creating: electronics/
   creating: electronics/.ipynb_checkpoints/
  inflating: electronics/.ipynb_checkpoints/AirConditionerselectronics_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/electronics_code-checkpoint.ipynb  
  inflating: electronics/.ipynb_checkpoints/laptop_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/recommendation_model-checkpoint.ipynb  
  inflating: electronics/.ipynb_checkpoints/refrigerators_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/smartphones_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/televisions_dataset_flipkart-checkpoint.csv  
  inflating: electronics/AirConditionerselectronics_dataset_flipkart.csv  
  inflating: electronics/electronics_code.ipynb  
  inflating: electronics/label_encoder.pkl  
  inflating: electronics/laptop_dataset_flipkart.csv  
  inflating: electronics/products_data.pkl  
  infl

# Beauty and Care

 ML+DL (beauty and care)

In [None]:
# (beauty and care)

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

from collections import Counter
from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if len(all_files) == 0:
        raise Exception("No CSV files found in the folder!")

    df_list = [pd.read_csv(f) for f in all_files]
    df = pd.concat(df_list, ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df


# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


def compute_weighted_rating(df, m=None):
    C = df['overall_rating'].mean()
    if m is None:
        m = df['overall_rating'].quantile(0.25)

    df['weighted_rating'] = (
        (df['overall_rating'] * 1.0 + C * m) / (1 + m)
    )
    return df


# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
def train_sentiment_model(df):
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Select required columns
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()

    # Compute weighted rating
    df_model = compute_weighted_rating(df_model)

    # Check class distribution
    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Balance data
    max_size = max(class_counts.values())
    balanced_dfs = []

    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # Features
    df_balanced['rating_diff'] = (
        df_balanced['overall_rating'] - df_balanced['weighted_rating']
    )

    X = df_balanced[['overall_rating', 'weighted_rating', 'rating_diff']].values

    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Safe CV
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        y_pred = model.predict(X)
        y_true = y

        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)

        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))

    # Final model
    final_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")
    joblib.dump(scaler, "sentiment_scaler.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le


# ===============================
# AUTOENCODER RECOMMENDER
# ===============================
def train_autoencoder_recommender(df):
    print("\n--- TRAINING AUTOENCODER RECOMMENDER ---")

    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    input_dim = X.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(16, activation='relu')(input_layer)
    encoded = Dense(8, activation='relu')(encoded)
    decoded = Dense(input_dim, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(
    optimizer='adam',
    loss=tf.keras.losses.MeanSquaredError()
      )


    autoencoder.fit(X, X, epochs=30, batch_size=32, verbose=1)
    autoencoder.save("ae_recommender.h5")

    print("✅ Autoencoder recommender trained")


# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df['user_id'] = 0
    df['product_id'] = df.index

    df = df[
        ['product_name', 'product_price', 'sentiment',
         'sentiment_score', 'category', 'overall_rating']
    ].copy()

    df['product_price'] = (
        df['product_price'].astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    df = df.dropna(
        subset=['product_name', 'product_price',
                'sentiment_score', 'overall_rating', 'category']
    )

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = (
        df['product_name'].astype(str) + " " + df['category'].astype(str)
    )

    embeddings = model_embed.encode(
        df['text_for_embedding'].tolist(),
        convert_to_tensor=True
    )

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):
    ae_model = tf.keras.models.load_model("ae_recommender.h5")

    df = joblib.load("products_data.pkl")
    similarity_matrix = joblib.load("similarity_matrix.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    recon = ae_model.predict(X)
    recon_error = np.mean(np.square(X - recon), axis=1)

    df['ae_score'] = -recon_error
    df = df.sort_values('ae_score', ascending=False)

    input_embedding = model_embed.encode(
        user_input, convert_to_tensor=True
    )
    semantic_scores = cosine_similarity(
        input_embedding.reshape(1, -1), embeddings
    )[0]

    combined_scores = [(i, semantic_scores[i]) for i in range(len(df))]
    combined_scores.sort(key=lambda x: x[1], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")

    for idx, score in combined_scores[:top_n]:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(
            f"Rating: {p['overall_rating']} | "
            f"Sentiment: {p['sentiment']} | "
            f"Price: ₹{p['product_price']}"
        )
        print(f"Semantic Similarity Score: {round(score, 4)}")
        print("-" * 60)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    DATA_FOLDER = r"/content/beauty and care"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    df['product_price'] = (
    df['product_price']
    .astype(str)
    .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_sentiment_model(df)
    train_autoencoder_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name for recommendation (or type exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)



Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

Accuracy: 0.9954
              precision    recall  f1-score   support

    negative       0.99      1.00      0.99       438
     neutral       1.00      0.99      0.99       438
    positive       1.00      1.00      1.00       438

    accuracy                           1.00      1314
   macro avg       1.00      1.00      1.00      1314
weighted avg       1.00      1.00      1.00      1314

✅ Sentiment model trained safely with class balancing.

--- TRAINING AUTOENCODER RECOMMENDER ---
Epoch 1/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: nan   
Epoch 2/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan 
Epoch 3/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss:



✅ Autoencoder recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): lipstick




[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

✅ Recommended products similar to 'lipstick':

Product: Lakmé 9 To 7 Matte Bullet Lipstick
Category: Beauty and Care
Rating: 4.1 | Sentiment: positive | Price: ₹832.0
Semantic Similarity Score: 0.6832000017166138
------------------------------------------------------------
Product: MAYBELLINE NEW YORK Color Sensational Creamy Matte Lips...
Category: Beauty and Care
Rating: 4.2 | Sentiment: positive | Price: ₹177.0
Semantic Similarity Score: 0.6762999892234802
------------------------------------------------------------
Product: MAYBELLINE NEW YORK Baby Lips Loves NYC Tinted Lip Balm...
Category: Beauty and Care
Rating: 4.4 | Sentiment: positive | Price: ₹167.0
Semantic Similarity Score: 0.61080002784729
------------------------------------------------------------
Product: SUGAR Cosmetics Tipsy Lip Balm - Long Lasting Moisturiz...
Category: Beauty and Care
Rating: 4.4 | Sentiment: positive | Price: ₹149.0
Semanti



[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

✅ Recommended products similar to 'perfume':

Product: Engage Gift Set Moments Perfume, Long Lasting Fragrance...
Category: Beauty and Care
Rating: 4.1 | Sentiment: positive | Price: ₹246.0
Semantic Similarity Score: 0.7186999917030334
------------------------------------------------------------
Product: FOGG Fresh Citrus Premium Perfume Scent With Long Lasti...
Category: Beauty and Care
Rating: 4.1 | Sentiment: positive | Price: ₹199.0
Semantic Similarity Score: 0.6938999891281128
------------------------------------------------------------
Product: FOGG Xtremo and Perfume Scent With Long Lasting Eau de ...
Category: Beauty and Care
Rating: 4.2 | Sentiment: positive | Price: ₹526.0
Semantic Similarity Score: 0.6866999864578247
------------------------------------------------------------
Product: FOGG Impressio Perfume Scent With Long Lasting II� Eau ...
Category: Beauty and Care
Rating: 4.2 | Sentiment: positiv

# DL (Autoencoder) only


In [None]:
# ===============================
# BEAUTY & CARE – DL ONLY VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# AUTOENCODER RECOMMENDER (DL)
# ===============================
def train_autoencoder_recommender(df):
    print("\n--- TRAINING AUTOENCODER RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    input_layer = Input(shape=(3,))
    encoded = Dense(16, activation='relu')(input_layer)
    encoded = Dense(8, activation='relu')(encoded)
    decoded = Dense(3, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    autoencoder.fit(X, X, epochs=30, batch_size=32, verbose=1)
    autoencoder.save("ae_recommender.h5")

    print("✅ Autoencoder trained")


# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/beauty and care"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_autoencoder_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 600 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(171.66666666666666), 1: np.float64(2.258771929824561), 2: np.float64(0.3919330289193303)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1287 - loss: 1.2689    
Epoch 2/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8372 - loss: 1.2067 
Epoch 3/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8395 - loss: 1.1819 
Epoch 4/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8420 - loss: 1.8351
Epoch 5/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8470 - loss: 0.7478 
Epoch 6/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8545 - loss: 1.4492
Epoch 7/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8662 - loss: 0.7172 
Epoch 8/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8430 - loss: 0.8732 
Epoch 9/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       1.00      0.05      0.10        76
    positive       0.86      1.00      0.92       438

    accuracy                           0.86       515
   macro avg       0.95      0.68      0.67       515
weighted avg       0.88      0.86      0.80       515

✅ DL sentiment model trained with class balancing

--- TRAINING AUTOENCODER RECOMMENDER ---
Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4882   
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.2901 
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1903 
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1353 
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 



✅ Autoencoder trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): lipstick

✅ Recommendations for 'lipstick':

Product: Lakmé 9 To 7 Matte Bullet Lipstick
Category: Beauty and Care
Rating: 4.1 | Price: ₹832.0
--------------------------------------------------
Product: MAYBELLINE NEW YORK Color Sensational Creamy Matte Lips...
Category: Beauty and Care
Rating: 4.2 | Price: ₹177.0
--------------------------------------------------
Product: MAYBELLINE NEW YORK Baby Lips Loves NYC Tinted Lip Balm...
Category: Beauty and Care
Rating: 4.4 | Price: ₹167.0
--------------------------------------------------
Product: SUGAR Cosmetics Tipsy Lip Balm - Long Lasting Moisturiz...
Category: Beauty and Care
Rating: 4.4 | Price: ₹149.0
--------------------------------------------------
Product: SUGAR Cosmetics Tipsy Lip Balm - Long Lasting Moisturiz...
Category: Beauty and Care
Rating: 4.3 | Price: ₹159.0
--------------------------------------------------

Enter product name (or exit): perfume

✅ Recommendatio

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# LSTM+Attention

In [None]:
# ===============================
# BEAUTY & CARE – DL ONLY VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Attention, Reshape, Flatten


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# LSTM + ATTENTION RECOMMENDER (DL)
# ===============================
def train_lstm_attention_recommender(df):
    print("\n--- TRAINING LSTM + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    # 🔹 Convert to sequence format (timesteps = features)
    # Shape: (samples, timesteps, features_per_step)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 LSTM
    lstm_out = LSTM(64, return_sequences=True)(input_layer)

    # 🔹 ATTENTION
    attention_out = Attention()([lstm_out, lstm_out])

    # 🔹 FLATTEN
    flat = Flatten()(attention_out)

    # 🔹 OUTPUT (reconstruct input features)
    output_layer = Dense(X.shape[1], activation='linear')(flat)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("lstm_attention_recommender.h5")

    print("✅ LSTM + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/beauty and care"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_lstm_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 600 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(171.66666666666666), 1: np.float64(2.258771929824561), 2: np.float64(0.3919330289193303)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1083 - loss: 1.6070       
Epoch 2/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8444 - loss: 1.0644 
Epoch 3/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8622 - loss: 1.1168 
Epoch 4/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8656 - loss: 1.4461 
Epoch 5/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8457 - loss: 0.9226 
Epoch 6/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8740 - loss: 1.0440 
Epoch 7/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8603 - loss: 0.7332 
Epoch 8/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8508 - loss: 0.7668  
Epoch 9/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       1.00      0.04      0.08        76
    positive       0.86      1.00      0.92       438

    accuracy                           0.86       515
   macro avg       0.95      0.68      0.67       515
weighted avg       0.88      0.86      0.80       515

✅ DL sentiment model trained with class balancing

--- TRAINING LSTM + ATTENTION RECOMMENDER ---
Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.4426
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.2352
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0397 
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0124 
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step



✅ LSTM + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): grooming

✅ Recommendations for 'grooming':

Product: TRU HAIR & SKIN All-in-One Mens Grooming Set | Hair Wax...
Category: Beauty and Care
Rating: 4.0 | Price: ₹450.0
--------------------------------------------------
Product: Wild Stone Edge Grooming Kit for Men | 7 IN 1 Gifting C...
Category: Beauty and Care
Rating: 4.6 | Price: ₹549.0
--------------------------------------------------
Product: VETONI Men's Grooming Kit
Category: Beauty and Care
Rating: 4.3 | Price: ₹234.0
--------------------------------------------------
Product: DENVER Blackcode Men's Grooming Kit
Category: Beauty and Care
Rating: 4.1 | Price: ₹334.0
--------------------------------------------------
Product: Yardley London Gentleman Luxury Grooming Kit- Elite Gif...
Category: Beauty and Care
Rating: 4.3 | Price: ₹356.0
--------------------------------------------------

Enter product name (or exit): trimmer

✅ Recommendations for 'trimmer':

Product: Flem

# CNN + Attention DL

In [None]:
# ===============================
# BEAUTY & CARE – DL ONLY VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Attention, Reshape


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(2,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# CNN + ATTENTION RECOMMENDER (DL)
# ===============================
def train_cnn_attention_recommender(df):
    print("\n--- TRAINING CNN + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    X = MinMaxScaler().fit_transform(X)

    # 🔹 Reshape for CNN
    # Shape: (samples, timesteps, channels)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 CNN layers
    conv1 = Conv1D(filters=64, kernel_size=2, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=32, kernel_size=2, activation='relu', padding='same')(conv1)

    # 🔹 ATTENTION
    attention_out = Attention()([conv2, conv2])

    # 🔹 POOLING
    pooled = GlobalAveragePooling1D()(attention_out)

    # 🔹 OUTPUT (reconstruction)
    output_layer = Dense(X.shape[1], activation='linear')(pooled)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("cnn_attention_recommender.h5")

    print("✅ CNN + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # 🔹 Load CNN model
    cnn_model = tf.keras.models.load_model("cnn_attention_recommender.h5")

    # 🔹 Prepare features
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
    X = df[['overall_rating', 'neutral_distance']].values
    X = MinMaxScaler().fit_transform(X)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 CNN reconstruction error
    recon = cnn_model.predict(X_seq, verbose=0)
    recon_error = np.mean(np.square(X - recon), axis=1)
    df['dl_score'] = -recon_error

    # 🔹 Semantic similarity
    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    semantic_scores = cosine_similarity(
        query_emb.reshape(1, -1), embeddings
    )[0]

    # 🔹 Final hybrid score
    df['final_score'] = 0.7 * semantic_scores + 0.3 * df['dl_score'].values

    top_idx = df['final_score'].nlargest(top_n).index

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.loc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/beauty and care"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_cnn_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 600 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(171.66666666666666), 1: np.float64(2.258771929824561), 2: np.float64(0.3919330289193303)}
Epoch 1/30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8618 - loss: 0.7561   
Epoch 2/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8627 - loss: 1.8290 
Epoch 3/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8367 - loss: 0.8720 
Epoch 4/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8514 - loss: 0.7108 
Epoch 5/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8510 - loss: 0.7396 
Epoch 6/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8413 - loss: 1.0800 
Epoch 7/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8446 - loss: 1.4021 
Epoch 8/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8517 - loss: 0.9077 
Epoch 9/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3



              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       1.00      0.29      0.45        76
    positive       0.89      1.00      0.94       438

    accuracy                           0.90       515
   macro avg       0.96      0.76      0.80       515
weighted avg       0.91      0.90      0.87       515

✅ DL sentiment model trained with class balancing

--- TRAINING CNN + ATTENTION RECOMMENDER ---
Epoch 1/30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['neutral_distance'] = np.abs(df['overall_rating'] - 3)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3643   
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0896 
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0041 
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0027 
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9.1294e-04 
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9.9215e-04
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 5.1698e-04 
Epoch 8/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 4.2753e-04 
Epoch 9/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 5.2564e-04 
Epoch 10/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - los



✅ CNN + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): eyeshadow





✅ Recommendations for 'eyeshadow':

Product: MINARA 60 Color Matte & Shimmery Pigment EyeShadow 170 ...
Category: Beauty and Care
Rating: 4.0 | Price: ₹189.0
--------------------------------------------------
Product: Hezruck 18 colour Nude Beauty Eyeshadow Platte
Category: Beauty and Care
Rating: 3.6 | Price: ₹153.0
--------------------------------------------------
Product: NYN Makeup Kit - Eye-Shadows, Lip Colors, Blushes, Spon...
Category: Beauty and Care
Rating: 3.7 | Price: ₹165.0
--------------------------------------------------
Product: NYN Makeup Kit - Eye-Shadows, Lip Colors, Blushes, Spon...
Category: Beauty and Care
Rating: 3.9 | Price: ₹164.0
--------------------------------------------------
Product: FACES CANADA Magneteyes Eye Makeup Combo - Black Eyelin...
Category: Beauty and Care
Rating: 4.3 | Price: ₹467.0
--------------------------------------------------

Enter product name (or exit): compact





✅ Recommendations for 'compact':

Product: Engage XX1 Cologne Eau de Cologne - 135 ml
Category: Beauty and Care
Rating: 4.2 | Price: ₹279.0
--------------------------------------------------
Product: Secret Temptation romance Eau de Parfum - 50 ml
Category: Beauty and Care
Rating: 4.2 | Price: ₹179.0
--------------------------------------------------
Product: Bath and Body Works DARK KISS COLLECTION
Category: Beauty and Care
Rating: 4.7 | Price: ₹5311.0
--------------------------------------------------
Product: Minimalist Dry_Skincare_Kit
Category: Beauty and Care
Rating: 4.2 | Price: ₹1197.0
--------------------------------------------------
Product: Iba Pure Skin Perfect Look Long-Wear Mattifying Compact...
Category: Beauty and Care
Rating: 3.9 | Price: ₹370.0
--------------------------------------------------

Enter product name (or exit): exit


# Electronics

In [None]:
# ===============================
# ELECTRONICS – AUTOENCODER VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# AUTOENCODER RECOMMENDER (DL)
# ===============================
def train_autoencoder_recommender(df):
    print("\n--- TRAINING AUTOENCODER RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    input_layer = Input(shape=(3,))
    encoded = Dense(16, activation='relu')(input_layer)
    encoded = Dense(8, activation='relu')(encoded)
    decoded = Dense(3, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    autoencoder.fit(X, X, epochs=30, batch_size=32, verbose=1)
    autoencoder.save("ae_recommender.h5")

    print("✅ Autoencoder trained")


# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/electronics"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_autoencoder_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 500 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(166.66666666666666), 1: np.float64(3.9682539682539684), 2: np.float64(0.36469730123997085)}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0000e+00 - loss: 1.2709
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0000e+00 - loss: 0.8887 
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1448 - loss: 0.7710     
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9084 - loss: 1.0018 
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9559 - loss: 0.7427 
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9325 - loss: 1.3383 
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9225 - loss: 1.0047 
Epoch 8/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9368 - loss: 0.9155 
Epoch 9/30
[1m16/16[0m [32m━━━━━━━



              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       1.00      0.10      0.17        42
    positive       0.92      1.00      0.96       457

    accuracy                           0.92       500
   macro avg       0.97      0.70      0.71       500
weighted avg       0.93      0.92      0.89       500

✅ DL sentiment model trained with class balancing

--- TRAINING AUTOENCODER RECOMMENDER ---
Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5180   
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.4429 
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4047 
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.3505 
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.2996 
Epoch 6/30
[1m16/16[0m 



✅ Autoencoder trained

--- PREPARING RECOMMENDATION DATA ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]



vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Recommendation data ready

Enter product name (or exit): tv

✅ Recommendations for 'tv':

Product: Add to Compare
BESTON 80 cm (32 inch) HD Ready LED Smart Android TV 2025 Edition
3.94,925 Ratings & 616 Reviews
HD Ready | LED
Model ID: BS32HD1
Launch Year: 2025
Total Sound Output: 20 W
2 Year warranty on Product , Onsite Warranty on Product and No Delivery Damages Covered
₹7,359
₹17,99959% off
Hot Deal
Upto 
₹1,900
 Off on Exchange
Category: Electronics
Rating: 3.9 | Price: ₹7359
--------------------------------------------------
Product: Add to Compare
iFFALCON by TCL S55 80 cm (32 inch) HD Ready LED Smart Google TV 2025 Edition with HDR 10 | 16W Dolby ...
4.16,693 Ratings & 443 Reviews
HD Ready | LED
Model ID: 32S55
Launch Year: 2025
Total Sound Output: 16 W
1 Year Product Warranty
₹8,799
₹19,99055% off
Upto 
₹1,900
 Off on Exchange
Bank Offer
Category: Electronics
Rating: 4.1 | Price: ₹8799
--------------------------------------------------
Product: Add to Compare
KODAK Special Ed

In [None]:
# ===============================
# ELECTRONICS – LSTM VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Attention, Reshape, Flatten


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# LSTM + ATTENTION RECOMMENDER (DL)
# ===============================
def train_lstm_attention_recommender(df):
    print("\n--- TRAINING LSTM + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    # 🔹 Convert to sequence format (timesteps = features)
    # Shape: (samples, timesteps, features_per_step)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 LSTM
    lstm_out = LSTM(64, return_sequences=True)(input_layer)

    # 🔹 ATTENTION
    attention_out = Attention()([lstm_out, lstm_out])

    # 🔹 FLATTEN
    flat = Flatten()(attention_out)

    # 🔹 OUTPUT (reconstruct input features)
    output_layer = Dense(X.shape[1], activation='linear')(flat)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("lstm_attention_recommender.h5")

    print("✅ LSTM + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/beauty and care"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_lstm_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 600 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(171.66666666666666), 1: np.float64(2.258771929824561), 2: np.float64(0.3919330289193303)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.1157 - loss: 0.8671    
Epoch 2/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8531 - loss: 1.1773 
Epoch 3/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8392 - loss: 1.5377 
Epoch 4/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8563 - loss: 1.0599 
Epoch 5/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8450 - loss: 0.8491 
Epoch 6/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8327 - loss: 1.1368 
Epoch 7/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8396 - loss: 0.8565 
Epoch 8/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8530 - loss: 0.7471 
Epoch 9/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       0.00      0.00      0.00        76
    positive       0.85      1.00      0.92       438

    accuracy                           0.85       515
   macro avg       0.62      0.67      0.64       515
weighted avg       0.73      0.85      0.78       515

✅ DL sentiment model trained with class balancing

--- TRAINING LSTM + ATTENTION RECOMMENDER ---
Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 0.4471
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2509
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0572
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0131
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 



✅ LSTM + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): exit


In [None]:
# ===============================
# ELECTRONICS - CNN VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Attention, Reshape


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(2,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# CNN + ATTENTION RECOMMENDER (DL)
# ===============================
def train_cnn_attention_recommender(df):
    print("\n--- TRAINING CNN + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    X = MinMaxScaler().fit_transform(X)

    # 🔹 Reshape for CNN
    # Shape: (samples, timesteps, channels)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 CNN layers
    conv1 = Conv1D(filters=64, kernel_size=2, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=32, kernel_size=2, activation='relu', padding='same')(conv1)

    # 🔹 ATTENTION
    attention_out = Attention()([conv2, conv2])

    # 🔹 POOLING
    pooled = GlobalAveragePooling1D()(attention_out)

    # 🔹 OUTPUT (reconstruction)
    output_layer = Dense(X.shape[1], activation='linear')(pooled)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("cnn_attention_recommender.h5")

    print("✅ CNN + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # 🔹 Load CNN model
    cnn_model = tf.keras.models.load_model("cnn_attention_recommender.h5")

    # 🔹 Prepare features
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
    X = df[['overall_rating', 'neutral_distance']].values
    X = MinMaxScaler().fit_transform(X)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 CNN reconstruction error
    recon = cnn_model.predict(X_seq, verbose=0)
    recon_error = np.mean(np.square(X - recon), axis=1)
    df['dl_score'] = -recon_error

    # 🔹 Semantic similarity
    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    semantic_scores = cosine_similarity(
        query_emb.reshape(1, -1), embeddings
    )[0]

    # 🔹 Final hybrid score
    df['final_score'] = 0.7 * semantic_scores + 0.3 * df['dl_score'].values

    top_idx = df['final_score'].nlargest(top_n).index

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.loc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/electronics"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_cnn_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 500 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(166.66666666666666), 1: np.float64(3.9682539682539684), 2: np.float64(0.36469730123997085)}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9155 - loss: 0.8190
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8958 - loss: 1.5851
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.9299 - loss: 1.4120
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8951 - loss: 0.9100
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9215 - loss: 1.1203
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9302 - loss: 0.9098
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9234 - loss: 0.9887
Epoch 8/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9128 - loss: 1.2155
Epoch 9/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       1.00      1.00      1.00        42
    positive       1.00      1.00      1.00       457

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

✅ DL sentiment model trained with class balancing

--- TRAINING CNN + ATTENTION RECOMMENDER ---
Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.4402
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.2346
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0464
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0077
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - 



✅ CNN + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


✅ Recommendation data ready

Enter product name (or exit): exit


# Footwear

In [None]:
# ===============================
# FOOTWEAR – AUTOENCODER VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# AUTOENCODER RECOMMENDER (DL)
# ===============================
def train_autoencoder_recommender(df):
    print("\n--- TRAINING AUTOENCODER RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    input_layer = Input(shape=(3,))
    encoded = Dense(16, activation='relu')(input_layer)
    encoded = Dense(8, activation='relu')(encoded)
    decoded = Dense(3, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    autoencoder.fit(X, X, epochs=30, batch_size=32, verbose=1)
    autoencoder.save("ae_recommender.h5")

    print("✅ Autoencoder trained")


# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/footwear"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_autoencoder_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 271 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(29.666666666666668), 1: np.float64(0.9175257731958762), 2: np.float64(0.5329341317365269)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 1.0390
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5668 - loss: 1.1239 
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6394 - loss: 1.1798 
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6326 - loss: 1.2270 
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5780 - loss: 1.2574 
Epoch 6/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6423 - loss: 1.0204 
Epoch 7/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6361 - loss: 1.0794 
Epoch 8/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6438 - loss: 1.0863 
Epoch 9/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         3
     neutral       0.00      0.00      0.00        97
    positive       0.63      1.00      0.77       167

    accuracy                           0.64       267
   macro avg       0.54      0.67      0.59       267
weighted avg       0.41      0.64      0.50       267

✅ DL sentiment model trained with class balancing

--- TRAINING AUTOENCODER RECOMMENDER ---
Epoch 1/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1001
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0552 
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0259 
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0081 
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.004



✅ Autoencoder trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): exit


In [None]:
# ===============================
# FOOTWEAR – LSTM VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Attention, Reshape, Flatten


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# LSTM + ATTENTION RECOMMENDER (DL)
# ===============================
def train_lstm_attention_recommender(df):
    print("\n--- TRAINING LSTM + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    # 🔹 Convert to sequence format (timesteps = features)
    # Shape: (samples, timesteps, features_per_step)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 LSTM
    lstm_out = LSTM(64, return_sequences=True)(input_layer)

    # 🔹 ATTENTION
    attention_out = Attention()([lstm_out, lstm_out])

    # 🔹 FLATTEN
    flat = Flatten()(attention_out)

    # 🔹 OUTPUT (reconstruct input features)
    output_layer = Dense(X.shape[1], activation='linear')(flat)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("lstm_attention_recommender.h5")

    print("✅ LSTM + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/footwear"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_lstm_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 271 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(29.666666666666668), 1: np.float64(0.9175257731958762), 2: np.float64(0.5329341317365269)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0061 - loss: 1.2230   
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 0.9476 
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0510 - loss: 1.0644     
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6988 - loss: 1.1381 
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6079 - loss: 1.0162 
Epoch 6/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6331 - loss: 1.1887 
Epoch 7/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6237 - loss: 1.1611 
Epoch 8/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6375 - loss: 1.1759 
Epoch 9/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         3
     neutral       1.00      0.15      0.27        97
    positive       0.67      1.00      0.80       167

    accuracy                           0.69       267
   macro avg       0.89      0.72      0.69       267
weighted avg       0.79      0.69      0.61       267

✅ DL sentiment model trained with class balancing

--- TRAINING LSTM + ATTENTION RECOMMENDER ---
Epoch 1/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 0.3520
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.2557 
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.1685 
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0801
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss:



✅ LSTM + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): exit


In [None]:
# ===============================
# FOOTWEAR - CNN VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Attention, Reshape


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(2,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# CNN + ATTENTION RECOMMENDER (DL)
# ===============================
def train_cnn_attention_recommender(df):
    print("\n--- TRAINING CNN + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    X = MinMaxScaler().fit_transform(X)

    # 🔹 Reshape for CNN
    # Shape: (samples, timesteps, channels)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 CNN layers
    conv1 = Conv1D(filters=64, kernel_size=2, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=32, kernel_size=2, activation='relu', padding='same')(conv1)

    # 🔹 ATTENTION
    attention_out = Attention()([conv2, conv2])

    # 🔹 POOLING
    pooled = GlobalAveragePooling1D()(attention_out)

    # 🔹 OUTPUT (reconstruction)
    output_layer = Dense(X.shape[1], activation='linear')(pooled)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("cnn_attention_recommender.h5")

    print("✅ CNN + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # 🔹 Load CNN model
    cnn_model = tf.keras.models.load_model("cnn_attention_recommender.h5")

    # 🔹 Prepare features
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
    X = df[['overall_rating', 'neutral_distance']].values
    X = MinMaxScaler().fit_transform(X)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 CNN reconstruction error
    recon = cnn_model.predict(X_seq, verbose=0)
    recon_error = np.mean(np.square(X - recon), axis=1)
    df['dl_score'] = -recon_error

    # 🔹 Semantic similarity
    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    semantic_scores = cosine_similarity(
        query_emb.reshape(1, -1), embeddings
    )[0]

    # 🔹 Final hybrid score
    df['final_score'] = 0.7 * semantic_scores + 0.3 * df['dl_score'].values

    top_idx = df['final_score'].nlargest(top_n).index

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.loc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/footwear"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_cnn_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 271 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(29.666666666666668), 1: np.float64(0.9175257731958762), 2: np.float64(0.5329341317365269)}
Epoch 1/30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.2485 - loss: 1.2653
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6733 - loss: 1.0656 
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6525 - loss: 1.2623 
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6686 - loss: 1.1339 
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6662 - loss: 0.9357 
Epoch 6/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6435 - loss: 1.0120 
Epoch 7/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6799 - loss: 0.8998 
Epoch 8/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6877 - loss: 0.9215 
Epoch 9/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         3
     neutral       1.00      0.71      0.83        97
    positive       0.86      1.00      0.92       167

    accuracy                           0.90       267
   macro avg       0.95      0.90      0.92       267
weighted avg       0.91      0.90      0.89       267

✅ DL sentiment model trained with class balancing

--- TRAINING CNN + ATTENTION RECOMMENDER ---
Epoch 1/30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['neutral_distance'] = np.abs(df['overall_rating'] - 3)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.2959
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.2048 
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1104 
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0320 
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0046 
Epoch 6/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0080 
Epoch 7/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0039 
Epoch 8/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0039 
Epoch 9/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0027 
Epoch 10/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0027 
Epoch 11/30
[1m9/9[0m [32m━━



✅ CNN + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): exit


# WEARABLES

In [None]:
# ===============================
# WEARABLES – AUTOENCODER VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# AUTOENCODER RECOMMENDER (DL)
# ===============================
def train_autoencoder_recommender(df):
    print("\n--- TRAINING AUTOENCODER RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    input_layer = Input(shape=(3,))
    encoded = Dense(16, activation='relu')(input_layer)
    encoded = Dense(8, activation='relu')(encoded)
    decoded = Dense(3, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    autoencoder.fit(X, X, epochs=30, batch_size=32, verbose=1)
    autoencoder.save("ae_recommender.h5")

    print("✅ Autoencoder trained")


# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/Wearables"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_autoencoder_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 150 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(7.611111111111111), 1: np.float64(0.9513888888888888), 2: np.float64(0.5502008032128514)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6099 - loss: 0.9503
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6112 - loss: 1.0689 
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6158 - loss: 0.9407 
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6059 - loss: 1.0880 
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5815 - loss: 1.1362
Epoch 6/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5707 - loss: 1.0293 
Epoch 7/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5920 - loss: 0.9153 
Epoch 8/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6237 - loss: 1.0680 
Epoch 9/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms




📊 DL Classification Report (Balanced):

              precision    recall  f1-score   support

    negative       1.00      0.33      0.50         6
     neutral       0.00      0.00      0.00        48
    positive       0.63      1.00      0.78        83

    accuracy                           0.62       137
   macro avg       0.54      0.44      0.43       137
weighted avg       0.43      0.62      0.49       137

✅ DL sentiment model trained with class balancing

--- TRAINING AUTOENCODER RECOMMENDER ---
Epoch 1/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.4539  
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.4226 
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.3849 
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.3456 
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.3



✅ Autoencoder trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): watch

✅ Recommendations for 'watch':

Product: 1. ZEBRONICS Zeb-Blitz with DO...
4174 Ratings&25 Reviews
₹2,29961% off
Category: Wearables
Rating: 4.0 | Price: ₹2299.0
--------------------------------------------------
Product: 5. beatXP Unbound Nova 1.96” A...
4.21,709 Ratings&118 Reviews
₹3,29958% off
Category: Wearables
Rating: 4.2 | Price: ₹3299.0
--------------------------------------------------
Product: 2. boAt Lunar Discovery w/ Tur...
432,754 Ratings&1,787 Reviews
₹1,49982% off
Category: Wearables
Rating: 4.0 | Price: ₹1499.0
--------------------------------------------------
Product: 5. Noise Pulse 3 1.96'' Displa...
4.177,586 Ratings&4,075 Reviews
₹1,09984% off
Category: Wearables
Rating: 4.1 | Price: ₹1099.0
--------------------------------------------------
Product: 3. boAt Storm call 3 w/TBT Nav...
4.14,21,649 Ratings&23,333 Reviews
₹1,39983% off
Category: Wearables
Rating: 4.1 | Price: ₹1399.0
------------------

In [None]:
# ===============================
# WEARABLES – LSTM VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Attention, Reshape, Flatten


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    X = df[['overall_rating']].values
    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(1,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# LSTM + ATTENTION RECOMMENDER (DL)
# ===============================
def train_lstm_attention_recommender(df):
    print("\n--- TRAINING LSTM + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    X = df[['overall_rating', 'sentiment_score', 'product_price']].values
    X = MinMaxScaler().fit_transform(X)

    # 🔹 Convert to sequence format (timesteps = features)
    # Shape: (samples, timesteps, features_per_step)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 LSTM
    lstm_out = LSTM(64, return_sequences=True)(input_layer)

    # 🔹 ATTENTION
    attention_out = Attention()([lstm_out, lstm_out])

    # 🔹 FLATTEN
    flat = Flatten()(attention_out)

    # 🔹 OUTPUT (reconstruct input features)
    output_layer = Dense(X.shape[1], activation='linear')(flat)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("lstm_attention_recommender.h5")

    print("✅ LSTM + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    scores = cosine_similarity(query_emb.reshape(1, -1), embeddings)[0]

    top_idx = np.argsort(scores)[::-1][:top_n]

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.iloc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/Wearables"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_lstm_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 150 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(7.611111111111111), 1: np.float64(0.9513888888888888), 2: np.float64(0.5502008032128514)}
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6190 - loss: 1.0487
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5986 - loss: 1.0841 
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5995 - loss: 0.9763 
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6290 - loss: 1.1514 
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6006 - loss: 1.0579
Epoch 6/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6106 - loss: 1.0680 
Epoch 7/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6106 - loss: 1.0407 
Epoch 8/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5772 - loss: 1.2149 
Epoch 9/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms



[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 50ms/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

📊 DL Classification Report (Balanced):





              precision    recall  f1-score   support

    negative       1.00      0.17      0.29         6
     neutral       0.00      0.00      0.00        48
    positive       0.61      1.00      0.76        83

    accuracy                           0.61       137
   macro avg       0.54      0.39      0.35       137
weighted avg       0.42      0.61      0.47       137

✅ DL sentiment model trained with class balancing

--- TRAINING LSTM + ATTENTION RECOMMENDER ---
Epoch 1/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.4455
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.3648
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.3106
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.2522
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.1914
Epoch 6/30
[1m5/5[0m [32m━━━━━



✅ LSTM + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): exit


In [None]:
# ===============================
# WEARABLES - CNN VERSION
# ===============================

import pandas as pd
import numpy as np
import os
import glob
import joblib

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Attention, Reshape


# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    if not all_files:
        raise Exception("No CSV files found!")

    df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records")
    return df


# ===============================
# SENTIMENT PREPROCESSING
# ===============================
def fill_and_encode_sentiment(df):

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'neutral'
        elif r >= 4:
            return 'positive'
        elif r >= 3:
            return 'neutral'
        else:
            return 'negative'

    df['sentiment'] = df['sentiment'].fillna(
        df['overall_rating'].apply(rating_to_sentiment)
    )

    df['sentiment'] = df['sentiment'].str.lower().str.strip()

    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df


# ===============================
# DL SENTIMENT CLASSIFIER
# ===============================
def train_dl_sentiment_model(df):
    print("\n--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score'])

    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    y = df['sentiment_score'].values

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    y_cat = tf.keras.utils.to_categorical(y, num_classes=3)

    # 🔹 COMPUTE CLASS WEIGHTS
    class_counts = np.bincount(y)
    total = len(y)

    class_weights = {
        i: total / (3 * count) for i, count in enumerate(class_counts)
    }

    print("Class weights:", class_weights)

    # 🔹 DL MODEL
    model = tf.keras.Sequential([
        Dense(32, activation='relu', input_shape=(2,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
        metrics=['accuracy']
    )

    model.fit(
        X,
        y_cat,
        epochs=30,
        batch_size=32,
        class_weight=class_weights,
        verbose=1
    )

    # 🔹 EVALUATION
    y_pred = model.predict(X)
    y_pred_labels = np.argmax(y_pred, axis=1)

    print("\n📊 DL Classification Report (Balanced):\n")
    print(classification_report(
        y,
        y_pred_labels,
        target_names=['negative', 'neutral', 'positive'],
        zero_division=0
    ))

    model.save("dl_sentiment_model.h5")
    joblib.dump(scaler, "dl_sentiment_scaler.pkl")

    print("✅ DL sentiment model trained with class balancing")


# ===============================
# CNN + ATTENTION RECOMMENDER (DL)
# ===============================
def train_cnn_attention_recommender(df):
    print("\n--- TRAINING CNN + ATTENTION RECOMMENDER ---")

    df = df.dropna(subset=['overall_rating', 'sentiment_score', 'product_price'])

    # 🔹 Feature matrix
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)

    X = df[['overall_rating', 'neutral_distance']].values

    X = MinMaxScaler().fit_transform(X)

    # 🔹 Reshape for CNN
    # Shape: (samples, timesteps, channels)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 INPUT
    input_layer = Input(shape=(X_seq.shape[1], 1))

    # 🔹 CNN layers
    conv1 = Conv1D(filters=64, kernel_size=2, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=32, kernel_size=2, activation='relu', padding='same')(conv1)

    # 🔹 ATTENTION
    attention_out = Attention()([conv2, conv2])

    # 🔹 POOLING
    pooled = GlobalAveragePooling1D()(attention_out)

    # 🔹 OUTPUT (reconstruction)
    output_layer = Dense(X.shape[1], activation='linear')(pooled)

    # 🔹 MODEL
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError()
    )

    model.fit(
        X_seq,
        X,
        epochs=30,
        batch_size=32,
        verbose=1
    )

    model.save("cnn_attention_recommender.h5")

    print("✅ CNN + Attention recommender trained")

# ===============================
# PREPARE RECOMMENDATION DATA
# ===============================
def prepare_recommendation_data(df):
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df.dropna(subset=[
        'product_name', 'category',
        'product_price', 'overall_rating', 'sentiment_score'
    ])

    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")

    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text'] = df['product_name'] + " " + df['category']
    embeddings = model_embed.encode(df['text'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("✅ Recommendation data ready")


# ===============================
# RECOMMEND PRODUCTS
# ===============================
def recommend_products(user_input, top_n=5):

    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # 🔹 Load CNN model
    cnn_model = tf.keras.models.load_model("cnn_attention_recommender.h5")

    # 🔹 Prepare features
    df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
    X = df[['overall_rating', 'neutral_distance']].values
    X = MinMaxScaler().fit_transform(X)
    X_seq = X.reshape((X.shape[0], X.shape[1], 1))

    # 🔹 CNN reconstruction error
    recon = cnn_model.predict(X_seq, verbose=0)
    recon_error = np.mean(np.square(X - recon), axis=1)
    df['dl_score'] = -recon_error

    # 🔹 Semantic similarity
    query_emb = model_embed.encode(user_input, convert_to_tensor=True)
    semantic_scores = cosine_similarity(
        query_emb.reshape(1, -1), embeddings
    )[0]

    # 🔹 Final hybrid score
    df['final_score'] = 0.7 * semantic_scores + 0.3 * df['dl_score'].values

    top_idx = df['final_score'].nlargest(top_n).index

    print(f"\n✅ Recommendations for '{user_input}':\n")

    for i in top_idx:
        p = df.loc[i]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Price: ₹{p['product_price']}")
        print("-" * 50)


# ===============================
# MAIN
# ===============================
if __name__ == "__main__":

    DATA_FOLDER = r"/content/Wearables"

    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Clean price globally
    df['product_price'] = (
        df['product_price']
        .astype(str)
        .str.replace(r'[₹,\s]', '', regex=True)
    )
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')

    train_dl_sentiment_model(df)
    train_cnn_attention_recommender(df)
    prepare_recommendation_data(df)

    while True:
        name = input("\nEnter product name (or exit): ")
        if name.lower() == "exit":
            break
        recommend_products(name)


Loaded 150 records

--- TRAINING DL SENTIMENT CLASSIFIER (BALANCED) ---
Class weights: {0: np.float64(7.611111111111111), 1: np.float64(0.9513888888888888), 2: np.float64(0.5502008032128514)}
Epoch 1/30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['neutral_distance'] = np.abs(df['overall_rating'] - 3)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.1783 - loss: 1.0273   
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6264 - loss: 1.0745 
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6328 - loss: 1.0960 
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6373 - loss: 1.0476 
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6430 - loss: 1.0489
Epoch 6/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6465 - loss: 1.1101 
Epoch 7/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6503 - loss: 1.0522 
Epoch 8/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6616 - loss: 1.0757 
Epoch 9/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 



              precision    recall  f1-score   support

    negative       1.00      0.17      0.29         6
     neutral       0.82      0.38      0.51        48
    positive       0.73      1.00      0.84        83

    accuracy                           0.74       137
   macro avg       0.85      0.51      0.55       137
weighted avg       0.77      0.74      0.70       137

✅ DL sentiment model trained with class balancing

--- TRAINING CNN + ATTENTION RECOMMENDER ---
Epoch 1/30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['neutral_distance'] = np.abs(df['overall_rating'] - 3)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - loss: 0.3711
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.2636 
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.1804 
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1105
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0502
Epoch 6/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0127 
Epoch 7/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0050 
Epoch 8/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0101 
Epoch 9/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0103 
Epoch 10/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0040 
Epoch 11/30
[1m5/5[0m [32m━



✅ CNN + Attention recommender trained

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['product_name'] + " " + df['category']


✅ Recommendation data ready

Enter product name (or exit): exit
