<a href="https://colab.research.google.com/github/surabhi-2404/product_recommendation_witn_webscrape/blob/main/ML_final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /content/electronics.zip

Archive:  /content/electronics.zip
   creating: electronics/
   creating: electronics/.ipynb_checkpoints/
  inflating: electronics/.ipynb_checkpoints/AirConditionerselectronics_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/electronics_code-checkpoint.ipynb  
  inflating: electronics/.ipynb_checkpoints/laptop_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/recommendation_model-checkpoint.ipynb  
  inflating: electronics/.ipynb_checkpoints/refrigerators_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/smartphones_dataset_flipkart-checkpoint.csv  
  inflating: electronics/.ipynb_checkpoints/televisions_dataset_flipkart-checkpoint.csv  
  inflating: electronics/AirConditionerselectronics_dataset_flipkart.csv  
  inflating: electronics/electronics_code.ipynb  
  inflating: electronics/label_encoder.pkl  
  inflating: electronics/laptop_dataset_flipkart.csv  
  inflating: electronics/products_data.pkl 

In [None]:
!pip install sentence_transformers



In [None]:
#logisticregression
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Load every ``*.csv`` file directly inside *folder_path* into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible (``glob`` returns matches in arbitrary order). The folder
    name is escaped so glob metacharacters in it (``[``, ``*``, ``?``) are
    matched literally instead of silently matching nothing.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        pandas.DataFrame: All files concatenated, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` file.
    """
    pattern = os.path.join(glob.escape(os.fspath(folder_path)), "*.csv")
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from the rating and add a numeric score.

    Missing ``sentiment`` values are derived from ``overall_rating``
    (>= 4.0 -> 'positive', >= 3.0 -> 'neutral', lower -> 'negative',
    missing/unparseable rating -> 'na'). Labels are then stripped and
    lower-cased, and ``sentiment_score`` is added (negative=0, neutral=1,
    positive=2). Any other label, including 'na', gets a NaN score.

    The ratings are coerced to numbers for the comparison only, so ratings
    stored as text (e.g. "4.5") no longer raise ``TypeError``; the
    ``overall_rating`` column itself is left untouched.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.
            It is modified in place.

    Returns:
        pandas.DataFrame: The same *df* object with ``sentiment`` normalised
        and ``sentiment_score`` added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Coerce first: comparing a raw string rating with a float would raise.
    numeric_rating = pd.to_numeric(df['overall_rating'], errors='coerce')
    derived_sentiment = numeric_rating.apply(rating_to_sentiment)

    df['sentiment'] = df['sentiment'].fillna(derived_sentiment)
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """
    Add a ``weighted_rating`` column that blends each rating with the mean.

    The value is ``(rating * 1 + C * m) / (1 + m)`` where ``C`` is the mean
    of ``overall_rating`` and ``m`` defaults to the 25th percentile of
    ``overall_rating``. The per-row weight is fixed at 1, so the result is a
    linear rescaling of the rating towards the mean.

    *df* is modified in place and returned.
    """
    ratings = df['overall_rating']
    prior_mean = ratings.mean()
    prior_weight = ratings.quantile(0.25) if m is None else m

    blended = (ratings * 1.0 + prior_mean * prior_weight) / (1 + prior_weight)
    df['weighted_rating'] = blended
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def _oversample_to(frame, target_size, label_col='sentiment', random_state=42):
    """Return *frame* with every class smaller than *target_size* upsampled
    (with replacement) to exactly *target_size* rows; larger classes are kept as is."""
    parts = []
    for label in Counter(frame[label_col]):
        subset = frame[frame[label_col] == label]
        if len(subset) < target_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=target_size,
                random_state=random_state
            )
        parts.append(subset)
    return pd.concat(parts)


def train_sentiment_model(df):
    """Train, evaluate and save a logistic-regression sentiment classifier.

    The single feature is ``weighted_rating`` (from ``compute_weighted_rating``)
    and the target is the ``sentiment`` label. Minority classes are
    oversampled up to the size of the largest class before fitting.

    Evaluation uses stratified K-fold on the *original* rows. Oversampling is
    applied to the training part of each fold only, so a duplicated row can
    never be in both the training and the test fold (balancing before the
    split leaks test rows into training and inflates the scores). The number
    of folds is ``min(5, smallest original class size)``; with fewer than 2
    folds possible, a train-set report of the final model is printed instead.

    Args:
        df: DataFrame with ``overall_rating`` and ``sentiment`` columns.

    Returns:
        tuple: ``(final_model, label_encoder)``. Both are also written to
        ``sentiment_model.pkl`` and ``label_encoder.pkl``.

    Raises:
        ValueError: If no row has both a numeric rating and a sentiment.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Keep only usable rows: numeric rating plus a sentiment label.
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    df_model = compute_weighted_rating(df_model)

    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Fit the encoder on all labels once so every fold shares the same codes.
    le = LabelEncoder()
    le.fit(df_model['sentiment'])
    all_labels = np.arange(len(le.classes_))

    def build_model():
        return LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )

    def balance(frame):
        # Oversample every class up to the largest class present in *frame*.
        counts = Counter(frame['sentiment'])
        return _oversample_to(frame, max(counts.values()))

    def to_xy(frame):
        return frame[['weighted_rating']].values, le.transform(frame['sentiment'])

    # Final model: trained on the fully balanced data set.
    X, y = to_xy(balance(df_model))
    final_model = build_model()
    final_model.fit(X, y)

    # Fold count is limited by the rarest class in the un-duplicated data.
    n_splits = min(5, min(class_counts.values()))

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y,
            final_model.predict(X),
            labels=all_labels,
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        X_all, y_all = to_xy(df_model)
        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X_all, y_all):
            # Balance the training rows only; the test rows stay untouched.
            X_train, y_train = to_xy(balance(df_model.iloc[train_idx]))

            model = build_model()
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_all[test_idx]))
            y_true.extend(y_all[test_idx])

        print("\nLogistic Regression(B&C):\n")
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=all_labels,
            target_names=le.classes_,
            zero_division=0
        ))

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the artifacts used for recommendations.

    Steps: keep the recommendation columns, parse price (after removing
    '₹', commas and whitespace) and rating as numbers, drop rows missing any
    required value, min-max scale rating/sentiment score/price, and compute
    their pairwise cosine similarity. Sentence embeddings of
    "<product_name> <category>" are then computed with 'all-MiniLM-L6-v2'.

    Files written: products_data.pkl, similarity_matrix.pkl, scaler.pkl,
    product_embeddings.pkl, embedding_model.pkl.

    Returns the cleaned DataFrame, including the ``text_for_embedding``
    column (which is added after products_data.pkl has been written).
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    keep_cols = ['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']
    df = df[keep_cols].copy()

    # Parse numeric columns; anything unparseable becomes NaN.
    price_text = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(price_text, errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Rows missing any of these cannot be scored or displayed.
    required_cols = ['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    df = df.dropna(subset=required_cols)

    # Scale the numeric features to [0, 1] and compare products pairwise.
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(df[['overall_rating', 'sentiment_score', 'product_price']])
    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Embed "<name> <category>" for semantic search.
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    name_text = df['product_name'].astype(str)
    category_text = df['category'].astype(str)
    df['text_for_embedding'] = name_text + " " + category_text
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (RANKED BY SEMANTIC SIMILARITY)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the *top_n* products whose text is most similar to *user_input*.

    The query is embedded with the saved sentence-embedding model and ranked
    by cosine similarity against the pre-computed product embeddings. The
    ranking is purely semantic: the numeric similarity matrix describes
    product-to-product similarity and cannot score a free-text query, so it
    is not loaded here.

    Args:
        user_input: Free-text query (e.g. a product name).
        top_n: Maximum number of products to print.

    Returns:
        None. Results are printed.
    """
    # NOTE(review): joblib files are pickles; only load artifacts produced by
    # prepare_recommendation_data or another trusted source.
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Ask for a NumPy vector: a torch tensor (possibly on GPU) cannot be
    # passed to scikit-learn's cosine_similarity.
    query_vec = np.asarray(model_embed.encode(user_input, convert_to_numpy=True))
    semantic_scores = cosine_similarity(query_vec.reshape(1, -1), embeddings)[0]

    # Highest score first; a stable sort keeps the original row order for ties.
    ranked = np.argsort(-semantic_scores, kind="stable")[:top_n]

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked:
        p = df.iloc[idx]
        score = semantic_scores[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(score, 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    DATA_FOLDER = r"/content/beauty and care"

    # Load data and normalise the sentiment labels
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that inputs such as "exit " or " Exit" still end the loop.
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # Nothing to search for; ask again instead of embedding an empty query.
            continue
        recommend_products(name)


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

Logistic Regression(B&C):


Accuracy: 0.6948

Classification Report:

              precision    recall  f1-score   support

    negative       0.98      1.00      0.99       438
     neutral       1.00      0.08      0.16       438
    positive       0.53      1.00      0.69       438

    accuracy                           0.69      1314
   macro avg       0.84      0.69      0.61      1314
weighted avg       0.84      0.69      0.61      1314

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#random forest
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Load every ``*.csv`` file directly inside *folder_path* into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible (``glob`` returns matches in arbitrary order). The folder
    name is escaped so glob metacharacters in it (``[``, ``*``, ``?``) are
    matched literally instead of silently matching nothing.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        pandas.DataFrame: All files concatenated, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` file.
    """
    pattern = os.path.join(glob.escape(os.fspath(folder_path)), "*.csv")
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from the rating and add a numeric score.

    Missing ``sentiment`` values are derived from ``overall_rating``
    (>= 4.0 -> 'positive', >= 3.0 -> 'neutral', lower -> 'negative',
    missing/unparseable rating -> 'na'). Labels are then stripped and
    lower-cased, and ``sentiment_score`` is added (negative=0, neutral=1,
    positive=2). Any other label, including 'na', gets a NaN score.

    The ratings are coerced to numbers for the comparison only, so ratings
    stored as text (e.g. "4.5") no longer raise ``TypeError``; the
    ``overall_rating`` column itself is left untouched.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.
            It is modified in place.

    Returns:
        pandas.DataFrame: The same *df* object with ``sentiment`` normalised
        and ``sentiment_score`` added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Coerce first: comparing a raw string rating with a float would raise.
    numeric_rating = pd.to_numeric(df['overall_rating'], errors='coerce')
    derived_sentiment = numeric_rating.apply(rating_to_sentiment)

    df['sentiment'] = df['sentiment'].fillna(derived_sentiment)
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """
    Add a ``weighted_rating`` column that blends each rating with the mean.

    The value is ``(rating * 1 + C * m) / (1 + m)`` where ``C`` is the mean
    of ``overall_rating`` and ``m`` defaults to the 25th percentile of
    ``overall_rating``. The per-row weight is fixed at 1, so the result is a
    linear rescaling of the rating towards the mean.

    *df* is modified in place and returned.
    """
    ratings = df['overall_rating']
    prior_mean = ratings.mean()
    prior_weight = ratings.quantile(0.25) if m is None else m

    blended = (ratings * 1.0 + prior_mean * prior_weight) / (1 + prior_weight)
    df['weighted_rating'] = blended
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def _oversample_to(frame, target_size, label_col='sentiment', random_state=42):
    """Return *frame* with every class smaller than *target_size* upsampled
    (with replacement) to exactly *target_size* rows; larger classes are kept as is."""
    parts = []
    for label in Counter(frame[label_col]):
        subset = frame[frame[label_col] == label]
        if len(subset) < target_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=target_size,
                random_state=random_state
            )
        parts.append(subset)
    return pd.concat(parts)


def train_sentiment_model(df):
    """Train, evaluate and save a random-forest sentiment classifier.

    The single feature is ``weighted_rating`` (from ``compute_weighted_rating``)
    and the target is the ``sentiment`` label. Classes smaller than the mean
    class size are oversampled up to that mean before fitting.

    The cross-validated report is produced with the same estimator type that
    is saved (a ``RandomForestClassifier``), so the printed "Random Forest"
    metrics describe the model that is actually returned. Evaluation uses
    stratified K-fold on the *original* rows; oversampling is applied to the
    training part of each fold only, so a duplicated row can never be in
    both the training and the test fold. The number of folds is
    ``min(5, smallest original class size)``; with fewer than 2 folds
    possible, a train-set report of the final model is printed instead.

    Args:
        df: DataFrame with ``overall_rating`` and ``sentiment`` columns.

    Returns:
        tuple: ``(final_model, label_encoder)``. Both are also written to
        ``sentiment_model.pkl`` and ``label_encoder.pkl``.

    Raises:
        ValueError: If no row has both a numeric rating and a sentiment.
    """
    from sklearn.ensemble import RandomForestClassifier

    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Keep only usable rows: numeric rating plus a sentiment label.
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    df_model = compute_weighted_rating(df_model)

    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Fit the encoder on all labels once so every fold shares the same codes.
    le = LabelEncoder()
    le.fit(df_model['sentiment'])
    all_labels = np.arange(len(le.classes_))

    def build_model():
        return RandomForestClassifier(
            n_estimators=200,
            max_depth=5,
            random_state=42,
            class_weight='balanced'
        )

    def balance(frame):
        # Oversample classes below the mean class size of *frame* up to that mean.
        counts = Counter(frame['sentiment'])
        return _oversample_to(frame, int(np.mean(list(counts.values()))))

    def to_xy(frame):
        return frame[['weighted_rating']].values, le.transform(frame['sentiment'])

    # Final model: trained on the balanced full data set.
    X, y = to_xy(balance(df_model))
    final_model = build_model()
    final_model.fit(X, y)

    # Fold count is limited by the rarest class in the un-duplicated data.
    n_splits = min(5, min(class_counts.values()))

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y,
            final_model.predict(X),
            labels=all_labels,
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        X_all, y_all = to_xy(df_model)
        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X_all, y_all):
            # Balance the training rows only; the test rows stay untouched.
            X_train, y_train = to_xy(balance(df_model.iloc[train_idx]))

            model = build_model()
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_all[test_idx]))
            y_true.extend(y_all[test_idx])

        print("\nRandom Forest B&C:\n")
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=all_labels,
            target_names=le.classes_,
            zero_division=0
        ))

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the artifacts used for recommendations.

    Steps: keep the recommendation columns, parse price (after removing
    '₹', commas and whitespace) and rating as numbers, drop rows missing any
    required value, min-max scale rating/sentiment score/price, and compute
    their pairwise cosine similarity. Sentence embeddings of
    "<product_name> <category>" are then computed with 'all-MiniLM-L6-v2'.

    Files written: products_data.pkl, similarity_matrix.pkl, scaler.pkl,
    product_embeddings.pkl, embedding_model.pkl.

    Returns the cleaned DataFrame, including the ``text_for_embedding``
    column (which is added after products_data.pkl has been written).
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    keep_cols = ['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']
    df = df[keep_cols].copy()

    # Parse numeric columns; anything unparseable becomes NaN.
    price_text = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(price_text, errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Rows missing any of these cannot be scored or displayed.
    required_cols = ['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    df = df.dropna(subset=required_cols)

    # Scale the numeric features to [0, 1] and compare products pairwise.
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(df[['overall_rating', 'sentiment_score', 'product_price']])
    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Embed "<name> <category>" for semantic search.
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    name_text = df['product_name'].astype(str)
    category_text = df['category'].astype(str)
    df['text_for_embedding'] = name_text + " " + category_text
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (RANKED BY SEMANTIC SIMILARITY)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the *top_n* products whose text is most similar to *user_input*.

    The query is embedded with the saved sentence-embedding model and ranked
    by cosine similarity against the pre-computed product embeddings. The
    ranking is purely semantic: the numeric similarity matrix describes
    product-to-product similarity and cannot score a free-text query, so it
    is not loaded here.

    Args:
        user_input: Free-text query (e.g. a product name).
        top_n: Maximum number of products to print.

    Returns:
        None. Results are printed.
    """
    # NOTE(review): joblib files are pickles; only load artifacts produced by
    # prepare_recommendation_data or another trusted source.
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Ask for a NumPy vector: a torch tensor (possibly on GPU) cannot be
    # passed to scikit-learn's cosine_similarity.
    query_vec = np.asarray(model_embed.encode(user_input, convert_to_numpy=True))
    semantic_scores = cosine_similarity(query_vec.reshape(1, -1), embeddings)[0]

    # Highest score first; a stable sort keeps the original row order for ties.
    ranked = np.argsort(-semantic_scores, kind="stable")[:top_n]

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked:
        p = df.iloc[idx]
        score = semantic_scores[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(score, 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    DATA_FOLDER = r"/content/beauty and care"

    # Load data and normalise the sentiment labels
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that inputs such as "exit " or " Exit" still end the loop.
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # Nothing to search for; ask again instead of embedding an empty query.
            continue
        recommend_products(name)


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

Random Forest B&C:


Accuracy: 0.7808

Classification Report:

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       171
     neutral       0.00      0.00      0.00       171
    positive       0.72      1.00      0.84       438

    accuracy                           0.78       780
   macro avg       0.57      0.67      0.61       780
weighted avg       0.62      0.78      0.69       780

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---




Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#MLP classifier
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Load every ``*.csv`` file directly inside *folder_path* into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible (``glob`` returns matches in arbitrary order). The folder
    name is escaped so glob metacharacters in it (``[``, ``*``, ``?``) are
    matched literally instead of silently matching nothing.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        pandas.DataFrame: All files concatenated, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` file.
    """
    pattern = os.path.join(glob.escape(os.fspath(folder_path)), "*.csv")
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from the rating and add a numeric score.

    Missing ``sentiment`` values are derived from ``overall_rating``
    (>= 4.0 -> 'positive', >= 3.0 -> 'neutral', lower -> 'negative',
    missing/unparseable rating -> 'na'). Labels are then stripped and
    lower-cased, and ``sentiment_score`` is added (negative=0, neutral=1,
    positive=2). Any other label, including 'na', gets a NaN score.

    The ratings are coerced to numbers for the comparison only, so ratings
    stored as text (e.g. "4.5") no longer raise ``TypeError``; the
    ``overall_rating`` column itself is left untouched.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.
            It is modified in place.

    Returns:
        pandas.DataFrame: The same *df* object with ``sentiment`` normalised
        and ``sentiment_score`` added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Coerce first: comparing a raw string rating with a float would raise.
    numeric_rating = pd.to_numeric(df['overall_rating'], errors='coerce')
    derived_sentiment = numeric_rating.apply(rating_to_sentiment)

    df['sentiment'] = df['sentiment'].fillna(derived_sentiment)
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """
    Add a ``weighted_rating`` column that blends each rating with the mean.

    The value is ``(rating * 1 + C * m) / (1 + m)`` where ``C`` is the mean
    of ``overall_rating`` and ``m`` defaults to the 25th percentile of
    ``overall_rating``. The per-row weight is fixed at 1, so the result is a
    linear rescaling of the rating towards the mean.

    *df* is modified in place and returned.
    """
    ratings = df['overall_rating']
    prior_mean = ratings.mean()
    prior_weight = ratings.quantile(0.25) if m is None else m

    blended = (ratings * 1.0 + prior_mean * prior_weight) / (1 + prior_weight)
    df['weighted_rating'] = blended
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def _oversample_to(frame, target_size, label_col='sentiment', random_state=42):
    """Return *frame* with every class smaller than *target_size* upsampled
    (with replacement) to exactly *target_size* rows; larger classes are kept as is."""
    parts = []
    for label in Counter(frame[label_col]):
        subset = frame[frame[label_col] == label]
        if len(subset) < target_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=target_size,
                random_state=random_state
            )
        parts.append(subset)
    return pd.concat(parts)


def train_sentiment_model(df):
    """Train, evaluate and save an MLP sentiment classifier.

    The single feature is ``weighted_rating`` (from ``compute_weighted_rating``)
    and the target is the ``sentiment`` label. Minority classes are
    oversampled up to the size of the largest class before fitting.

    The cross-validated report is produced with the same estimator type that
    is saved (an ``MLPClassifier``), so the printed "MLP" metrics describe
    the model that is actually returned. Evaluation uses stratified K-fold
    on the *original* rows; oversampling is applied to the training part of
    each fold only, so a duplicated row can never be in both the training
    and the test fold. The number of folds is
    ``min(5, smallest original class size)``; with fewer than 2 folds
    possible, a train-set report of the final model is printed instead.

    Args:
        df: DataFrame with ``overall_rating`` and ``sentiment`` columns.

    Returns:
        tuple: ``(final_model, label_encoder)``. Both are also written to
        ``sentiment_model.pkl`` and ``label_encoder.pkl``.

    Raises:
        ValueError: If no row has both a numeric rating and a sentiment.
    """
    from sklearn.neural_network import MLPClassifier

    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Keep only usable rows: numeric rating plus a sentiment label.
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    df_model = compute_weighted_rating(df_model)

    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Fit the encoder on all labels once so every fold shares the same codes.
    le = LabelEncoder()
    le.fit(df_model['sentiment'])
    all_labels = np.arange(len(le.classes_))

    def build_model():
        return MLPClassifier(
            hidden_layer_sizes=(32, 16),
            activation='relu',
            max_iter=500,
            random_state=42
        )

    def balance(frame):
        # Oversample every class up to the largest class present in *frame*.
        counts = Counter(frame['sentiment'])
        return _oversample_to(frame, max(counts.values()))

    def to_xy(frame):
        return frame[['weighted_rating']].values, le.transform(frame['sentiment'])

    # Final model: trained on the fully balanced data set.
    X, y = to_xy(balance(df_model))
    final_model = build_model()
    final_model.fit(X, y)

    # Fold count is limited by the rarest class in the un-duplicated data.
    n_splits = min(5, min(class_counts.values()))

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y,
            final_model.predict(X),
            labels=all_labels,
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        X_all, y_all = to_xy(df_model)
        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X_all, y_all):
            # Balance the training rows only; the test rows stay untouched.
            X_train, y_train = to_xy(balance(df_model.iloc[train_idx]))

            model = build_model()
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_all[test_idx]))
            y_true.extend(y_all[test_idx])

        print("\nMLP B&C:\n")
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=all_labels,
            target_names=le.classes_,
            zero_division=0
        ))

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the artifacts used for recommendations.

    Steps: keep the recommendation columns, parse price (after removing
    '₹', commas and whitespace) and rating as numbers, drop rows missing any
    required value, min-max scale rating/sentiment score/price, and compute
    their pairwise cosine similarity. Sentence embeddings of
    "<product_name> <category>" are then computed with 'all-MiniLM-L6-v2'.

    Files written: products_data.pkl, similarity_matrix.pkl, scaler.pkl,
    product_embeddings.pkl, embedding_model.pkl.

    Returns the cleaned DataFrame, including the ``text_for_embedding``
    column (which is added after products_data.pkl has been written).
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    keep_cols = ['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']
    df = df[keep_cols].copy()

    # Parse numeric columns; anything unparseable becomes NaN.
    price_text = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(price_text, errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Rows missing any of these cannot be scored or displayed.
    required_cols = ['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    df = df.dropna(subset=required_cols)

    # Scale the numeric features to [0, 1] and compare products pairwise.
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(df[['overall_rating', 'sentiment_score', 'product_price']])
    similarity_matrix = cosine_similarity(features_scaled)

    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Embed "<name> <category>" for semantic search.
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    name_text = df['product_name'].astype(str)
    category_text = df['category'].astype(str)
    df['text_for_embedding'] = name_text + " " + category_text
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)

    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (RANKED BY SEMANTIC SIMILARITY)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the ``top_n`` products whose text is semantically closest to ``user_input``.

    Products are ranked by cosine similarity between the sentence embedding of
    ``user_input`` and the precomputed embeddings in ``product_embeddings.pkl``.
    Rows of ``products_data.pkl`` are looked up by position, so both files must
    have been written by the same preparation run.

    Args:
        user_input: Free-text query, e.g. a product name.
        top_n: Maximum number of products to print.

    Returns:
        None. The ranking is written to stdout.

    Raises:
        ValueError: If the product table and the embeddings have different
            lengths (artifacts from different runs).
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # this notebook produced itself.
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # The numeric similarity matrix is deliberately not loaded: the ranking
    # never used it, and reading an n x n matrix per query is wasted work.

    if len(df) != len(embeddings):
        raise ValueError(
            f"products_data.pkl has {len(df)} rows but product_embeddings.pkl "
            f"has {len(embeddings)}; re-run prepare_recommendation_data."
        )

    # Encode to a NumPy array: scikit-learn cannot consume a torch tensor that
    # lives on a GPU device, which is what convert_to_tensor=True may return.
    input_embedding = model_embed.encode(user_input, convert_to_numpy=True)
    semantic_scores = cosine_similarity(input_embedding.reshape(1, -1), embeddings)[0]

    # sorted() is stable, so equally scored products keep their table order.
    ranked = sorted(enumerate(semantic_scores), key=lambda pair: pair[1], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx, score in ranked[:top_n]:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(score, 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    # Folder holding the CSV exports for the category being modelled.
    DATA_FOLDER = r"/content/beauty and care"

    # Read all CSVs, then normalise / back-fill their sentiment labels.
    df = fill_and_encode_sentiment(load_datasets_from_folder(DATA_FOLDER))

    # Fit and persist the sentiment classifier.
    train_sentiment_model(df)

    # Persist the numeric and embedding artifacts used by the recommender.
    prepare_recommendation_data(df)

    # Answer queries until the user types "exit" (any letter case).
    name = input("\nEnter product name for recommendation (or type exit): ")
    while name.lower() != "exit":
        recommend_products(name)
        name = input("\nEnter product name for recommendation (or type exit): ")


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

MLP B&C:


Accuracy: 0.6948

Classification Report:

              precision    recall  f1-score   support

    negative       0.98      1.00      0.99       438
     neutral       1.00      0.08      0.16       438
    positive       0.53      1.00      0.69       438

    accuracy                           0.69      1314
   macro avg       0.84      0.69      0.61      1314
weighted avg       0.84      0.69      0.61      1314

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#KNN(nonmodified)
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result,
    and with it any seeded resampling or splitting done afterwards, does not
    depend on the order in which the filesystem lists the files.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        One DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``.csv`` file.
    """
    # glob.escape keeps characters such as "[" in the folder name literal.
    pattern = os.path.join(glob.escape(folder_path), "*.csv")
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Back-fill missing sentiment labels from the rating and add a numeric score.

    Missing ``sentiment`` values are derived from ``overall_rating``
    (>= 4 positive, >= 3 neutral, otherwise negative, ``'na'`` when the rating
    is missing or not a number). Labels are then stripped and lower-cased, and
    mapped to ``sentiment_score`` (negative 0, neutral 1, positive 2); any
    other label yields NaN.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns. It is
            modified in place.

    Returns:
        The same DataFrame with ``sentiment`` normalised and
        ``sentiment_score`` added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Ratings read from CSV can be text (e.g. "4.5"); comparing a string with a
    # float raises TypeError, so coerce a working copy first. The original
    # column is left untouched.
    numeric_rating = pd.to_numeric(df['overall_rating'], errors='coerce')

    df['sentiment'] = df['sentiment'].fillna(numeric_rating.apply(rating_to_sentiment))
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """Add a ``weighted_rating`` column that shrinks each rating toward the mean.

    The value is ``(overall_rating + C * m) / (1 + m)`` where ``C`` is the mean
    of ``overall_rating`` and ``m`` is the weight given to that mean.

    Args:
        df: DataFrame with a numeric ``overall_rating`` column; modified in place.
        m: Weight of the mean. When ``None`` the 25th percentile of
            ``overall_rating`` is used.

    Returns:
        The same DataFrame, with ``weighted_rating`` added.
    """
    ratings = df['overall_rating']
    C = ratings.mean()
    weight = ratings.quantile(0.25) if m is None else m

    numerator = ratings * 1.0 + C * weight
    df['weighted_rating'] = numerator / (1 + weight)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Evaluate a rating-based sentiment classifier, then fit and persist an MLP.

    Pipeline: keep ``overall_rating`` / ``sentiment``, coerce the rating to a
    number, drop incomplete rows, add ``weighted_rating``, oversample every
    class up to the size of the largest one, print a LogisticRegression
    evaluation, then fit an ``MLPClassifier`` on all balanced rows and write
    ``sentiment_model.pkl`` and ``label_encoder.pkl``.

    Args:
        df: DataFrame with ``overall_rating`` and ``sentiment`` columns.

    Returns:
        Tuple ``(final_model, le)``: the fitted MLP and the fitted LabelEncoder.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Select the required columns and make the rating numeric.
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()

    # Add the mean-shrunk rating used as the single model feature.
    df_model = compute_weighted_rating(df_model)

    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Oversample (with replacement) every class up to the largest class.
    # NOTE(review): this happens before the CV split, so copies of one row can
    # land in both train and test folds; the CV metrics are likely optimistic.
    max_size = max(class_counts.values())

    balanced_dfs = []
    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # Features and integer-encoded labels.
    X = df_balanced[['weighted_rating']].values
    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Never ask for more folds than the smallest class can fill.
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y,
            model.predict(X),
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        # Pool the out-of-fold predictions into a single report.
        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)
        print("\nKNN B&C:\n")
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))

    # Final model, fitted on every balanced row.
    # NOTE(review): the report above describes LogisticRegression, not this MLP.
    from sklearn.neural_network import MLPClassifier

    # Bind the estimator to final_model: the fit/dump/return below all use
    # that name, and binding it to `model` raised NameError.
    final_model = MLPClassifier(
        hidden_layer_sizes=(32,16),
        activation='relu',
        max_iter=500,
        random_state=42
    )

    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the nearest-neighbour and embedding artifacts.

    Writes ``products_data.pkl`` (cleaned rows), ``scaler.pkl`` (min-max scaler
    of rating / sentiment score / price), ``nn_model.pkl`` (cosine
    NearestNeighbors index fitted on the scaled features),
    ``product_embeddings.pkl`` and ``embedding_model.pkl``. All row-wise
    artifacts share the row order of ``products_data.pkl``.

    Args:
        df: DataFrame holding at least the product name, price, sentiment,
            sentiment score, category and overall rating columns.

    Returns:
        The cleaned DataFrame, including the ``text_for_embedding`` column.
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df[['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']].copy()

    # Clean numeric columns: drop the currency symbol, commas and whitespace
    # from prices; anything still unparseable becomes NaN.
    df['product_price'] = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Drop missing critical values
    df = df.dropna(subset=['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category'])

    # Train MinMax scaler for numeric features
    features = df[['overall_rating','sentiment_score','product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Persist the cleaned table and the scaler together with the index and
    # embeddings written below. Without the table the positional indices have
    # nothing to point at; without the scaler the index cannot be queried
    # with new feature values.
    joblib.dump(df, "products_data.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Save numeric similarity as a nearest-neighbour index.
    from sklearn.neighbors import NearestNeighbors

    # Cap the default neighbour count so small catalogues stay queryable.
    nn_model = NearestNeighbors(n_neighbors=min(10, len(df)), metric='cosine')
    nn_model.fit(features_scaled)
    joblib.dump(nn_model, "nn_model.pkl")

    # Precompute embeddings for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC + NUMERIC)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the ``top_n`` products whose text is semantically closest to ``user_input``.

    Products are ranked by cosine similarity between the sentence embedding of
    ``user_input`` and the precomputed embeddings in ``product_embeddings.pkl``.
    Rows of ``products_data.pkl`` are looked up by position, so both files must
    have been written by the same preparation run.

    Args:
        user_input: Free-text query, e.g. a product name.
        top_n: Maximum number of products to print.

    Returns:
        None. The ranking is written to stdout.

    Raises:
        ValueError: If the product table and the embeddings have different
            lengths (artifacts from different runs).
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # this notebook produced itself.
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # No similarity_matrix.pkl is loaded: the ranking never used it, so
    # requiring the file only made the call fail when it was absent.

    if len(df) != len(embeddings):
        raise ValueError(
            f"products_data.pkl has {len(df)} rows but product_embeddings.pkl "
            f"has {len(embeddings)}; re-run prepare_recommendation_data."
        )

    # Encode to a NumPy array: scikit-learn cannot consume a torch tensor that
    # lives on a GPU device, which is what convert_to_tensor=True may return.
    input_embedding = model_embed.encode(user_input, convert_to_numpy=True)
    semantic_scores = cosine_similarity(input_embedding.reshape(1, -1), embeddings)[0]

    # sorted() is stable, so equally scored products keep their table order.
    ranked = sorted(enumerate(semantic_scores), key=lambda pair: pair[1], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx, score in ranked[:top_n]:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(score, 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    # Source folder with one CSV per scraped product listing.
    DATA_FOLDER = r"/content/beauty and care"

    # Stage 1: load the raw rows, then clean up their sentiment labels.
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Stage 2: sentiment classifier, then recommendation artifacts.
    train_sentiment_model(df)
    prepare_recommendation_data(df)

    # Stage 3: interactive queries; typing "exit" in any case ends the session.
    while True:
        name = input("\nEnter product name for recommendation (or type exit): ")
        if name.lower() != "exit":
            recommend_products(name)
            continue
        break


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

KNN B&C:


Classification Report:

              precision    recall  f1-score   support

    negative       0.98      1.00      0.99       438
     neutral       1.00      0.08      0.16       438
    positive       0.53      1.00      0.69       438

    accuracy                           0.69      1314
   macro avg       0.84      0.69      0.61      1314
weighted avg       0.84      0.69      0.61      1314



NameError: name 'final_model' is not defined

In [None]:
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result,
    and with it any seeded resampling or splitting done afterwards, does not
    depend on the order in which the filesystem lists the files.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        One DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``.csv`` file.
    """
    # glob.escape keeps characters such as "[" in the folder name literal.
    pattern = os.path.join(glob.escape(folder_path), "*.csv")
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Back-fill missing sentiment labels from the rating and add a numeric score.

    Missing ``sentiment`` values are derived from ``overall_rating``
    (>= 4 positive, >= 3 neutral, otherwise negative, ``'na'`` when the rating
    is missing or not a number). Labels are then stripped and lower-cased, and
    mapped to ``sentiment_score`` (negative 0, neutral 1, positive 2); any
    other label yields NaN.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns. It is
            modified in place.

    Returns:
        The same DataFrame with ``sentiment`` normalised and
        ``sentiment_score`` added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Ratings read from CSV can be text (e.g. "4.5"); comparing a string with a
    # float raises TypeError, so coerce a working copy first. The original
    # column is left untouched.
    numeric_rating = pd.to_numeric(df['overall_rating'], errors='coerce')

    df['sentiment'] = df['sentiment'].fillna(numeric_rating.apply(rating_to_sentiment))
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """Blend each rating with the overall mean and store it as ``weighted_rating``.

    Formula: ``(overall_rating + C * m) / (1 + m)``, with ``C`` the mean of
    ``overall_rating`` and ``m`` the weight of that mean.

    Args:
        df: DataFrame with a numeric ``overall_rating`` column; modified in place.
        m: Weight of the mean; defaults to the 25th percentile of
            ``overall_rating`` when ``None``.

    Returns:
        The same DataFrame, with ``weighted_rating`` added.
    """
    if m is None:
        m = df['overall_rating'].quantile(0.25)
    mean_rating = df['overall_rating'].mean()

    blended = df['overall_rating'] * 1.0 + mean_rating * m
    df['weighted_rating'] = blended / (1 + m)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Evaluate a rating-based sentiment classifier, then fit and persist an MLP.

    The rating is coerced to a number, incomplete rows are dropped,
    ``weighted_rating`` is added and every class is oversampled to the size of
    the largest one. A LogisticRegression is evaluated (stratified CV when each
    class can fill at least two folds, otherwise on the training set), then an
    ``MLPClassifier`` is fitted on all balanced rows and written to
    ``sentiment_model.pkl`` alongside ``label_encoder.pkl``.

    Args:
        df: DataFrame with ``overall_rating`` and ``sentiment`` columns.

    Returns:
        Tuple ``(final_model, le)``: the fitted MLP and the fitted LabelEncoder.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    def new_logreg():
        # Evaluation-only estimator; a fresh instance is built for every fit.
        return LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )

    def print_report(actual, predicted):
        print(classification_report(
            actual,
            predicted,
            target_names=le.classes_,
            zero_division=0
        ))

    # Required columns only, numeric rating, complete rows, derived feature.
    frame = df[['overall_rating', 'sentiment']].copy()
    frame['overall_rating'] = pd.to_numeric(frame['overall_rating'], errors='coerce')
    frame = frame.dropna()
    frame = compute_weighted_rating(frame)

    class_counts = Counter(frame['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Oversample with replacement so every class matches the largest one.
    target_size = max(class_counts.values())
    pieces = []
    for label in class_counts:
        rows = frame[frame['sentiment'] == label]
        if len(rows) < target_size:
            rows = resample(rows, replace=True, n_samples=target_size, random_state=42)
        pieces.append(rows)
    balanced = pd.concat(pieces)

    # Single feature column and integer-encoded labels.
    X = balanced[['weighted_rating']].values
    le = LabelEncoder()
    y = le.fit_transform(balanced['sentiment'])

    # Use at most five folds, and never more than the smallest class can fill.
    n_splits = min(5, min(Counter(y).values()))

    if n_splits >= 2:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        # Pool the out-of-fold predictions into a single report.
        y_true, y_pred = [], []
        for train_idx, test_idx in skf.split(X, y):
            fold_model = new_logreg()
            fold_model.fit(X[train_idx], y[train_idx])
            y_pred.extend(fold_model.predict(X[test_idx]))
            y_true.extend(y[test_idx])

        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print_report(y_true, y_pred)
    else:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = new_logreg()
        model.fit(X, y)

        print("\nClassification Report (Train Set):\n")
        print_report(y, model.predict(X))

    # Final model, fitted on every balanced row and persisted with its encoder.
    from sklearn.neural_network import MLPClassifier

    final_model = MLPClassifier(
        hidden_layer_sizes=(32,16),
        activation='relu',
        max_iter=500,
        random_state=42
    )
    final_model.fit(X, y)

    for artifact, filename in ((final_model, "sentiment_model.pkl"), (le, "label_encoder.pkl")):
        joblib.dump(artifact, filename)

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the nearest-neighbour and embedding artifacts.

    Writes ``products_data.pkl`` (cleaned rows), ``scaler.pkl`` (min-max scaler
    of rating / sentiment score / price), ``nn_model.pkl`` (cosine
    NearestNeighbors index fitted on the scaled features),
    ``product_embeddings.pkl`` and ``embedding_model.pkl``. All row-wise
    artifacts share the row order of ``products_data.pkl``.

    Args:
        df: DataFrame holding at least the product name, price, sentiment,
            sentiment score, category and overall rating columns.

    Returns:
        The cleaned DataFrame, including the ``text_for_embedding`` column.
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df[['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']].copy()

    # Clean numeric columns: drop the currency symbol, commas and whitespace
    # from prices; anything still unparseable becomes NaN.
    df['product_price'] = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Drop missing critical values
    df = df.dropna(subset=['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category'])

    # Train MinMax scaler for numeric features
    features = df[['overall_rating','sentiment_score','product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Persist the cleaned table and the scaler together with the index and
    # embeddings written below. Without the table the positional indices have
    # nothing to point at; without the scaler the index cannot be queried
    # with new feature values.
    joblib.dump(df, "products_data.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Save numeric similarity as a nearest-neighbour index.
    from sklearn.neighbors import NearestNeighbors

    # Cap the default neighbour count so small catalogues stay queryable.
    nn_model = NearestNeighbors(n_neighbors=min(10, len(df)), metric='cosine')
    nn_model.fit(features_scaled)
    joblib.dump(nn_model, "nn_model.pkl")

    # Precompute embeddings for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC + NUMERIC)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the ``top_n`` products whose text is semantically closest to ``user_input``.

    Products are ranked by cosine similarity between the sentence embedding of
    ``user_input`` and the precomputed embeddings in ``product_embeddings.pkl``.
    Rows of ``products_data.pkl`` are looked up by position, so both files must
    have been written by the same preparation run.

    Args:
        user_input: Free-text query, e.g. a product name.
        top_n: Maximum number of products to print.

    Returns:
        None. The ranking is written to stdout.

    Raises:
        ValueError: If the product table and the embeddings have different
            lengths (artifacts from different runs).
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # this notebook produced itself.
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # No similarity_matrix.pkl is loaded: the ranking never used it, so
    # requiring the file only made the call fail when it was absent.

    if len(df) != len(embeddings):
        raise ValueError(
            f"products_data.pkl has {len(df)} rows but product_embeddings.pkl "
            f"has {len(embeddings)}; re-run prepare_recommendation_data."
        )

    # Encode to a NumPy array: scikit-learn cannot consume a torch tensor that
    # lives on a GPU device, which is what convert_to_tensor=True may return.
    input_embedding = model_embed.encode(user_input, convert_to_numpy=True)
    semantic_scores = cosine_similarity(input_embedding.reshape(1, -1), embeddings)[0]

    # sorted() is stable, so equally scored products keep their table order.
    ranked = sorted(enumerate(semantic_scores), key=lambda pair: pair[1], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx, score in ranked[:top_n]:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(score, 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    # Folder holding the CSV exports for the category being modelled.
    DATA_FOLDER = r"/content/beauty and care"

    # Read all CSVs, then normalise / back-fill their sentiment labels.
    df = fill_and_encode_sentiment(load_datasets_from_folder(DATA_FOLDER))

    # Fit and persist the sentiment classifier.
    train_sentiment_model(df)

    # Persist the numeric and embedding artifacts used by the recommender.
    prepare_recommendation_data(df)

    # Answer queries until the user types "exit" (any letter case).
    name = input("\nEnter product name for recommendation (or type exit): ")
    while name.lower() != "exit":
        recommend_products(name)
        name = input("\nEnter product name for recommendation (or type exit): ")


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

Accuracy: 0.6948

Classification Report:

              precision    recall  f1-score   support

    negative       0.98      1.00      0.99       438
     neutral       1.00      0.08      0.16       438
    positive       0.53      1.00      0.69       438

    accuracy                           0.69      1314
   macro avg       0.84      0.69      0.61      1314
weighted avg       0.84      0.69      0.61      1314

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#modified code
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result,
    and with it any seeded resampling or splitting done afterwards, does not
    depend on the order in which the filesystem lists the files.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        One DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``.csv`` file.
    """
    # glob.escape keeps characters such as "[" in the folder name literal.
    pattern = os.path.join(glob.escape(folder_path), "*.csv")
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat([pd.read_csv(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Back-fill missing sentiment labels from the rating and add a numeric score.

    Missing ``sentiment`` values are derived from ``overall_rating``
    (>= 4 positive, >= 3 neutral, otherwise negative, ``'na'`` when the rating
    is missing or not a number). Labels are then stripped and lower-cased, and
    mapped to ``sentiment_score`` (negative 0, neutral 1, positive 2); any
    other label yields NaN.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns. It is
            modified in place.

    Returns:
        The same DataFrame with ``sentiment`` normalised and
        ``sentiment_score`` added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Ratings read from CSV can be text (e.g. "4.5"); comparing a string with a
    # float raises TypeError, so coerce a working copy first. The original
    # column is left untouched.
    numeric_rating = pd.to_numeric(df['overall_rating'], errors='coerce')

    df['sentiment'] = df['sentiment'].fillna(numeric_rating.apply(rating_to_sentiment))
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """Add a ``weighted_rating`` column that pulls each rating toward the mean.

    Each value is ``(overall_rating + C * m) / (1 + m)``: ``C`` is the mean of
    ``overall_rating`` and ``m`` is how strongly that mean counts.

    Args:
        df: DataFrame with a numeric ``overall_rating`` column; modified in place.
        m: Weight of the mean. ``None`` selects the 25th percentile of
            ``overall_rating``.

    Returns:
        The same DataFrame, with ``weighted_rating`` added.
    """
    column = df['overall_rating']
    C = column.mean()
    if m is None:
        m = column.quantile(0.25)

    df['weighted_rating'] = (column * 1.0 + C * m) / (1 + m)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Evaluate, fit and persist the rating-based LogisticRegression sentiment model.

    The rating is coerced to a number, incomplete rows are dropped,
    ``weighted_rating`` and ``rating_diff`` are added, every class is
    oversampled to the size of the largest one and the three features are
    min-max scaled. A liblinear LogisticRegression is evaluated (stratified CV
    when each class can fill at least two folds, otherwise on the training
    set); an lbfgs LogisticRegression is then fitted on all balanced rows.
    ``sentiment_model.pkl``, ``label_encoder.pkl`` and ``sentiment_scaler.pkl``
    are written.

    Args:
        df: DataFrame with ``overall_rating`` and ``sentiment`` columns.

    Returns:
        Tuple ``(final_model, le)``: the fitted classifier and LabelEncoder.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Select the required columns and make the rating numeric.
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()

    # Add the mean-shrunk rating.
    df_model = compute_weighted_rating(df_model)

    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Oversample (with replacement) every class up to the largest class.
    # NOTE(review): this happens before the CV split, so copies of one row can
    # land in both train and test folds; the CV metrics are likely optimistic.
    max_size = max(class_counts.values())

    balanced_dfs = []
    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # Features and integer-encoded labels.
    # NOTE(review): weighted_rating and rating_diff both look like affine
    # functions of overall_rating, so after min-max scaling the three columns
    # are presumably identical; confirm they add any signal.
    df_balanced['rating_diff'] = df_balanced['overall_rating'] - df_balanced['weighted_rating']

    X = df_balanced[
        ['overall_rating', 'weighted_rating', 'rating_diff']
    ].values

    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Scale before branching so the evaluation model, the persisted model and
    # the persisted scaler agree in both branches. The scaler used to exist
    # only in the CV branch, although it is always dumped below.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Never ask for more folds than the smallest class can fill.
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        # Report on the training data itself; this branch has no held-out
        # predictions to report on.
        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y,
            model.predict(X),
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        # Pool the out-of-fold predictions into a single report.
        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))

    # Final model, fitted on every balanced (scaled) row. `multi_class` is not
    # passed: 'auto' was already the default and the parameter is deprecated
    # in recent scikit-learn releases.
    final_model = LogisticRegression(
        max_iter=2000,
        solver='lbfgs',
        class_weight='balanced'
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")
    joblib.dump(scaler, "sentiment_scaler.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist every artifact used for recommendations.

    Writes, in this order: ``products_data.pkl`` (cleaned rows),
    ``similarity_matrix.pkl`` (pairwise cosine similarity of the min-max scaled
    rating / sentiment score / price features), ``scaler.pkl``,
    ``product_embeddings.pkl`` and ``embedding_model.pkl``.

    Args:
        df: DataFrame holding at least the product name, price, sentiment,
            sentiment score, category and overall rating columns.

    Returns:
        The cleaned DataFrame, including the ``text_for_embedding`` column
        (which is added after ``products_data.pkl`` has been written).
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    keep_columns = ['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']
    df = df[keep_columns].copy()

    # Prices arrive as text such as "₹1,299"; strip the symbol, commas and
    # whitespace, then parse. Unparseable values become NaN and are dropped below.
    price_text = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(price_text, errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    required_columns = ['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    df = df.dropna(subset=required_columns)

    # Scale the numeric features to [0, 1] and compare products pairwise.
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(df[['overall_rating','sentiment_score','product_price']])
    similarity_matrix = cosine_similarity(features_scaled)

    for artifact, filename in (
        (df, "products_data.pkl"),
        (similarity_matrix, "similarity_matrix.pkl"),
        (scaler, "scaler.pkl"),
    ):
        joblib.dump(artifact, filename)

    # Sentence embeddings of "<name> <category>" for the semantic search.
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    sentences = df['text_for_embedding'].tolist()
    embeddings = model_embed.encode(sentences, convert_to_tensor=True)
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC + NUMERIC)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the products whose embeddings are closest to a text query.

    Products are ranked purely by cosine similarity between the sentence
    embedding of ``user_input`` and the precomputed product embeddings.
    Reads ``products_data.pkl``, ``product_embeddings.pkl`` and
    ``embedding_model.pkl`` from the current working directory.

    Args:
        user_input: Free-text product name or description to search for.
        top_n: Maximum number of products to print.

    Returns:
        None. The recommendations are printed.

    Raises:
        ValueError: If the stored product table and embeddings do not have
            the same number of rows (artefacts from different runs).
    """
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Row i of the embeddings must describe row i of the table; a mismatch
    # would otherwise print the wrong product for a score.
    if len(df) != len(embeddings):
        raise ValueError(
            f"products_data.pkl has {len(df)} rows but product_embeddings.pkl "
            f"has {len(embeddings)}; re-run prepare_recommendation_data."
        )

    # Encode straight to NumPy: a tensor that lives on a GPU cannot be passed
    # to scikit-learn. A one-element list yields the (1, dim) shape directly.
    query_embedding = model_embed.encode([user_input], convert_to_numpy=True)
    semantic_scores = cosine_similarity(query_embedding, embeddings)[0]

    # sorted() is stable, so equally scored products keep their table order.
    ranked = sorted(
        range(len(df)),
        key=lambda i: semantic_scores[i],
        reverse=True,
    )[:top_n]

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(semantic_scores[idx], 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    # Folder holding the scraped CSV files for this product vertical.
    DATA_FOLDER = r"/content/beauty and care"

    # Load data
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that "exit " or " exit" still terminates the loop.
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # Nothing typed: ask again instead of searching for an empty string.
            continue
        recommend_products(name)


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

Accuracy: 0.9954

Classification Report:

              precision    recall  f1-score   support

    negative       0.99      1.00      0.99       438
     neutral       1.00      0.99      0.99       438
    positive       1.00      1.00      1.00       438

    accuracy                           1.00      1314
   macro avg       1.00      1.00      1.00      1314
weighted avg       1.00      1.00      1.00      1314

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---




Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#modified code(beauty and care)
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Load every ``*.csv`` file in a folder into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    is the same on every run; ``glob`` on its own returns paths in arbitrary
    order.

    Args:
        folder_path: Directory that directly contains the CSV files.

    Returns:
        A DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` files.
    """
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not all_files:
        # FileNotFoundError is a subclass of Exception, so callers that
        # caught the previous generic Exception keep working.
        raise FileNotFoundError("No CSV files found in the folder!")

    df_list = [pd.read_csv(path) for path in all_files]
    df = pd.concat(df_list, ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from ratings and add a numeric score.

    Rows whose ``sentiment`` is missing receive a label derived from
    ``overall_rating``: ``>= 4.0`` is positive, ``>= 3.0`` is neutral, lower
    is negative, and ``'na'`` is used when the rating is missing or cannot be
    parsed as a number. Labels are then stripped and lower-cased, and
    ``sentiment_score`` is set to 0 / 1 / 2 for negative / neutral / positive
    (any other label becomes NaN).

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.

    Returns:
        The same DataFrame with ``sentiment`` normalised and a
        ``sentiment_score`` column added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    # Ratings may arrive as strings (e.g. "4.2"); comparing those with a float
    # raises TypeError, so the fallback is derived from a numeric view. The
    # stored overall_rating column itself is left untouched.
    ratings = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Fill missing sentiment based on overall_rating
    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        if r >= 4.0:
            return 'positive'
        if r >= 3.0:
            return 'neutral'
        return 'negative'

    fallback = ratings.apply(rating_to_sentiment)
    df['sentiment'] = df['sentiment'].fillna(fallback)
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """
    Add a ``weighted_rating`` column that pulls each rating toward the mean.

    The value is ``(rating * 1 + C * m) / (1 + m)``, i.e. the usual
    ``(v*R + m*C) / (v + m)`` form with the vote count ``v`` fixed at 1,
    where ``C`` is the mean of ``overall_rating``.

    Args:
        df: DataFrame with a numeric ``overall_rating`` column. It is
            modified in place.
        m: Weight given to the mean. When ``None``, the 25th percentile of
            ``overall_rating`` is used.

    Returns:
        The same DataFrame, now containing ``weighted_rating``.
    """
    mean_rating = df['overall_rating'].mean()
    prior_weight = df['overall_rating'].quantile(0.25) if m is None else m

    blended = df['overall_rating'] * 1.0 + mean_rating * prior_weight
    df['weighted_rating'] = blended / (1 + prior_weight)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Train and persist a rating-based sentiment classifier.

    Steps:
      1. Keep ``overall_rating`` (coerced to numeric) and ``sentiment`` and
         drop rows where either is missing.
      2. Add ``weighted_rating`` (via ``compute_weighted_rating``) and
         ``rating_diff`` features.
      3. Oversample each class, with replacement, to the size of the largest.
      4. Min-max scale the features and evaluate a logistic regression with
         stratified k-fold CV, or on the training set when a class is too
         small to split.
      5. Fit the final random forest on all balanced rows and save the model,
         label encoder and scaler with joblib.

    Args:
        df: DataFrame containing ``overall_rating`` and ``sentiment``.

    Returns:
        Tuple ``(final_model, le)``: the fitted ``RandomForestClassifier`` and
        the ``LabelEncoder`` that maps sentiment strings to class ids.

    Raises:
        ValueError: If no usable rows remain after dropping missing values.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # ===============================
    # SELECT REQUIRED COLUMNS
    # ===============================
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    # Copy so the columns added below are written to an independent frame.
    df_model = df_model.dropna().copy()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    # ===============================
    # COMPUTE MODIFIED RATING
    # ===============================
    df_model = compute_weighted_rating(df_model)

    # ===============================
    # CHECK CLASS DISTRIBUTION
    # ===============================
    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # ===============================
    # BALANCE DATA (CONTROLLED)
    # ===============================
    max_size = max(class_counts.values())

    balanced_dfs = []
    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # ===============================
    # FEATURES & LABELS
    # ===============================
    # Additional discriminative features
    df_balanced['rating_diff'] = df_balanced['overall_rating'] - df_balanced['weighted_rating']

    X = df_balanced[
        ['overall_rating', 'weighted_rating', 'rating_diff']
    ].values

    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Fit the scaler before branching: both evaluation paths and the final
    # model then see the same scaled features, and the scaler saved at the
    # end always exists (it used to be undefined on the no-CV path).
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # ===============================
    # AUTO SAFE SPLITS
    # ===============================
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        print("\nClassification Report (Train Set):\n")
        # Score the training rows themselves; the label ids come from the
        # encoder, so this works for any number of classes.
        print(classification_report(
            y,
            model.predict(X),
            labels=list(range(len(le.classes_))),
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        # NOTE(review): classes are oversampled before the folds are drawn, so
        # copies of one original row can land on both the training and the
        # test side of a split. The figures printed below are therefore
        # optimistic; resampling inside each training fold would avoid this.
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))

    # ===============================
    # FINAL MODEL
    # ===============================
    from sklearn.ensemble import RandomForestClassifier

    final_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")
    joblib.dump(scaler, "sentiment_scaler.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the recommendation artefacts.

    The numeric columns are parsed (currency symbol, commas and whitespace are
    stripped from ``product_price``), rows missing any critical value are
    dropped, and the following files are written with joblib to the current
    working directory:

    * ``products_data.pkl`` - the cleaned product rows
    * ``similarity_matrix.pkl`` - pairwise cosine similarity of the min-max
      scaled rating, sentiment score and price
    * ``scaler.pkl`` - the fitted ``MinMaxScaler``
    * ``product_embeddings.pkl`` - ``all-MiniLM-L6-v2`` embeddings of
      "<product_name> <category>", one row per product in the same order as
      ``products_data.pkl``
    * ``embedding_model.pkl`` - the ``SentenceTransformer`` instance

    Args:
        df: DataFrame with ``product_name``, ``product_price``, ``sentiment``,
            ``sentiment_score``, ``category`` and ``overall_rating`` columns.

    Returns:
        The cleaned DataFrame, with an extra ``text_for_embedding`` column
        that is not part of ``products_data.pkl``.

    Raises:
        ValueError: If no rows survive the cleaning step.
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df[['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']].copy()

    # Clean numeric columns
    df['product_price'] = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Drop missing critical values. The explicit copy makes the frame
    # independent, so the column added further down does not trigger
    # SettingWithCopyWarning.
    df = df.dropna(
        subset=['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    ).copy()
    if df.empty:
        # Fail with a readable message instead of an opaque scaler error.
        raise ValueError(
            "No products left after dropping rows with a missing name, price, "
            "sentiment score, rating or category."
        )

    # Train MinMax scaler for numeric features
    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Save numeric similarity
    similarity_matrix = cosine_similarity(features_scaled)
    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Precompute embeddings for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)
    # Move to CPU and store as NumPy so the file loads without a GPU.
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC SIMILARITY; NUMERIC SCORE NOT USED IN RANKING)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the products whose embeddings are closest to a text query.

    Products are ranked purely by cosine similarity between the sentence
    embedding of ``user_input`` and the precomputed product embeddings.
    Reads ``products_data.pkl``, ``product_embeddings.pkl`` and
    ``embedding_model.pkl`` from the current working directory.

    Args:
        user_input: Free-text product name or description to search for.
        top_n: Maximum number of products to print.

    Returns:
        None. The recommendations are printed.

    Raises:
        ValueError: If the stored product table and embeddings do not have
            the same number of rows (artefacts from different runs).
    """
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Row i of the embeddings must describe row i of the table; a mismatch
    # would otherwise print the wrong product for a score.
    if len(df) != len(embeddings):
        raise ValueError(
            f"products_data.pkl has {len(df)} rows but product_embeddings.pkl "
            f"has {len(embeddings)}; re-run prepare_recommendation_data."
        )

    # Encode straight to NumPy: a tensor that lives on a GPU cannot be passed
    # to scikit-learn. A one-element list yields the (1, dim) shape directly.
    query_embedding = model_embed.encode([user_input], convert_to_numpy=True)
    semantic_scores = cosine_similarity(query_embedding, embeddings)[0]

    # sorted() is stable, so equally scored products keep their table order.
    ranked = sorted(
        range(len(df)),
        key=lambda i: semantic_scores[i],
        reverse=True,
    )[:top_n]

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(semantic_scores[idx], 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    # Folder holding the scraped CSV files for this product vertical.
    DATA_FOLDER = r"/content/beauty and care"

    # Load data
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that "exit " or " exit" still terminates the loop.
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # Nothing typed: ask again instead of searching for an empty string.
            continue
        recommend_products(name)


Loaded 6 files with 600 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 438, 'neutral': 76, 'negative': 1})

Accuracy: 0.9954

Classification Report:

              precision    recall  f1-score   support

    negative       0.99      1.00      0.99       438
     neutral       1.00      0.99      0.99       438
    positive       1.00      1.00      1.00       438

    accuracy                           1.00      1314
   macro avg       1.00      1.00      1.00      1314
weighted avg       1.00      1.00      1.00      1314

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#modified code(electronics)
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Load every ``*.csv`` file in a folder into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    is the same on every run; ``glob`` on its own returns paths in arbitrary
    order.

    Args:
        folder_path: Directory that directly contains the CSV files.

    Returns:
        A DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` files.
    """
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not all_files:
        # FileNotFoundError is a subclass of Exception, so callers that
        # caught the previous generic Exception keep working.
        raise FileNotFoundError("No CSV files found in the folder!")

    df_list = [pd.read_csv(path) for path in all_files]
    df = pd.concat(df_list, ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from ratings and add a numeric score.

    Rows whose ``sentiment`` is missing receive a label derived from
    ``overall_rating``: ``>= 4.0`` is positive, ``>= 3.0`` is neutral, lower
    is negative, and ``'na'`` is used when the rating is missing or cannot be
    parsed as a number. Labels are then stripped and lower-cased, and
    ``sentiment_score`` is set to 0 / 1 / 2 for negative / neutral / positive
    (any other label becomes NaN).

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.

    Returns:
        The same DataFrame with ``sentiment`` normalised and a
        ``sentiment_score`` column added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    # Ratings may arrive as strings (e.g. "4.2"); comparing those with a float
    # raises TypeError, so the fallback is derived from a numeric view. The
    # stored overall_rating column itself is left untouched.
    ratings = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Fill missing sentiment based on overall_rating
    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        if r >= 4.0:
            return 'positive'
        if r >= 3.0:
            return 'neutral'
        return 'negative'

    fallback = ratings.apply(rating_to_sentiment)
    df['sentiment'] = df['sentiment'].fillna(fallback)
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """
    Add a ``weighted_rating`` column that pulls each rating toward the mean.

    The value is ``(rating * 1 + C * m) / (1 + m)``, i.e. the usual
    ``(v*R + m*C) / (v + m)`` form with the vote count ``v`` fixed at 1,
    where ``C`` is the mean of ``overall_rating``.

    Args:
        df: DataFrame with a numeric ``overall_rating`` column. It is
            modified in place.
        m: Weight given to the mean. When ``None``, the 25th percentile of
            ``overall_rating`` is used.

    Returns:
        The same DataFrame, now containing ``weighted_rating``.
    """
    mean_rating = df['overall_rating'].mean()
    prior_weight = df['overall_rating'].quantile(0.25) if m is None else m

    blended = df['overall_rating'] * 1.0 + mean_rating * prior_weight
    df['weighted_rating'] = blended / (1 + prior_weight)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Train and persist a rating-based sentiment classifier.

    Steps:
      1. Keep ``overall_rating`` (coerced to numeric) and ``sentiment`` and
         drop rows where either is missing.
      2. Add ``weighted_rating`` (via ``compute_weighted_rating``) and
         ``rating_diff`` features.
      3. Oversample each class, with replacement, to the size of the largest.
      4. Min-max scale the features and evaluate a logistic regression with
         stratified k-fold CV, or on the training set when a class is too
         small to split.
      5. Fit the final random forest on all balanced rows and save the model,
         label encoder and scaler with joblib.

    Args:
        df: DataFrame containing ``overall_rating`` and ``sentiment``.

    Returns:
        Tuple ``(final_model, le)``: the fitted ``RandomForestClassifier`` and
        the ``LabelEncoder`` that maps sentiment strings to class ids.

    Raises:
        ValueError: If no usable rows remain after dropping missing values.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # ===============================
    # SELECT REQUIRED COLUMNS
    # ===============================
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    # Copy so the columns added below are written to an independent frame.
    df_model = df_model.dropna().copy()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    # ===============================
    # COMPUTE MODIFIED RATING
    # ===============================
    df_model = compute_weighted_rating(df_model)

    # ===============================
    # CHECK CLASS DISTRIBUTION
    # ===============================
    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # ===============================
    # BALANCE DATA (CONTROLLED)
    # ===============================
    max_size = max(class_counts.values())

    balanced_dfs = []
    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # ===============================
    # FEATURES & LABELS
    # ===============================
    # Additional discriminative features
    df_balanced['rating_diff'] = df_balanced['overall_rating'] - df_balanced['weighted_rating']

    X = df_balanced[
        ['overall_rating', 'weighted_rating', 'rating_diff']
    ].values

    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Fit the scaler before branching: both evaluation paths and the final
    # model then see the same scaled features, and the scaler saved at the
    # end always exists (it used to be undefined on the no-CV path).
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # ===============================
    # AUTO SAFE SPLITS
    # ===============================
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        print("\nClassification Report (Train Set):\n")
        # Score the training rows themselves; the label ids come from the
        # encoder, so this works for any number of classes.
        print(classification_report(
            y,
            model.predict(X),
            labels=list(range(len(le.classes_))),
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        # NOTE(review): classes are oversampled before the folds are drawn, so
        # copies of one original row can land on both the training and the
        # test side of a split. The figures printed below are therefore
        # optimistic; resampling inside each training fold would avoid this.
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))

    # ===============================
    # FINAL MODEL
    # ===============================
    from sklearn.ensemble import RandomForestClassifier

    final_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")
    joblib.dump(scaler, "sentiment_scaler.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist the recommendation artefacts.

    The numeric columns are parsed (currency symbol, commas and whitespace are
    stripped from ``product_price``), rows missing any critical value are
    dropped, and the following files are written with joblib to the current
    working directory:

    * ``products_data.pkl`` - the cleaned product rows
    * ``similarity_matrix.pkl`` - pairwise cosine similarity of the min-max
      scaled rating, sentiment score and price
    * ``scaler.pkl`` - the fitted ``MinMaxScaler``
    * ``product_embeddings.pkl`` - ``all-MiniLM-L6-v2`` embeddings of
      "<product_name> <category>", one row per product in the same order as
      ``products_data.pkl``
    * ``embedding_model.pkl`` - the ``SentenceTransformer`` instance

    Args:
        df: DataFrame with ``product_name``, ``product_price``, ``sentiment``,
            ``sentiment_score``, ``category`` and ``overall_rating`` columns.

    Returns:
        The cleaned DataFrame, with an extra ``text_for_embedding`` column
        that is not part of ``products_data.pkl``.

    Raises:
        ValueError: If no rows survive the cleaning step.
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df[['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']].copy()

    # Clean numeric columns
    df['product_price'] = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Drop missing critical values. The explicit copy makes the frame
    # independent, so the column added further down does not trigger
    # SettingWithCopyWarning.
    df = df.dropna(
        subset=['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    ).copy()
    if df.empty:
        # Fail with a readable message instead of an opaque scaler error.
        raise ValueError(
            "No products left after dropping rows with a missing name, price, "
            "sentiment score, rating or category."
        )

    # Train MinMax scaler for numeric features
    features = df[['overall_rating', 'sentiment_score', 'product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Save numeric similarity
    similarity_matrix = cosine_similarity(features_scaled)
    joblib.dump(df, "products_data.pkl")
    joblib.dump(similarity_matrix, "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Precompute embeddings for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)
    # Move to CPU and store as NumPy so the file loads without a GPU.
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC SIMILARITY; NUMERIC SCORE NOT USED IN RANKING)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the products whose embeddings are closest to a text query.

    Products are ranked purely by cosine similarity between the sentence
    embedding of ``user_input`` and the precomputed product embeddings.
    Reads ``products_data.pkl``, ``product_embeddings.pkl`` and
    ``embedding_model.pkl`` from the current working directory.

    Args:
        user_input: Free-text product name or description to search for.
        top_n: Maximum number of products to print.

    Returns:
        None. The recommendations are printed.

    Raises:
        ValueError: If the stored product table and embeddings do not have
            the same number of rows (artefacts from different runs).
    """
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Row i of the embeddings must describe row i of the table; a mismatch
    # would otherwise print the wrong product for a score.
    if len(df) != len(embeddings):
        raise ValueError(
            f"products_data.pkl has {len(df)} rows but product_embeddings.pkl "
            f"has {len(embeddings)}; re-run prepare_recommendation_data."
        )

    # Encode straight to NumPy: a tensor that lives on a GPU cannot be passed
    # to scikit-learn. A one-element list yields the (1, dim) shape directly.
    query_embedding = model_embed.encode([user_input], convert_to_numpy=True)
    semantic_scores = cosine_similarity(query_embedding, embeddings)[0]

    # sorted() is stable, so equally scored products keep their table order.
    ranked = sorted(
        range(len(df)),
        key=lambda i: semantic_scores[i],
        reverse=True,
    )[:top_n]

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked:
        p = df.iloc[idx]
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {round(semantic_scores[idx], 4)}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    # Folder holding the scraped CSV files for this product vertical.
    DATA_FOLDER = r"/content/electronics"

    # Load data
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that "exit " or " exit" still terminates the loop.
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # Nothing typed: ask again instead of searching for an empty string.
            continue
        recommend_products(name)


Loaded 5 files with 500 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 457, 'neutral': 42, 'negative': 1})

Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       457
     neutral       1.00      1.00      1.00       457
    positive       1.00      1.00      1.00       457

    accuracy                           1.00      1371
   macro avg       1.00      1.00      1.00      1371
weighted avg       1.00      1.00      1.00      1371

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#nonmodified(electronics)
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Load every ``*.csv`` file in a folder into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    is the same on every run; ``glob`` on its own returns paths in arbitrary
    order.

    Args:
        folder_path: Directory that directly contains the CSV files.

    Returns:
        A DataFrame with the rows of all files concatenated under a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` files.
    """
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not all_files:
        # FileNotFoundError is a subclass of Exception, so callers that
        # caught the previous generic Exception keep working.
        raise FileNotFoundError("No CSV files found in the folder!")

    df_list = [pd.read_csv(path) for path in all_files]
    df = pd.concat(df_list, ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from ratings and add a numeric score.

    Rows whose ``sentiment`` is missing receive a label derived from
    ``overall_rating``: ``>= 4.0`` is positive, ``>= 3.0`` is neutral, lower
    is negative, and ``'na'`` is used when the rating is missing or cannot be
    parsed as a number. Labels are then stripped and lower-cased, and
    ``sentiment_score`` is set to 0 / 1 / 2 for negative / neutral / positive
    (any other label becomes NaN).

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.

    Returns:
        The same DataFrame with ``sentiment`` normalised and a
        ``sentiment_score`` column added.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    # Ratings may arrive as strings (e.g. "4.2"); comparing those with a float
    # raises TypeError, so the fallback is derived from a numeric view. The
    # stored overall_rating column itself is left untouched.
    ratings = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Fill missing sentiment based on overall_rating
    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        if r >= 4.0:
            return 'positive'
        if r >= 3.0:
            return 'neutral'
        return 'negative'

    fallback = ratings.apply(rating_to_sentiment)
    df['sentiment'] = df['sentiment'].fillna(fallback)
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """
    Add a ``weighted_rating`` column that pulls each rating toward the mean.

    The value is ``(rating * 1 + C * m) / (1 + m)``, i.e. the usual
    ``(v*R + m*C) / (v + m)`` form with the vote count ``v`` fixed at 1,
    where ``C`` is the mean of ``overall_rating``.

    Args:
        df: DataFrame with a numeric ``overall_rating`` column. It is
            modified in place.
        m: Weight given to the mean. When ``None``, the 25th percentile of
            ``overall_rating`` is used.

    Returns:
        The same DataFrame, now containing ``weighted_rating``.
    """
    mean_rating = df['overall_rating'].mean()
    prior_weight = df['overall_rating'].quantile(0.25) if m is None else m

    blended = df['overall_rating'] * 1.0 + mean_rating * prior_weight
    df['weighted_rating'] = blended / (1 + prior_weight)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Train and persist a sentiment classifier on the weighted rating.

    The rating/sentiment pairs are cleaned, every class is oversampled (with
    replacement) to the size of the largest one, a logistic regression is
    evaluated with stratified k-fold CV (or on the training set when a class
    has fewer than two rows), and an MLP is then fitted on all balanced rows.
    ``sentiment_model.pkl`` and ``label_encoder.pkl`` are written with joblib.

    Args:
        df: DataFrame containing ``overall_rating`` and ``sentiment``.

    Returns:
        Tuple ``(final_model, le)``: the fitted ``MLPClassifier`` and the
        ``LabelEncoder`` that maps sentiment strings to class ids.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # Only the rating and its label are needed; non-numeric ratings become
    # NaN and are dropped together with rows lacking a label.
    rated = df[['overall_rating', 'sentiment']].copy()
    rated['overall_rating'] = pd.to_numeric(rated['overall_rating'], errors='coerce')
    rated = compute_weighted_rating(rated.dropna())

    class_counts = Counter(rated['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # Oversample every smaller class up to the size of the largest one.
    target_size = max(class_counts.values())

    def _upsample(rows):
        if len(rows) >= target_size:
            return rows
        return resample(rows, replace=True, n_samples=target_size, random_state=42)

    df_balanced = pd.concat(
        [_upsample(rated[rated['sentiment'] == label]) for label in class_counts]
    )

    # Single feature: the weighted rating. Labels are integer-encoded.
    X = df_balanced[['weighted_rating']].values
    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    def _new_evaluator():
        return LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )

    # At most five folds, never more than the smallest class can fill.
    n_splits = min(5, min(Counter(y).values()))

    if n_splits >= 2:
        # NOTE(review): oversampling happens before the folds are drawn, so
        # copies of one original row can appear on both sides of a split and
        # the scores printed here are optimistic.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        y_true, y_pred = [], []
        for train_idx, test_idx in skf.split(X, y):
            fold_model = _new_evaluator()
            fold_model.fit(X[train_idx], y[train_idx])
            y_pred.extend(fold_model.predict(X[test_idx]))
            y_true.extend(y[test_idx])

        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            target_names=le.classes_,
            zero_division=0
        ))
    else:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        train_model = _new_evaluator()
        train_model.fit(X, y)

        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y,
            train_model.predict(X),
            target_names=le.classes_,
            zero_division=0
        ))

    # The persisted model is a small MLP fitted on every balanced row.
    from sklearn.neural_network import MLPClassifier

    final_model = MLPClassifier(
        hidden_layer_sizes=(32,16),
        activation='relu',
        max_iter=500,
        random_state=42
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Clean the product table and persist every artifact the recommender loads.

    The function keeps the recommendation columns, converts price and rating
    to numbers, drops rows missing any critical value, and then writes the
    following pickles into the current working directory:

    * ``products_data.pkl``      - the cleaned product table
    * ``similarity_matrix.pkl``  - pairwise cosine similarity of the scaled
      numeric features
    * ``scaler.pkl``             - the fitted ``MinMaxScaler``
    * ``nn_model.pkl``           - a cosine ``NearestNeighbors`` index
    * ``product_embeddings.pkl`` - sentence embeddings of name + category
    * ``embedding_model.pkl``    - the ``SentenceTransformer`` itself

    Args:
        df: DataFrame with ``product_name``, ``product_price``, ``sentiment``,
            ``sentiment_score``, ``category`` and ``overall_rating`` columns.

    Returns:
        The cleaned DataFrame, with an added ``text_for_embedding`` column.
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    df = df[['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']].copy()

    # Clean numeric columns: strip the rupee sign, thousands separators and
    # whitespace from prices before parsing; unparseable values become NaN.
    df['product_price'] = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(df['product_price'], errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Drop missing critical values
    df = df.dropna(subset=['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category'])

    # Train MinMax scaler for numeric features
    features = df[['overall_rating','sentiment_score','product_price']]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Persist the cleaned table and numeric similarity. recommend_products
    # reads products_data.pkl and similarity_matrix.pkl by position, so they
    # must be written from the same rows as the embeddings below; otherwise a
    # stale file from an earlier run is silently paired with new embeddings.
    joblib.dump(df, "products_data.pkl")
    joblib.dump(cosine_similarity(features_scaled), "similarity_matrix.pkl")
    joblib.dump(scaler, "scaler.pkl")

    # Nearest-neighbour index over the same scaled numeric features
    from sklearn.neighbors import NearestNeighbors

    nn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
    nn_model.fit(features_scaled)
    joblib.dump(nn_model, "nn_model.pkl")

    # Precompute embeddings for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    embeddings = model_embed.encode(df['text_for_embedding'].tolist(), convert_to_tensor=True)
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC + NUMERIC)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the products whose name/category text best matches ``user_input``.

    Loads ``products_data.pkl``, ``product_embeddings.pkl`` and
    ``embedding_model.pkl`` from the current working directory, embeds the
    query with the pickled model, and ranks products by cosine similarity
    between the query embedding and each stored product embedding.

    Args:
        user_input: Free-text query, e.g. a product name.
        top_n: Maximum number of products to print.

    Returns:
        None. Results are written to stdout.
    """
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Encode as a NumPy batch of one: scikit-learn cannot consume a tensor
    # that lives on a GPU, and a list input already yields the 2-D shape
    # cosine_similarity expects.
    query_embedding = model_embed.encode([user_input], convert_to_numpy=True)
    semantic_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Rank product positions by semantic similarity, best first.
    ranked = sorted(range(len(df)), key=lambda i: semantic_scores[i], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked[:top_n]:
        p = df.iloc[idx]
        # Convert to a Python float before rounding; rounding the NumPy
        # float32 directly prints float noise such as 0.42089998722076416.
        score = round(float(semantic_scores[idx]), 4)
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {score}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    DATA_FOLDER = r"/content/electronics"

    # Load data and derive the sentiment / sentiment_score columns
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model (artifacts are pickled to the working directory)
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that stray whitespace around "exit" still ends the session
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # A blank line carries no query; ask again instead of ranking noise
            continue
        recommend_products(name)


Loaded 5 files with 500 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 457, 'neutral': 42, 'negative': 1})

Accuracy: 0.7002

Classification Report:

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       457
     neutral       1.00      0.10      0.18       457
    positive       0.53      1.00      0.69       457

    accuracy                           0.70      1371
   macro avg       0.84      0.70      0.62      1371
weighted avg       0.84      0.70      0.62      1371





✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): tv

✅ Recommended products similar to 'tv':

Product: Add to Compare
BESTON 80 cm (32 inch) HD Ready LED Smart Android TV 2025 Edition
3.94,925 Ratings & 616 Reviews
HD Ready | LED
Model ID: BS32HD1
Launch Year: 2025
Total Sound Output: 20 W
2 Year warranty on Product , Onsite Warranty on Product and No Delivery Damages Covered
₹7,359
₹17,99959% off
Hot Deal
Upto 
₹1,900
 Off on Exchange
Category: Electronics
Rating: 3.9 | Sentiment: neutral | Price: ₹7359
Semantic Similarity Score: 0.42089998722076416
------------------------------------------------------------
Product: Add to Compare
iFFALCON by TCL S55 80 cm (32 inch) HD Ready LED Smart Google TV 2025 Edition with HDR 10 | 16W Dolby ...
4.16,693 Ratings & 443 Reviews
HD Ready | LED
Model ID: 32S55
Launch Year: 2025
Total Sound Output: 16 W
1 Year Product Warranty
₹8,799
₹19,99055% off
Upto 
₹1,900
 Off on Exchange
Bank Offer
Category: El

In [None]:
#modified code(footwear)
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Read every ``*.csv`` directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order. ``glob`` returns matches in arbitrary
    order, so without sorting the row order of the result (and everything
    seeded downstream from it) could differ between runs or machines.

    Args:
        folder_path: Directory to search; sub-directories are not searched.

    Returns:
        A DataFrame holding the rows of all files, re-indexed from 0.

    Raises:
        FileNotFoundError: If the folder contains no CSV files.
    """
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not all_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat((pd.read_csv(path) for path in all_files), ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from ratings and add a numeric score.

    Rows whose ``sentiment`` is missing get a label derived from
    ``overall_rating`` (>= 4.0 positive, >= 3.0 neutral, otherwise negative,
    ``'na'`` when the rating is missing or not numeric). Labels are then
    stripped and lower-cased, and mapped to ``sentiment_score``
    (negative=0, neutral=1, positive=2; any other label maps to NaN).

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.

    Returns:
        The same DataFrame with ``sentiment`` normalised and a new
        ``sentiment_score`` column.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    # Fill missing sentiment based on overall_rating
    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Coerce first: ratings read from CSV can arrive as strings, and comparing
    # a str with a float raises TypeError. The column itself is left untouched.
    ratings = pd.to_numeric(df['overall_rating'], errors='coerce')

    df['sentiment'] = df['sentiment'].fillna(ratings.apply(rating_to_sentiment))
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """Add a ``weighted_rating`` column that blends each rating with the mean.

    Each value is ``(rating + C * m) / (1 + m)``, where ``C`` is the mean of
    ``overall_rating`` and ``m`` is the weight given to that mean. When ``m``
    is not supplied, the 25th percentile of ``overall_rating`` is used.

    The frame is modified in place and also returned.
    """
    ratings = df['overall_rating']
    prior_mean = ratings.mean()
    prior_weight = ratings.quantile(0.25) if m is None else m

    blended = ratings * 1.0 + prior_mean * prior_weight
    df['weighted_rating'] = blended / (1 + prior_weight)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Train, evaluate and persist a sentiment classifier built on ratings.

    Pipeline:
      1. Keep rows with a numeric ``overall_rating`` and a ``sentiment`` label.
      2. Derive the ``weighted_rating`` and ``rating_diff`` features.
      3. Oversample every class (with replacement) up to the largest class.
      4. Min-max scale the features.
      5. Report logistic-regression scores from stratified k-fold CV, or from
         the training set when a class is too small to split.
      6. Fit a random forest on all balanced data and pickle it, the label
         encoder and the scaler into the current working directory.

    Args:
        df: DataFrame containing ``overall_rating`` and ``sentiment`` columns.

    Returns:
        ``(final_model, le)``: the fitted random forest and the fitted
        ``LabelEncoder``.

    Raises:
        ValueError: If no row has both a numeric rating and a sentiment label.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # ===============================
    # SELECT REQUIRED COLUMNS
    # ===============================
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    # ===============================
    # COMPUTE MODIFIED RATING
    # ===============================
    df_model = compute_weighted_rating(df_model)

    # ===============================
    # CHECK CLASS DISTRIBUTION
    # ===============================
    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # ===============================
    # BALANCE DATA (CONTROLLED)
    # ===============================
    # NOTE(review): oversampling happens before the CV split below, so copies
    # of the same row can land in both the training and the test fold; the
    # reported CV scores are therefore optimistic.
    max_size = max(class_counts.values())

    balanced_dfs = []
    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # ===============================
    # FEATURES & LABELS
    # ===============================
    df_balanced['rating_diff'] = df_balanced['overall_rating'] - df_balanced['weighted_rating']

    X = df_balanced[
        ['overall_rating', 'weighted_rating', 'rating_diff']
    ].values

    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Scale before branching so that `scaler` exists (it is pickled below)
    # and the final model sees scaled features on both evaluation paths.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Encoded ids of every class, so the report rows always line up with
    # `le.classes_` regardless of how many classes the data contains.
    label_ids = list(range(len(le.classes_)))

    # ===============================
    # AUTO SAFE SPLITS
    # ===============================
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        # No held-out data on this path: score the model on what it was fit on.
        y_true = y
        y_pred = model.predict(X)

        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=label_ids,
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=label_ids,
            target_names=le.classes_,
            zero_division=0
        ))

    # ===============================
    # FINAL MODEL
    # ===============================
    # NOTE(review): the scores printed above come from LogisticRegression,
    # while the model that is saved is a RandomForestClassifier.
    from sklearn.ensemble import RandomForestClassifier

    final_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")
    joblib.dump(scaler, "sentiment_scaler.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Build and pickle the artifacts used by ``recommend_products``.

    Keeps the recommendation columns, parses price and rating as numbers,
    drops rows missing a critical value, then writes to the working directory:
    ``products_data.pkl`` (cleaned table), ``similarity_matrix.pkl`` (cosine
    similarity of the min-max scaled numeric features), ``scaler.pkl``,
    ``product_embeddings.pkl`` and ``embedding_model.pkl``.

    Returns the cleaned DataFrame, which additionally carries the
    ``text_for_embedding`` column (the pickled table does not).
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    kept_columns = ['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']
    required_columns = ['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    numeric_columns = ['overall_rating','sentiment_score','product_price']

    df = df[kept_columns].copy()

    # Prices carry a currency sign, thousands separators and whitespace
    price_text = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(price_text, errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Rows lacking any critical value cannot be scored or displayed
    df = df.dropna(subset=required_columns)

    # Scale the numeric features to [0, 1] and derive their pairwise similarity
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(df[numeric_columns])

    numeric_artifacts = (
        (df, "products_data.pkl"),
        (cosine_similarity(features_scaled), "similarity_matrix.pkl"),
        (scaler, "scaler.pkl"),
    )
    for artifact, filename in numeric_artifacts:
        joblib.dump(artifact, filename)

    # Sentence embeddings of "<name> <category>" for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    texts = df['text_for_embedding'].tolist()
    embeddings = model_embed.encode(texts, convert_to_tensor=True)
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC + NUMERIC)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the products whose name/category text best matches ``user_input``.

    Loads ``products_data.pkl``, ``product_embeddings.pkl`` and
    ``embedding_model.pkl`` from the current working directory, embeds the
    query with the pickled model, and ranks products by cosine similarity
    between the query embedding and each stored product embedding.

    Args:
        user_input: Free-text query, e.g. a product name.
        top_n: Maximum number of products to print.

    Returns:
        None. Results are written to stdout.
    """
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Encode as a NumPy batch of one: scikit-learn cannot consume a tensor
    # that lives on a GPU, and a list input already yields the 2-D shape
    # cosine_similarity expects.
    query_embedding = model_embed.encode([user_input], convert_to_numpy=True)
    semantic_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Rank product positions by semantic similarity, best first.
    ranked = sorted(range(len(df)), key=lambda i: semantic_scores[i], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked[:top_n]:
        p = df.iloc[idx]
        # Convert to a Python float before rounding; rounding the NumPy
        # float32 directly prints float noise instead of four decimals.
        score = round(float(semantic_scores[idx]), 4)
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {score}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    DATA_FOLDER = r"/content/footwear"

    # Load data and derive the sentiment / sentiment_score columns
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model (artifacts are pickled to the working directory)
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that stray whitespace around "exit" still ends the session
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # A blank line carries no query; ask again instead of ranking noise
            continue
        recommend_products(name)


Loaded 2 files with 271 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 167, 'neutral': 97, 'negative': 3})

Accuracy: 0.8663

Classification Report:

              precision    recall  f1-score   support

    negative       0.95      1.00      0.97       167
     neutral       1.00      0.60      0.75       167
    positive       0.74      1.00      0.85       167

    accuracy                           0.87       501
   macro avg       0.90      0.87      0.86       501
weighted avg       0.90      0.87      0.86       501

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit


In [None]:
#modified code(wearables)
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# ===============================
# LOAD DATA
# ===============================
def load_datasets_from_folder(folder_path):
    """Read every ``*.csv`` directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order. ``glob`` returns matches in arbitrary
    order, so without sorting the row order of the result (and everything
    seeded downstream from it) could differ between runs or machines.

    Args:
        folder_path: Directory to search; sub-directories are not searched.

    Returns:
        A DataFrame holding the rows of all files, re-indexed from 0.

    Raises:
        FileNotFoundError: If the folder contains no CSV files.
    """
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not all_files:
        raise FileNotFoundError("No CSV files found in the folder!")

    df = pd.concat((pd.read_csv(path) for path in all_files), ignore_index=True)
    print(f"Loaded {len(all_files)} files with {len(df)} records.")
    return df

# ===============================
# FILL AND ENCODE SENTIMENT
# ===============================
def fill_and_encode_sentiment(df):
    """Fill missing sentiment labels from ratings and add a numeric score.

    Rows whose ``sentiment`` is missing get a label derived from
    ``overall_rating`` (>= 4.0 positive, >= 3.0 neutral, otherwise negative,
    ``'na'`` when the rating is missing or not numeric). Labels are then
    stripped and lower-cased, and mapped to ``sentiment_score``
    (negative=0, neutral=1, positive=2; any other label maps to NaN).

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``sentiment`` and ``overall_rating`` columns.

    Returns:
        The same DataFrame with ``sentiment`` normalised and a new
        ``sentiment_score`` column.
    """
    print("\n--- PROCESSING SENTIMENT ---")

    # Fill missing sentiment based on overall_rating
    def rating_to_sentiment(r):
        if pd.isna(r):
            return 'na'
        elif r >= 4.0:
            return 'positive'
        elif r >= 3.0:
            return 'neutral'
        else:
            return 'negative'

    # Coerce first: ratings read from CSV can arrive as strings, and comparing
    # a str with a float raises TypeError. The column itself is left untouched.
    ratings = pd.to_numeric(df['overall_rating'], errors='coerce')

    df['sentiment'] = df['sentiment'].fillna(ratings.apply(rating_to_sentiment))
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()

    # Map to numeric score for recommendation
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment_score'] = df['sentiment'].map(sentiment_map)

    return df

def compute_weighted_rating(df, m=None):
    """Add a ``weighted_rating`` column that blends each rating with the mean.

    Each value is ``(rating + C * m) / (1 + m)``, where ``C`` is the mean of
    ``overall_rating`` and ``m`` is the weight given to that mean. When ``m``
    is not supplied, the 25th percentile of ``overall_rating`` is used.

    The frame is modified in place and also returned.
    """
    ratings = df['overall_rating']
    prior_mean = ratings.mean()
    prior_weight = ratings.quantile(0.25) if m is None else m

    blended = ratings * 1.0 + prior_mean * prior_weight
    df['weighted_rating'] = blended / (1 + prior_weight)
    return df

# ===============================
# TRAIN SENTIMENT MODEL
# ===============================
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from collections import Counter

def train_sentiment_model(df):
    """Train, evaluate and persist a sentiment classifier built on ratings.

    Pipeline:
      1. Keep rows with a numeric ``overall_rating`` and a ``sentiment`` label.
      2. Derive the ``weighted_rating`` and ``rating_diff`` features.
      3. Oversample every class (with replacement) up to the largest class.
      4. Min-max scale the features.
      5. Report logistic-regression scores from stratified k-fold CV, or from
         the training set when a class is too small to split.
      6. Fit a random forest on all balanced data and pickle it, the label
         encoder and the scaler into the current working directory.

    Args:
        df: DataFrame containing ``overall_rating`` and ``sentiment`` columns.

    Returns:
        ``(final_model, le)``: the fitted random forest and the fitted
        ``LabelEncoder``.

    Raises:
        ValueError: If no row has both a numeric rating and a sentiment label.
    """
    print("\n--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---")

    # ===============================
    # SELECT REQUIRED COLUMNS
    # ===============================
    df_model = df[['overall_rating', 'sentiment']].copy()
    df_model['overall_rating'] = pd.to_numeric(
        df_model['overall_rating'], errors='coerce'
    )
    df_model = df_model.dropna()
    if df_model.empty:
        raise ValueError(
            "No rows with both a numeric overall_rating and a sentiment label."
        )

    # ===============================
    # COMPUTE MODIFIED RATING
    # ===============================
    df_model = compute_weighted_rating(df_model)

    # ===============================
    # CHECK CLASS DISTRIBUTION
    # ===============================
    class_counts = Counter(df_model['sentiment'])
    print("Class distribution before balancing:", class_counts)

    # ===============================
    # BALANCE DATA (CONTROLLED)
    # ===============================
    # NOTE(review): oversampling happens before the CV split below, so copies
    # of the same row can land in both the training and the test fold; the
    # reported CV scores are therefore optimistic.
    max_size = max(class_counts.values())

    balanced_dfs = []
    for label in class_counts:
        subset = df_model[df_model['sentiment'] == label]

        if len(subset) < max_size:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_size,
                random_state=42
            )

        balanced_dfs.append(subset)

    df_balanced = pd.concat(balanced_dfs)

    # ===============================
    # FEATURES & LABELS
    # ===============================
    df_balanced['rating_diff'] = df_balanced['overall_rating'] - df_balanced['weighted_rating']

    X = df_balanced[
        ['overall_rating', 'weighted_rating', 'rating_diff']
    ].values

    le = LabelEncoder()
    y = le.fit_transform(df_balanced['sentiment'])

    # Scale before branching so that `scaler` exists (it is pickled below)
    # and the final model sees scaled features on both evaluation paths.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Encoded ids of every class, so the report rows always line up with
    # `le.classes_` regardless of how many classes the data contains.
    label_ids = list(range(len(le.classes_)))

    # ===============================
    # AUTO SAFE SPLITS
    # ===============================
    min_class_size = min(Counter(y).values())
    n_splits = min(5, min_class_size)

    if n_splits < 2:
        print("⚠️ Too few samples for CV. Training without cross-validation.")

        model = LogisticRegression(
            max_iter=1000,
            solver='liblinear',
            class_weight='balanced'
        )
        model.fit(X, y)

        # No held-out data on this path: score the model on what it was fit on.
        y_true = y
        y_pred = model.predict(X)

        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report (Train Set):\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=label_ids,
            target_names=le.classes_,
            zero_division=0
        ))

    else:
        skf = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=42
        )

        y_true, y_pred = [], []

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = LogisticRegression(
                max_iter=1000,
                solver='liblinear',
                class_weight='balanced'
            )
            model.fit(X_train, y_train)

            y_pred.extend(model.predict(X_test))
            y_true.extend(y_test)
        print("\nAccuracy:", round(accuracy_score(y_true, y_pred), 4))
        print("\nClassification Report:\n")
        print(classification_report(
            y_true,
            y_pred,
            labels=label_ids,
            target_names=le.classes_,
            zero_division=0
        ))

    # ===============================
    # FINAL MODEL
    # ===============================
    # NOTE(review): the scores printed above come from LogisticRegression,
    # while the model that is saved is a RandomForestClassifier.
    from sklearn.ensemble import RandomForestClassifier

    final_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42
    )
    final_model.fit(X, y)

    joblib.dump(final_model, "sentiment_model.pkl")
    joblib.dump(le, "label_encoder.pkl")
    joblib.dump(scaler, "sentiment_scaler.pkl")

    print("✅ Sentiment model trained safely with class balancing.")
    return final_model, le

# ===============================
# PREPARE DATA FOR RECOMMENDATION
# ===============================
def prepare_recommendation_data(df):
    """Build and pickle the artifacts used by ``recommend_products``.

    Keeps the recommendation columns, parses price and rating as numbers,
    drops rows missing a critical value, then writes to the working directory:
    ``products_data.pkl`` (cleaned table), ``similarity_matrix.pkl`` (cosine
    similarity of the min-max scaled numeric features), ``scaler.pkl``,
    ``product_embeddings.pkl`` and ``embedding_model.pkl``.

    Returns the cleaned DataFrame, which additionally carries the
    ``text_for_embedding`` column (the pickled table does not).
    """
    print("\n--- PREPARING RECOMMENDATION DATA ---")

    kept_columns = ['product_name', 'product_price', 'sentiment', 'sentiment_score', 'category', 'overall_rating']
    required_columns = ['product_name', 'product_price', 'sentiment_score', 'overall_rating', 'category']
    numeric_columns = ['overall_rating','sentiment_score','product_price']

    df = df[kept_columns].copy()

    # Prices carry a currency sign, thousands separators and whitespace
    price_text = df['product_price'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
    df['product_price'] = pd.to_numeric(price_text, errors='coerce')
    df['overall_rating'] = pd.to_numeric(df['overall_rating'], errors='coerce')

    # Rows lacking any critical value cannot be scored or displayed
    df = df.dropna(subset=required_columns)

    # Scale the numeric features to [0, 1] and derive their pairwise similarity
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(df[numeric_columns])

    numeric_artifacts = (
        (df, "products_data.pkl"),
        (cosine_similarity(features_scaled), "similarity_matrix.pkl"),
        (scaler, "scaler.pkl"),
    )
    for artifact, filename in numeric_artifacts:
        joblib.dump(artifact, filename)

    # Sentence embeddings of "<name> <category>" for semantic search
    model_embed = SentenceTransformer('all-MiniLM-L6-v2')
    df['text_for_embedding'] = df['product_name'].astype(str) + " " + df['category'].astype(str)
    texts = df['text_for_embedding'].tolist()
    embeddings = model_embed.encode(texts, convert_to_tensor=True)
    joblib.dump(embeddings.cpu().numpy(), "product_embeddings.pkl")
    joblib.dump(model_embed, "embedding_model.pkl")

    print("Recommendation data and embeddings saved.")
    return df

# ===============================
# RECOMMEND PRODUCTS (SEMANTIC + NUMERIC)
# ===============================
def recommend_products(user_input, top_n=5):
    """Print the products whose name/category text best matches ``user_input``.

    Loads ``products_data.pkl``, ``product_embeddings.pkl`` and
    ``embedding_model.pkl`` from the current working directory, embeds the
    query with the pickled model, and ranks products by cosine similarity
    between the query embedding and each stored product embedding.

    Args:
        user_input: Free-text query, e.g. a product name.
        top_n: Maximum number of products to print.

    Returns:
        None. Results are written to stdout.
    """
    df = joblib.load("products_data.pkl")
    embeddings = joblib.load("product_embeddings.pkl")
    model_embed = joblib.load("embedding_model.pkl")

    # Encode as a NumPy batch of one: scikit-learn cannot consume a tensor
    # that lives on a GPU, and a list input already yields the 2-D shape
    # cosine_similarity expects.
    query_embedding = model_embed.encode([user_input], convert_to_numpy=True)
    semantic_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Rank product positions by semantic similarity, best first.
    ranked = sorted(range(len(df)), key=lambda i: semantic_scores[i], reverse=True)

    print(f"\n✅ Recommended products similar to '{user_input}':\n")
    for idx in ranked[:top_n]:
        p = df.iloc[idx]
        # Convert to a Python float before rounding; rounding the NumPy
        # float32 directly prints float noise instead of four decimals.
        score = round(float(semantic_scores[idx]), 4)
        print(f"Product: {p['product_name']}")
        print(f"Category: {p['category']}")
        print(f"Rating: {p['overall_rating']} | Sentiment: {p['sentiment']} | Price: ₹{p['product_price']}")
        print(f"Semantic Similarity Score: {score}")
        print("-"*60)

# ===============================
# MAIN
# ===============================
if __name__ == "__main__":
    DATA_FOLDER = r"/content/Wearables"

    # Load data and derive the sentiment / sentiment_score columns
    df = load_datasets_from_folder(DATA_FOLDER)
    df = fill_and_encode_sentiment(df)

    # Train sentiment model (artifacts are pickled to the working directory)
    train_sentiment_model(df)

    # Prepare recommendation data (numeric + embeddings)
    prepare_recommendation_data(df)

    # Interactive loop
    while True:
        # Strip so that stray whitespace around "exit" still ends the session
        name = input("\nEnter product name for recommendation (or type exit): ").strip()
        if name.lower() == "exit":
            break
        if not name:
            # A blank line carries no query; ask again instead of ranking noise
            continue
        recommend_products(name)


Loaded 2 files with 150 records.

--- PROCESSING SENTIMENT ---

--- TRAINING SENTIMENT MODEL (AUTO-BALANCED & SAFE CV) ---
Class distribution before balancing: Counter({'positive': 83, 'neutral': 48, 'negative': 6})

Accuracy: 0.7631

Classification Report:

              precision    recall  f1-score   support

    negative       0.99      1.00      0.99        83
     neutral       1.00      0.29      0.45        83
    positive       0.59      1.00      0.74        83

    accuracy                           0.76       249
   macro avg       0.86      0.76      0.73       249
weighted avg       0.86      0.76      0.73       249

✅ Sentiment model trained safely with class balancing.

--- PREPARING RECOMMENDATION DATA ---


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Recommendation data and embeddings saved.

Enter product name for recommendation (or type exit): exit
