<a href="https://colab.research.google.com/github/yjyuwisely/Bigdata-project/blob/main/CSCI946_Final_Naive_Bayes_%26_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CSCI946 Assignment 1 - Task 3: Classification (Modified)
# Naive Bayes and Logistic Regression for Category Classification

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# ============================================================================
# TASK 3: CLASSIFICATION - PROPER APPROACH
# ============================================================================

def prepare_classification_data(df):
    """
    Build the feature matrix and target vector for category classification.

    Three derived columns are written onto ``df`` itself (``price_diff``,
    ``log_current_price`` and ``log_likes``). This in-place side effect is
    kept on purpose: the category/product scoring step later reads
    ``df['price_diff']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``current_price``, ``raw_price``, ``discount``,
        ``likes_count`` and ``category``.

    Returns
    -------
    X : pandas.DataFrame
        Copy of the four model features.
    y : pandas.Series
        Copy of the ``category`` column.
    feature_cols_extended : list of str
        Column names of ``X``, in order.

    Raises
    ------
    KeyError
        If a required column is missing. The check runs before ``df`` is
        modified, so a failed call leaves the caller's frame untouched.
    """
    print("=== Data Preparation for Classification ===")

    # Validate up front so that a missing column never leaves df half-mutated
    required_cols = ['current_price', 'raw_price', 'discount', 'likes_count', 'category']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Missing required columns: {missing_cols}")

    # Derived features (log1p(x) = log(1 + x), so zero prices/likes are valid)
    df['price_diff'] = df['raw_price'] - df['current_price']
    df['log_current_price'] = np.log1p(df['current_price'])
    df['log_likes'] = np.log1p(df['likes_count'])

    # Final feature set (numerical features only)
    feature_cols_extended = ['log_current_price', 'discount', 'log_likes', 'price_diff']

    # Copies, so downstream code cannot alter df through X or y
    X = df[feature_cols_extended].copy()
    y = df['category'].copy()

    print(f"Features selected: {feature_cols_extended}")
    print("Target variable: category")
    print(f"Number of samples: {len(X)}")
    print(f"Number of features: {X.shape[1]}")
    print("Category distribution:")
    print(y.value_counts())

    return X, y, feature_cols_extended

def perform_naive_bayes_classification(X_train, X_test, y_train, y_test):
    """
    Fit a Gaussian Naive Bayes classifier and report its held-out performance.

    Prints the accuracy, the per-class classification report and the
    confusion matrix for the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits
    y_train, y_test : matching target labels

    Returns
    -------
    tuple
        ``(fitted GaussianNB, test predictions, test accuracy)``.
    """
    print("\n=== NAIVE BAYES CLASSIFICATION ===")

    # Initialize and train Gaussian Naive Bayes
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train, y_train)

    # Make predictions
    y_pred_nb = nb_classifier.predict(X_test)

    # Calculate metrics
    accuracy_nb = accuracy_score(y_test, y_pred_nb)

    print("Naive Bayes Results:")
    print(f"Accuracy: {accuracy_nb:.4f}")
    print("\nDetailed Classification Report:")
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # raising UndefinedMetricWarning for each of them
    print(classification_report(y_test, y_pred_nb, zero_division=0))

    # Confusion Matrix
    cm_nb = confusion_matrix(y_test, y_pred_nb)
    print("\nConfusion Matrix:")
    print(cm_nb)

    return nb_classifier, y_pred_nb, accuracy_nb

def perform_logistic_regression_classification(X_train, X_test, y_train, y_test):
    """
    Fit a scaled Logistic Regression pipeline and report its held-out performance.

    The model is a ``Pipeline`` of ``StandardScaler`` followed by
    ``LogisticRegression`` (lbfgs solver, ``max_iter=2000``, ``random_state=42``).
    Prints the accuracy, the per-class classification report and the
    confusion matrix for the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits
    y_train, y_test : matching target labels

    Returns
    -------
    tuple
        ``(fitted Pipeline, test predictions, test accuracy)``.
    """
    print("\n=== LOGISTIC REGRESSION CLASSIFICATION ===")

    # Scaling lives inside the pipeline so it is fitted on training data only.
    # `multi_class` is deliberately not passed: it was deprecated in
    # scikit-learn 1.5 and is scheduled for removal, and the default already
    # fits a multinomial model for multi-class targets with lbfgs.
    lr_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(
            max_iter=2000,
            solver='lbfgs',
            random_state=42
        ))
    ])

    # Train the model
    lr_pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred_lr = lr_pipeline.predict(X_test)

    # Calculate metrics
    accuracy_lr = accuracy_score(y_test, y_pred_lr)

    print("Logistic Regression Results:")
    print(f"Accuracy: {accuracy_lr:.4f}")
    print("\nDetailed Classification Report:")
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # raising UndefinedMetricWarning for each of them
    print(classification_report(y_test, y_pred_lr, zero_division=0))

    # Confusion Matrix
    cm_lr = confusion_matrix(y_test, y_pred_lr)
    print("\nConfusion Matrix:")
    print(cm_lr)

    return lr_pipeline, y_pred_lr, accuracy_lr

def compare_classification_results(X, y, nb_model, lr_model, nb_accuracy, lr_accuracy):
    """
    Put the two classifiers side by side using 5-fold cross-validated accuracy.

    ``cross_val_score`` is run on the full ``X``/``y`` for each estimator; the
    held-out accuracies passed in are copied into the summary unchanged.

    Returns a DataFrame with one row per algorithm and the columns
    'Algorithm', 'Test Accuracy', 'CV Mean Accuracy' and 'CV Std Deviation'.
    """
    print("\n=== MODEL COMPARISON ===")

    labels = ['Naive Bayes', 'Logistic Regression']

    # Score both estimators first, then report, so the printed lines stay together
    fold_scores = [
        cross_val_score(estimator, X, y, cv=5, scoring='accuracy')
        for estimator in (nb_model, lr_model)
    ]
    cv_means = [scores.mean() for scores in fold_scores]
    cv_stds = [scores.std() for scores in fold_scores]

    for label, mean_acc, std_acc in zip(labels, cv_means, cv_stds):
        # The printed interval is two standard deviations either side of the mean
        print(f"{label} Cross-Validation Accuracy: {mean_acc:.4f} (+/- {std_acc * 2:.4f})")

    comparison_df = pd.DataFrame({
        'Algorithm': labels,
        'Test Accuracy': [nb_accuracy, lr_accuracy],
        'CV Mean Accuracy': cv_means,
        'CV Std Deviation': cv_stds,
    })

    print("\nComparison Summary:")
    print(comparison_df.to_string(index=False))

    return comparison_df

def analyze_feature_importance(lr_model, feature_names, categories):
    """
    Tabulate the Logistic Regression coefficients per feature and category.

    ``lr_model`` must expose the fitted classifier as
    ``named_steps['classifier']``. Its ``coef_`` array is transposed so that
    rows are labelled with ``feature_names`` and columns with ``categories``.

    Returns the resulting DataFrame (unrounded; only the printout is rounded).
    """
    print("\n=== FEATURE IMPORTANCE ANALYSIS ===")

    classifier = lr_model.named_steps['classifier']

    # One row of coef_ per class -> flip to one row per feature
    weights_by_feature = classifier.coef_.transpose()
    feature_importance = pd.DataFrame(
        weights_by_feature, index=feature_names, columns=categories
    )

    print("Logistic Regression Coefficients by Category:")
    print(feature_importance.round(4))

    return feature_importance

def identify_best_category_and_products(df, nb_model, lr_model, X, y):
    """
    Rank categories and products with hand-weighted composite scores.

    Category score = 0.5 * (avg likes / max avg likes)
                   + 0.3 * (avg discount / 100)
                   + 0.2 * (avg price_diff / max avg price_diff)
    Product score  = 0.6 * (likes / max likes)
                   + 0.3 * (discount / 100)
                   + 0.1 * (price_diff / max price_diff)

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``category``, ``likes_count``, ``discount`` and
        ``current_price``, plus either ``price_diff`` or ``raw_price`` (from
        which ``price_diff`` is derived when the column is absent).
        A ``product_score`` column is added to ``df`` in place.
    nb_model, lr_model, X, y
        Not used by the scoring; accepted so existing callers keep working.

    Returns
    -------
    best_category
        Index label of the highest-scoring category.
    top_10_products : pandas.DataFrame
        The ten rows of ``df`` with the largest ``product_score``.
    category_ranking : pandas.DataFrame
        Per-category statistics indexed by category, sorted by
        ``category_score`` in descending order.
    """
    print("\n=== ASSIGNMENT OBJECTIVES: BEST CATEGORY & TOP PRODUCTS ===")

    def _share_of_max(values):
        # Scale by the maximum; a zero or missing maximum contributes 0
        # instead of turning every score into NaN/inf.
        peak = values.max()
        if pd.isna(peak) or peak == 0:
            return pd.Series(0.0, index=values.index)
        return values / peak

    # Do not rely on an earlier step having added price_diff to df
    if 'price_diff' in df.columns:
        price_diff = df['price_diff']
    else:
        price_diff = df['raw_price'] - df['current_price']
    scored = df.assign(price_diff=price_diff)

    # Calculate category performance metrics
    category_stats = scored.groupby('category').agg(
        avg_likes=('likes_count', 'mean'),
        std_likes=('likes_count', 'std'),
        product_count=('likes_count', 'count'),
        avg_discount=('discount', 'mean'),
        avg_price=('current_price', 'mean'),
        avg_price_diff=('price_diff', 'mean'),
    ).round(2)

    # Calculate composite category score
    category_stats['category_score'] = (
        0.5 * _share_of_max(category_stats['avg_likes']) +
        0.3 * (category_stats['avg_discount'] / 100) +
        0.2 * _share_of_max(category_stats['avg_price_diff'])
    )

    # Sort by category score
    category_ranking = category_stats.sort_values('category_score', ascending=False)

    print("Category Performance Ranking:")
    print(category_ranking[['avg_likes', 'avg_discount', 'category_score']])

    # Identify best category
    best_category = category_ranking.index[0]
    print(f"\nBest Category: {best_category}")

    # Score every product; kept on df for backward compatibility
    df['product_score'] = (
        0.6 * _share_of_max(df['likes_count']) +
        0.3 * (df['discount'] / 100) +
        0.1 * _share_of_max(price_diff)
    )

    top_10_products = df.nlargest(10, 'product_score')[
        ['category', 'current_price', 'likes_count', 'discount', 'product_score']
    ].round(3)

    print("\nTop 10 Products:")
    print(top_10_products.to_string())

    return best_category, top_10_products, category_ranking

In [None]:
# ============================================================================
# MAIN EXECUTION FUNCTION
# ============================================================================

def run_task3_classification(df):
    """
    Run the whole Task 3 workflow on ``df`` and collect the outputs.

    Steps: build features, make a stratified 70/30 split (seed 42), fit and
    evaluate Naive Bayes and Logistic Regression, cross-validate both, tabulate
    the Logistic Regression coefficients, then rank categories and products.

    Returns a dict with the keys 'nb_model', 'lr_model', 'nb_accuracy',
    'lr_accuracy', 'comparison_df', 'feature_importance', 'best_category',
    'top_10_products' and 'category_ranking'.
    """
    print("CSCI946 Assignment 1 - Task 3: Classification Analysis")
    print("=" * 60)

    # Features and target
    X, y, feature_names = prepare_classification_data(df)

    # Hold out 30% of the rows, keeping the class proportions
    split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split

    print("\nData split:")
    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Fit both classifiers; the raw predictions are not needed afterwards
    nb_model, _, nb_accuracy = perform_naive_bayes_classification(
        X_train, X_test, y_train, y_test
    )
    lr_model, _, lr_accuracy = perform_logistic_regression_classification(
        X_train, X_test, y_train, y_test
    )

    # Cross-validated comparison on the full data set
    comparison_results = compare_classification_results(
        X, y, nb_model, lr_model, nb_accuracy, lr_accuracy
    )

    # Coefficient table, with columns labelled by the sorted category names
    feature_importance = analyze_feature_importance(
        lr_model, feature_names, sorted(y.unique())
    )

    # Assignment objectives: best category and top products
    best_category, top_10_products, category_ranking = identify_best_category_and_products(
        df, nb_model, lr_model, X, y
    )

    return {
        'nb_model': nb_model,
        'lr_model': lr_model,
        'nb_accuracy': nb_accuracy,
        'lr_accuracy': lr_accuracy,
        'comparison_df': comparison_results,
        'feature_importance': feature_importance,
        'best_category': best_category,
        'top_10_products': top_10_products,
        'category_ranking': category_ranking,
    }

In [None]:
# ============================================================================
# USAGE EXAMPLE
# ============================================================================

# Load the preprocessed data produced by the previous tasks
df = pd.read_csv('cleaned_combined_dataset.csv')

# Run the classification analysis
results = run_task3_classification(df)

# Save the results for the report
results['comparison_df'].to_csv('classification_comparison.csv', index=False)
# NOTE(review): index=False drops the original row index, which is the only
# product identifier in this table - confirm that is intended.
results['top_10_products'].to_csv('top_10_products.csv', index=False)
# category_ranking is indexed by category name, so the index has to be written;
# with index=False the CSV would contain scores with no category labels.
results['category_ranking'].to_csv('category_ranking.csv')

CSCI946 Assignment 1 - Task 3: Classification Analysis
=== Data Preparation for Classification ===
Features selected: ['log_current_price', 'discount', 'log_likes', 'price_diff']
Target variable: category
Number of samples: 52733
Number of features: 4
Category distribution:
category
women      14809
house      12791
men        10208
bags        6268
jewelry     4853
beauty      3804
Name: count, dtype: int64

Data split:
Training set: 36913 samples
Test set: 15820 samples

=== NAIVE BAYES CLASSIFICATION ===
Naive Bayes Results:
Accuracy: 0.4335

Detailed Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        bags       0.22      0.05      0.08      1880
      beauty       0.27      0.01      0.02      1141
       house       0.44      0.50      0.47      3837
     jewelry       0.00      0.00      0.00      1456
         men       0.63      0.29      0.40      3063
       women       0.41      0.89      0.56      4443

    accuracy                           0.43     15820
   macro avg       0.33      0.29      0.25     15820
weighted avg       0.39      0.43      0.36     15820


Confusion Matrix:
[[  88    2  241    0  152 1397]
 [  26   11  511    0   76  517]
 [  85   17 1931    0  193 1611]
 [  30    4  871    0   36  515]
 [  83    6  487    0  890 1597]
 [  83    1  354    0   67 3938]]

=== LOGISTIC REGRESSION CLASSIFICATION ===




Logistic Regression Results:
Accuracy: 0.4573

Detailed Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        bags       0.23      0.02      0.04      1880
      beauty       0.00      0.00      0.00      1141
       house       0.41      0.54      0.47      3837
     jewelry       0.34      0.02      0.03      1456
         men       0.52      0.46      0.49      3063
       women       0.47      0.83      0.60      4443

    accuracy                           0.46     15820
   macro avg       0.33      0.31      0.27     15820
weighted avg       0.39      0.46      0.38     15820


Confusion Matrix:
[[  43    0  330    0  327 1180]
 [  33    0  616   10  148  334]
 [  42    0 2057   11  511 1216]
 [  21    0  991   25   38  381]
 [  21    0  565   15 1418 1044]
 [  24    0  424   13  290 3692]]

=== MODEL COMPARISON ===




Naive Bayes Cross-Validation Accuracy: 0.3987 (+/- 0.1198)
Logistic Regression Cross-Validation Accuracy: 0.4434 (+/- 0.0743)

Comparison Summary:
          Algorithm  Test Accuracy  CV Mean Accuracy  CV Std Deviation
        Naive Bayes       0.433502          0.398651          0.059882
Logistic Regression       0.457332          0.443366          0.037144

=== FEATURE IMPORTANCE ANALYSIS ===
Logistic Regression Coefficients by Category:
                     bags  beauty   house  jewelry     men   women
log_current_price  0.7217 -0.9830 -1.1979  -1.2537  0.4227  2.2902
discount           0.4523 -0.6659 -0.8046  -0.4648 -0.5436  2.0266
log_likes         -0.0038 -0.0949 -0.0026   0.0912 -0.1090  0.1191
price_diff        -0.4018  1.4420  1.4781   1.2802 -0.5509 -3.2477

=== ASSIGNMENT OBJECTIVES: BEST CATEGORY & TOP PRODUCTS ===
Category Performance Ranking:
          avg_likes  avg_discount  category_score
category                                         
women        238.17         55.

In [None]:
# CSCI946 Assignment 1 - Task 3: Classification Analysis
# ============================================================
# === Data Preparation for Classification ===
# Features selected: ['log_current_price', 'discount', 'log_likes', 'price_diff']
# Target variable: category
# Number of samples: 52733
# Number of features: 4
# Category distribution:
# category
# women      14809
# house      12791
# men        10208
# bags        6268
# jewelry     4853
# beauty      3804
# Name: count, dtype: int64

# Data split:
# Training set: 36913 samples
# Test set: 15820 samples

# === NAIVE BAYES CLASSIFICATION ===
# Naive Bayes Results:
# Accuracy: 0.4335

# Detailed Classification Report:
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
#               precision    recall  f1-score   support

#         bags       0.22      0.05      0.08      1880
#       beauty       0.27      0.01      0.02      1141
#        house       0.44      0.50      0.47      3837
#      jewelry       0.00      0.00      0.00      1456
#          men       0.63      0.29      0.40      3063
#        women       0.41      0.89      0.56      4443

#     accuracy                           0.43     15820
#    macro avg       0.33      0.29      0.25     15820
# weighted avg       0.39      0.43      0.36     15820


# Confusion Matrix:
# [[  88    2  241    0  152 1397]
#  [  26   11  511    0   76  517]
#  [  85   17 1931    0  193 1611]
#  [  30    4  871    0   36  515]
#  [  83    6  487    0  890 1597]
#  [  83    1  354    0   67 3938]]

# === LOGISTIC REGRESSION CLASSIFICATION ===
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# Logistic Regression Results:
# Accuracy: 0.4573

# Detailed Classification Report:
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
#               precision    recall  f1-score   support

#         bags       0.23      0.02      0.04      1880
#       beauty       0.00      0.00      0.00      1141
#        house       0.41      0.54      0.47      3837
#      jewelry       0.34      0.02      0.03      1456
#          men       0.52      0.46      0.49      3063
#        women       0.47      0.83      0.60      4443

#     accuracy                           0.46     15820
#    macro avg       0.33      0.31      0.27     15820
# weighted avg       0.39      0.46      0.38     15820


# Confusion Matrix:
# [[  43    0  330    0  327 1180]
#  [  33    0  616   10  148  334]
#  [  42    0 2057   11  511 1216]
#  [  21    0  991   25   38  381]
#  [  21    0  565   15 1418 1044]
#  [  24    0  424   13  290 3692]]

# === MODEL COMPARISON ===
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# Naive Bayes Cross-Validation Accuracy: 0.3987 (+/- 0.1198)
# Logistic Regression Cross-Validation Accuracy: 0.4434 (+/- 0.0743)

# Comparison Summary:
#           Algorithm  Test Accuracy  CV Mean Accuracy  CV Std Deviation
#         Naive Bayes       0.433502          0.398651          0.059882
# Logistic Regression       0.457332          0.443366          0.037144

# === FEATURE IMPORTANCE ANALYSIS ===
# Logistic Regression Coefficients by Category:
#                      bags  beauty   house  jewelry     men   women
# log_current_price  0.7217 -0.9830 -1.1979  -1.2537  0.4227  2.2902
# discount           0.4523 -0.6659 -0.8046  -0.4648 -0.5436  2.0266
# log_likes         -0.0038 -0.0949 -0.0026   0.0912 -0.1090  0.1191
# price_diff        -0.4018  1.4420  1.4781   1.2802 -0.5509 -3.2477

# === ASSIGNMENT OBJECTIVES: BEST CATEGORY & TOP PRODUCTS ===
# Category Performance Ranking:
#           avg_likes  avg_discount  category_score
# category
# women        238.17         55.16        0.860695
# bags         201.00         51.84        0.777488
# men          189.22         45.06        0.676679
# jewelry      166.15         53.13        0.628984
# beauty       150.27         50.44        0.628961
# house        164.48         49.07        0.627330

# Best Category: women

# Top 10 Products:
#       category  current_price  likes_count  discount  product_score
# 48216    women          19.99        21403        65          0.796
# 48191    women          27.99        17684        53          0.655
# 41673    women          29.99        17414        51          0.642
# 48864    women          25.99        10965        85          0.565
# 37934    women          13.99        14252        48          0.544
# 44311    women          15.35        11165        65          0.509
# 52115    women          38.84        12482        51          0.504
# 41176    women          12.99        12786        46          0.497
# 44416    women          29.99        11498        51          0.476
# 27721      men          20.99        11521        46          0.461

In [None]:
# Read the cleaned dataset from disk
df = pd.read_csv('cleaned_combined_dataset.csv')

# Execute the full Task 3 workflow and keep its outputs
results = run_task3_classification(df)

# Show the headline findings for the report
print("Best Category:", results['best_category'])
print("Top 10 Products:", results['top_10_products'], sep="\n")

CSCI946 Assignment 1 - Task 3: Classification Analysis
=== Data Preparation for Classification ===
Features selected: ['log_current_price', 'discount', 'log_likes', 'price_diff']
Target variable: category
Number of samples: 52733
Number of features: 4
Category distribution:
category
women      14809
house      12791
men        10208
bags        6268
jewelry     4853
beauty      3804
Name: count, dtype: int64

Data split:
Training set: 36913 samples
Test set: 15820 samples

=== NAIVE BAYES CLASSIFICATION ===
Naive Bayes Results:
Accuracy: 0.4335

Detailed Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        bags       0.22      0.05      0.08      1880
      beauty       0.27      0.01      0.02      1141
       house       0.44      0.50      0.47      3837
     jewelry       0.00      0.00      0.00      1456
         men       0.63      0.29      0.40      3063
       women       0.41      0.89      0.56      4443

    accuracy                           0.43     15820
   macro avg       0.33      0.29      0.25     15820
weighted avg       0.39      0.43      0.36     15820


Confusion Matrix:
[[  88    2  241    0  152 1397]
 [  26   11  511    0   76  517]
 [  85   17 1931    0  193 1611]
 [  30    4  871    0   36  515]
 [  83    6  487    0  890 1597]
 [  83    1  354    0   67 3938]]

=== LOGISTIC REGRESSION CLASSIFICATION ===




Logistic Regression Results:
Accuracy: 0.4573

Detailed Classification Report:
              precision    recall  f1-score   support

        bags       0.23      0.02      0.04      1880
      beauty       0.00      0.00      0.00      1141
       house       0.41      0.54      0.47      3837
     jewelry       0.34      0.02      0.03      1456
         men       0.52      0.46      0.49      3063
       women       0.47      0.83      0.60      4443

    accuracy                           0.46     15820
   macro avg       0.33      0.31      0.27     15820
weighted avg       0.39      0.46      0.38     15820


Confusion Matrix:
[[  43    0  330    0  327 1180]
 [  33    0  616   10  148  334]
 [  42    0 2057   11  511 1216]
 [  21    0  991   25   38  381]
 [  21    0  565   15 1418 1044]
 [  24    0  424   13  290 3692]]

=== MODEL COMPARISON ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Naive Bayes Cross-Validation Accuracy: 0.3987 (+/- 0.1198)
Logistic Regression Cross-Validation Accuracy: 0.4434 (+/- 0.0743)

Comparison Summary:
          Algorithm  Test Accuracy  CV Mean Accuracy  CV Std Deviation
        Naive Bayes       0.433502          0.398651          0.059882
Logistic Regression       0.457332          0.443366          0.037144

=== FEATURE IMPORTANCE ANALYSIS ===
Logistic Regression Coefficients by Category:
                     bags  beauty   house  jewelry     men   women
log_current_price  0.7217 -0.9830 -1.1979  -1.2537  0.4227  2.2902
discount           0.4523 -0.6659 -0.8046  -0.4648 -0.5436  2.0266
log_likes         -0.0038 -0.0949 -0.0026   0.0912 -0.1090  0.1191
price_diff        -0.4018  1.4420  1.4781   1.2802 -0.5509 -3.2477

=== ASSIGNMENT OBJECTIVES: BEST CATEGORY & TOP PRODUCTS ===
Category Performance Ranking:
          avg_likes  avg_discount  category_score
category                                         
women        238.17         55.

In [None]:
# CSCI946 Assignment 1 - Task 3: Classification Analysis
# ============================================================
# === Data Preparation for Classification ===
# Features selected: ['log_current_price', 'discount', 'log_likes', 'price_diff']
# Target variable: category
# Number of samples: 52733
# Number of features: 4
# Category distribution:
# category
# women      14809
# house      12791
# men        10208
# bags        6268
# jewelry     4853
# beauty      3804
# Name: count, dtype: int64

# Data split:
# Training set: 36913 samples
# Test set: 15820 samples

# === NAIVE BAYES CLASSIFICATION ===
# Naive Bayes Results:
# Accuracy: 0.4335

# Detailed Classification Report:
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
#               precision    recall  f1-score   support

#         bags       0.22      0.05      0.08      1880
#       beauty       0.27      0.01      0.02      1141
#        house       0.44      0.50      0.47      3837
#      jewelry       0.00      0.00      0.00      1456
#          men       0.63      0.29      0.40      3063
#        women       0.41      0.89      0.56      4443

#     accuracy                           0.43     15820
#    macro avg       0.33      0.29      0.25     15820
# weighted avg       0.39      0.43      0.36     15820


# Confusion Matrix:
# [[  88    2  241    0  152 1397]
#  [  26   11  511    0   76  517]
#  [  85   17 1931    0  193 1611]
#  [  30    4  871    0   36  515]
#  [  83    6  487    0  890 1597]
#  [  83    1  354    0   67 3938]]

# === LOGISTIC REGRESSION CLASSIFICATION ===
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# Logistic Regression Results:
# Accuracy: 0.4573

# Detailed Classification Report:
#               precision    recall  f1-score   support

#         bags       0.23      0.02      0.04      1880
#       beauty       0.00      0.00      0.00      1141
#        house       0.41      0.54      0.47      3837
#      jewelry       0.34      0.02      0.03      1456
#          men       0.52      0.46      0.49      3063
#        women       0.47      0.83      0.60      4443

#     accuracy                           0.46     15820
#    macro avg       0.33      0.31      0.27     15820
# weighted avg       0.39      0.46      0.38     15820


# Confusion Matrix:
# [[  43    0  330    0  327 1180]
#  [  33    0  616   10  148  334]
#  [  42    0 2057   11  511 1216]
#  [  21    0  991   25   38  381]
#  [  21    0  565   15 1418 1044]
#  [  24    0  424   13  290 3692]]

# === MODEL COMPARISON ===
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# /usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
#   warnings.warn(
# Naive Bayes Cross-Validation Accuracy: 0.3987 (+/- 0.1198)
# Logistic Regression Cross-Validation Accuracy: 0.4434 (+/- 0.0743)

# Comparison Summary:
#           Algorithm  Test Accuracy  CV Mean Accuracy  CV Std Deviation
#         Naive Bayes       0.433502          0.398651          0.059882
# Logistic Regression       0.457332          0.443366          0.037144

# === FEATURE IMPORTANCE ANALYSIS ===
# Logistic Regression Coefficients by Category:
#                      bags  beauty   house  jewelry     men   women
# log_current_price  0.7217 -0.9830 -1.1979  -1.2537  0.4227  2.2902
# discount           0.4523 -0.6659 -0.8046  -0.4648 -0.5436  2.0266
# log_likes         -0.0038 -0.0949 -0.0026   0.0912 -0.1090  0.1191
# price_diff        -0.4018  1.4420  1.4781   1.2802 -0.5509 -3.2477

# === ASSIGNMENT OBJECTIVES: BEST CATEGORY & TOP PRODUCTS ===
# Category Performance Ranking:
#           avg_likes  avg_discount  category_score
# category
# women        238.17         55.16        0.860695
# bags         201.00         51.84        0.777488
# men          189.22         45.06        0.676679
# jewelry      166.15         53.13        0.628984
# beauty       150.27         50.44        0.628961
# house        164.48         49.07        0.627330

# Best Category: women

# Top 10 Products:
#       category  current_price  likes_count  discount  product_score
# 48216    women          19.99        21403        65          0.796
# 48191    women          27.99        17684        53          0.655
# 41673    women          29.99        17414        51          0.642
# 48864    women          25.99        10965        85          0.565
# 37934    women          13.99        14252        48          0.544
# 44311    women          15.35        11165        65          0.509
# 52115    women          38.84        12482        51          0.504
# 41176    women          12.99        12786        46          0.497
# 44416    women          29.99        11498        51          0.476
# 27721      men          20.99        11521        46          0.461
# Best Category: women
# Top 10 Products:
#       category  current_price  likes_count  discount  product_score
# 48216    women          19.99        21403        65          0.796
# 48191    women          27.99        17684        53          0.655
# 41673    women          29.99        17414        51          0.642
# 48864    women          25.99        10965        85          0.565
# 37934    women          13.99        14252        48          0.544
# 44311    women          15.35        11165        65          0.509
# 52115    women          38.84        12482        51          0.504
# 41176    women          12.99        12786        46          0.497
# 44416    women          29.99        11498        51          0.476
# 27721      men          20.99        11521        46          0.461