In [1]:
import pandas as pd
import numpy as np

In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib # Import joblib for saving
import numpy as np # Import numpy

print("--- Running pipeline to prepare model and vectorizer for saving ---")

# Trains the primary-category classifier (TF-IDF features + logistic regression)
# on the training split of combined_cleaned.csv, then writes the fitted model and
# vectorizer both to the working directory and to the directory below.
deploy_dir = '../../src/nimblist/Nimblist.classification/'

# Load the improved dataset
try:
    df = pd.read_csv("combined_cleaned.csv")

    # --- 1. Prepare Data ---
    # Rows missing either the product text or the category label are unusable.
    df = df.dropna(subset=['generic_product_name', 'newCat'])
    X = df['generic_product_name'].astype(str)
    y = df['newCat']
    # Text normalisation: lowercase, drop every character that is neither a word
    # character nor whitespace, then collapse whitespace runs and trim the ends.
    X_cleaned = X.str.lower()
    X_cleaned = X_cleaned.apply(lambda x: re.sub(r'[^\w\s]', '', x))
    X_cleaned = X_cleaned.apply(lambda x: re.sub(r'\s+', ' ', x).strip())

    # --- 2. Split Data ---
    # IMPORTANT: Use the same random_state for splitting as during evaluation
    # NOTE: X_test / y_test are not used below; only the training split is fitted.
    X_train, X_test, y_train, y_test = train_test_split(
        X_cleaned, y,
        test_size=0.25, # Using same test size
        random_state=42, # Using same random state
        stratify=y
    )

    # --- 3. Feature Extraction (TF-IDF) ---
    # IMPORTANT: Use the same vectorizer settings
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=5000, # Or maybe more if using n-grams
        ngram_range=(1, 2) # <--- Include single words AND two-word phrases
    )
    print("Fitting TF-IDF Vectorizer...")
    # The vectorizer is fitted on the training split only, so the saved
    # vocabulary never contains terms seen exclusively in the held-out rows.
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # --- 4. Model Training (Logistic Regression) ---
    # IMPORTANT: Use the same model settings
    print("Training Logistic Regression model...")
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42
        )
    model.fit(X_train_tfidf, y_train)
    print("Model training complete.")

    # --- 5. Save Model & Vectorizer ---
    model_filename = 'supermarket_classifier_logreg.joblib'
    vectorizer_filename = 'tfidf_vectorizer_logreg.joblib'

    # Each artifact is written twice: next to the notebook and into deploy_dir.
    print(f"\nSaving model to {model_filename}...")
    joblib.dump(model, model_filename)
    joblib.dump(model, deploy_dir + model_filename)

    print(f"Saving vectorizer to {vectorizer_filename}...")
    joblib.dump(vectorizer, vectorizer_filename)
    joblib.dump(vectorizer, deploy_dir + vectorizer_filename)

    print("\n--- Model and vectorizer saved successfully! ---")
    print(f"Files created: {model_filename}, {vectorizer_filename}")

except FileNotFoundError as e:
    # This is raised when the CSV is missing, but also when a save destination
    # (e.g. deploy_dir) does not exist, so report the path that actually failed
    # instead of always blaming the dataset.
    missing_path = e.filename if e.filename is not None else e
    print(f"Error: '{missing_path}' was not found (check that the file and its parent directory exist).")
except KeyError as e:
    print(f"Error: A column was not found. Please check column names. Details: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

--- Running pipeline to prepare model and vectorizer for saving ---
Fitting TF-IDF Vectorizer...
Training Logistic Regression model...
Model training complete.

Saving model to supermarket_classifier_logreg.joblib...
Saving vectorizer to tfidf_vectorizer_logreg.joblib...

--- Model and vectorizer saved successfully! ---
Files created: supermarket_classifier_logreg.joblib, tfidf_vectorizer_logreg.joblib


In [16]:
import pandas as pd
import re
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np # Import numpy

# --- Configuration ---
# input_csv: dataset read by the sub-category training loop.
# output_dir: folder that collects the per-category model/vectorizer files.
input_csv, output_dir = "combined_cleaned.csv", "sub_category_models"
# Create the folder up front; exist_ok keeps re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# --- Helper function to sanitize filenames ---
def sanitize_filename(name):
    """Turn a category label into a string that is safe to embed in a filename.

    Every run of characters other than word characters and hyphens is
    collapsed into a single underscore, and underscores at either end of
    the result are trimmed.
    """
    collapsed = re.sub(r'[^\w\-]+', '_', name)
    return collapsed.strip('_')

# --- Load Data ---
# For every primary category, trains a dedicated TF-IDF + logistic-regression
# model that predicts the sub-category, prints a classification report on the
# held-out split, and saves the model/vectorizer pair under output_dir and
# under the same relative path inside deploy_root.
deploy_root = '../../src/nimblist/Nimblist.classification/'

try:
    df = pd.read_csv(input_csv)
    print(f"Loaded dataset: {df.shape}")

    # Drop rows where essential columns might be missing (safety check)
    df = df.dropna(subset=['generic_product_name', 'newCat', 'newSubCat'])
    print(f"Dataset shape after dropping NA: {df.shape}")

    # The deployment copies are written to <deploy_root>/<output_dir>/...; make
    # sure that mirror directory exists so the second joblib.dump of each
    # artifact cannot fail on a missing folder.
    os.makedirs(os.path.join(deploy_root, output_dir), exist_ok=True)

    # Get unique primary categories
    primary_categories = df['newCat'].unique()
    print(f"\nFound {len(primary_categories)} primary categories to process.")

    # --- Loop through each primary category ---
    for primary_cat in primary_categories:
        print(f"\n--- Processing Primary Category: {primary_cat} ---")

        # Filter data for the current primary category
        df_sub = df[df['newCat'] == primary_cat].copy() # Use .copy() to avoid SettingWithCopyWarning

        # Check if enough data and multiple sub-categories exist
        if len(df_sub) < 10: # Arbitrary threshold for minimum samples
            print(f"Skipping '{primary_cat}': Insufficient data ({len(df_sub)} samples).")
            continue
        if df_sub['newSubCat'].nunique() < 2:
            print(f"Skipping '{primary_cat}': Only one sub-category found.")
            continue

        print(f"Found {len(df_sub)} samples and {df_sub['newSubCat'].nunique()} sub-categories.")

        # --- Prepare Data for Sub-model ---
        X = df_sub['generic_product_name'].astype(str)
        y = df_sub['newSubCat'] # Target is now the sub-category

        # Basic Text Cleaning: lowercase, drop non-word/non-space characters,
        # collapse whitespace runs and trim the ends.
        X_cleaned = X.str.lower()
        X_cleaned = X_cleaned.apply(lambda x: re.sub(r'[^\w\s]', '', x))
        X_cleaned = X_cleaned.apply(lambda x: re.sub(r'\s+', ' ', x).strip())

        # --- Split Data ---
        try:
            # Try splitting with stratification
            X_train, X_test, y_train, y_test = train_test_split(
                X_cleaned, y,
                test_size=0.25,
                random_state=42,
                stratify=y # Stratify by sub-category
            )
        except ValueError as e:
            # If stratification fails (e.g., some sub-category has only 1 sample)
            print(f"Warning: Stratification failed for '{primary_cat}' ({e}). Splitting without stratification.")
            X_train, X_test, y_train, y_test = train_test_split(
                X_cleaned, y,
                test_size=0.25,
                random_state=42
                # No stratify
            )

        # A non-stratified split can leave a single class in the training data,
        # which the classifier cannot be fitted on. Skip just this category
        # rather than letting the error abort all remaining categories.
        if y_train.nunique() < 2:
            print(f"Skipping '{primary_cat}': training split contains only one sub-category.")
            continue

        # --- Feature Extraction (TF-IDF) ---
        # Train a *new* vectorizer specifically for this subset
        vectorizer_sub = TfidfVectorizer(
            stop_words='english',
            max_features=5000 # Keep consistent for now, could be tuned per category
        )
        print("Fitting sub-category TF-IDF Vectorizer...")
        X_train_tfidf = vectorizer_sub.fit_transform(X_train)
        X_test_tfidf = vectorizer_sub.transform(X_test)

        # --- Model Training ---
        print("Training sub-category Logistic Regression model...")
        model_sub = LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        )
        model_sub.fit(X_train_tfidf, y_train)

        # --- Evaluate Sub-Model (Optional but Recommended) ---
        print("Evaluating sub-category model...")
        y_pred_sub = model_sub.predict(X_test_tfidf)
        print(f"Classification Report for '{primary_cat}':")
        # Use zero_division=0 to avoid warnings for classes with no support in test set if split fails badly
        print(classification_report(y_test, y_pred_sub, zero_division=0))

        # --- Save Model & Vectorizer ---
        # File names embed the sanitized category label; each artifact is written
        # to output_dir and to the mirrored path under deploy_root.
        sanitized_cat_name = sanitize_filename(primary_cat)
        model_filename = os.path.join(output_dir, f'model_sub_{sanitized_cat_name}.joblib')
        vectorizer_filename = os.path.join(output_dir, f'vectorizer_sub_{sanitized_cat_name}.joblib')

        print(f"Saving sub-model to {model_filename}...")
        joblib.dump(model_sub, model_filename)
        joblib.dump(model_sub, deploy_root + model_filename)
        print(f"Saving sub-vectorizer to {vectorizer_filename}...")
        joblib.dump(vectorizer_sub, vectorizer_filename)
        joblib.dump(vectorizer_sub, deploy_root + vectorizer_filename)

    print("\n--- Finished processing all categories. ---")

except FileNotFoundError as e:
    # Raised for a missing input CSV but also for a missing save destination,
    # so name the path that actually failed rather than always the CSV.
    missing_path = e.filename if e.filename is not None else e
    print(f"Error: '{missing_path}' was not found (check that the file and its parent directory exist).")
except KeyError as e:
    print(f"Error: A column was not found. Please check column names. Details: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Loaded dataset: (45285, 9)
Dataset shape after dropping NA: (45285, 9)

Found 12 primary categories to process.

--- Processing Primary Category: Fresh & Chilled ---
Found 8550 samples and 20 sub-categories.
Fitting sub-category TF-IDF Vectorizer...
Training sub-category Logistic Regression model...
Evaluating sub-category model...
Classification Report for 'Fresh & Chilled':
                                precision    recall  f1-score   support

                        Cheese       0.84      0.91      0.88       158
              Chilled Desserts       0.73      0.80      0.77        92
        Chilled Fish & Seafood       0.80      0.94      0.86        71
     Chilled Juice & Smoothies       0.87      0.83      0.85        82
    Chilled Vegetarian & Vegan       0.51      0.33      0.40        98
     Cooked Meats, Deli & Dips       0.72      0.58      0.64       187
            Dairy Alternatives       0.37      0.64      0.47        47
            Food To Go & Soups       0.75   

In [12]:
import joblib
import re
import pandas as pd
import numpy as np # Import numpy

# --- Configuration ---
# Relative paths of the saved TF-IDF vectorizer and logistic-regression model
# to load; change these if the artifacts were saved under different names.
NGRAM_VECTORIZER_PATH = 'tfidf_vectorizer_logreg.joblib'
NGRAM_MODEL_PATH = 'supermarket_classifier_logreg.joblib'

# --- Text Cleaning Function (MUST match training) ---
def clean_text(text):
    """Normalise a product name the same way the training data was cleaned.

    Non-string input is converted with ``str()`` first. The text is then
    lowercased, every character that is neither a word character nor
    whitespace is removed, whitespace runs are collapsed to a single space,
    and leading/trailing whitespace is trimmed.
    """
    lowered = (text if isinstance(text, str) else str(text)).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# --- Load Model and Vectorizer ---
# The vectorizer is loaded first, then the model. If either load fails, both
# names are reset to None so the prediction step can detect the failure.
try:
    print(f"Loading n-gram vectorizer from: {NGRAM_VECTORIZER_PATH}")
    vectorizer = joblib.load(NGRAM_VECTORIZER_PATH)
    print(f"Loading n-gram model from: {NGRAM_MODEL_PATH}")
    model = joblib.load(NGRAM_MODEL_PATH)
    print("Model and vectorizer loaded successfully.")
except Exception as e:
    model = vectorizer = None
    if isinstance(e, FileNotFoundError):
        # Missing artifact: point at the two expected paths.
        print(f"Error: Could not find model or vectorizer files. \nPlease ensure '{NGRAM_MODEL_PATH}' and '{NGRAM_VECTORIZER_PATH}' exist.")
    else:
        print(f"An error occurred loading files: {e}")

# --- Predict Probabilities for specific input ---
# Prints the per-category probabilities for one product name, sorted from most
# to least likely, followed by the top three rows.
if not (model and vectorizer):
    print("\nCannot proceed without loaded model and vectorizer.")
else:
    product_name = "Whole Milk"
    print(f"\n--- Checking probabilities for: '{product_name}' ---")

    # Apply the same cleaning as at training time before vectorizing.
    cleaned_name = clean_text(product_name)

    try:
        # The vectorizer takes an iterable of documents, hence the one-item list.
        features_tfidf = vectorizer.transform([cleaned_name])
        probabilities = model.predict_proba(features_tfidf)

        if probabilities.shape[0] < 1:
            print("Model did not return probabilities.")
        else:
            # Row 0 is the single input; pair its values with model.classes_,
            # which lists the labels in the model's own column order.
            prob_df_sorted = pd.DataFrame({
                'Category': model.classes_,
                'Probability': probabilities[0]
            }).sort_values(by='Probability', ascending=False)

            markdown_options = dict(index=False, numalign="left", stralign="left")

            print("\nPredicted Probabilities (Sorted):")
            print(prob_df_sorted.to_markdown(**markdown_options))

            print("\nTop 3 Predictions:")
            print(prob_df_sorted.head(3).to_markdown(**markdown_options))

    except Exception as e:
        print(f"An error occurred during prediction: {e}")

Loading n-gram vectorizer from: tfidf_vectorizer_logreg.joblib
Loading n-gram model from: supermarket_classifier_logreg.joblib
Model and vectorizer loaded successfully.

--- Checking probabilities for: 'Whole Milk' ---

Predicted Probabilities (Sorted):
| Category                  | Probability   |
|:--------------------------|:--------------|
| Baby & Toddler            | 0.230469      |
| Treats & Snacks           | 0.193704      |
| Fresh & Chilled           | 0.167285      |
| Tea, Coffee & Soft Drinks | 0.161831      |
| Pet                       | 0.0797833     |
| Food Cupboard             | 0.0659983     |
| Frozen                    | 0.0349855     |
| Health & Beauty           | 0.018315      |
| Bakery                    | 0.0165024     |
| Home                      | 0.0115154     |
| Household                 | 0.0105815     |
| Beer, Wine & Spirits      | 0.00902913    |

Top 3 Predictions:
| Category        | Probability   |
|:----------------|:--------------|
| Baby & T