In [26]:
import os
import joblib
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    TfidfVectorizer, 
    CountVectorizer, 
    HashingVectorizer
)
from sklearn.metrics import (
    accuracy_score, f1_score, hamming_loss, 
    mean_absolute_error, precision_score, recall_score
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from typing import Dict, List, Tuple, Any
from tqdm import tqdm
from tabulate import tabulate

from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np

import itertools

# -------------------- Core Classes --------------------
class DataLoader:
    @staticmethod
    def load_data(file_path: str) -> Tuple[pd.Series, np.ndarray, MultiLabelBinarizer]:
        """Load and clean data with multi-label support"""
        df = pd.read_csv(file_path)
        
        # Handle missing values
        df = df.dropna(subset=['description', 'categories'])
        df['description'] = df['description'].fillna('').str.strip()
        
        # Convert categories to binary matrix
        df['categories'] = df['categories'].astype(str)
        df['categories'] = df['categories'].str.split(', ')  # Assuming comma-separated labels
        
        # Create MultiLabelBinarizer
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(df['categories'])
        
        return df['description'], y, mlb

class VectorizerFactory:
    """Factory class to create different vectorizers with parameter validation"""
    @staticmethod
    def get_vectorizer(vectorizer_config: Dict) -> Any:
        vectorizer_type = vectorizer_config['type']
        params = vectorizer_config.get('params', {}).copy()

        # Create a separate dictionary for each vectorizer
        vectorizers = {
            'tfidf': TfidfVectorizer,
            'count': CountVectorizer,
            'hashing': HashingVectorizer
        }

        if vectorizer_type == 'hashing':
            # Rename max_features to n_features for hashing vectorizer only
            if 'max_features' in params:
                params['n_features'] = params.pop('max_features')
            # Remove incompatible parameters
            for param in ['min_df', 'max_df']:
                params.pop(param, None)

        if vectorizer_type not in vectorizers:
            raise ValueError(f"Unsupported vectorizer type: {vectorizer_type}")

        return vectorizers[vectorizer_type](**params)  # Instantiate with parameters



class Preprocessor(TransformerMixin, BaseEstimator):
    """Modular preprocessor with configurable vectorizer"""
    def __init__(self, vectorizer_config: Dict):
        self.vectorizer = VectorizerFactory.get_vectorizer(vectorizer_config)
        
    def fit(self, X, y=None):
        X = pd.Series(X).astype(str)
        self.vectorizer.fit(X)
        return self
    
    def transform(self, X):
        X = pd.Series(X).astype(str)
        return self.vectorizer.transform(X)

class ModelTrainer:
    def __init__(self, models: Dict[str, Any], metrics: Dict[str, Any]):
        self.models = models
        self.metrics = metrics
        self.results = []
        
    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Train and evaluate all models with extended metrics"""
        with tqdm(total=len(self.models), desc="Training models") as model_pbar:
            for model_name, model in self.models.items():
                model_pbar.set_description(f"Training {model_name}")
                
                # Training
                model.fit(X_train, y_train)
                
                # Predictions
                predictions = model.predict(X_test)
                
                # Calculate probabilities for MAE (if available)
                prob_predictions = (model.predict_proba(X_test) 
                                    if hasattr(model, 'predict_proba') 
                                    else None)
                
                # Calculate all metrics
                metrics_results = {}
                for metric_name, metric_func in self.metrics.items():
                    try:
                        if metric_name == 'mae' and prob_predictions is not None:
                            metrics_results[metric_name] = mean_absolute_error(
                                y_test, prob_predictions[:, 1] if prob_predictions.shape[1] == 2 else prob_predictions
                            )
                        else:
                            metrics_results[metric_name] = metric_func(y_true=y_test, y_pred=predictions)
                    except Exception as e:
                        print(f"Error calculating {metric_name}: {str(e)}")
                        metrics_results[metric_name] = np.nan
                
                self.results.append({
                    'model': model_name,
                    **metrics_results,
                    'instance': model
                })
                model_pbar.update(1)

class ExperimentRunner:
    def __init__(self, config: Dict):
        self.config = config
        
    def run(self, X, y, model_name: str = None):
        """Run complete experiment pipeline"""
        # Split data - keep original text data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.config['test_size'], random_state=42
        )
        
        # Initialize preprocessor with config
        preprocessor = Preprocessor(self.config['vectorizer'])
        
        # Transform data
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)

        # Convert to float32 for LightGBM compatibility
        X_train_transformed = X_train_transformed.astype(np.float32)
        X_test_transformed = X_test_transformed.astype(np.float32)

        # Fix for Naive Bayes when using HashingVectorizer
        if self.config['vectorizer']['type'] == 'hashing' and isinstance(list(self.config['models'].values())[0], OneVsRestClassifier):
            base_estimator = list(self.config['models'].values())[0].estimator
            if isinstance(base_estimator, MultinomialNB):
                print("Applying absolute value transformation to avoid negative values for MultinomialNB with HashingVectorizer")
                X_train_transformed = np.abs(X_train_transformed)
                X_test_transformed = np.abs(X_test_transformed)

        # Initialize and run trainer with transformed data
        trainer = ModelTrainer(self.config['models'], self.config['metrics'])
        trainer.train_and_evaluate(X_train_transformed, X_test_transformed, y_train, y_test)
        
        # Initialize and run trainer with transformed data
        trainer = ModelTrainer(self.config['models'], self.config['metrics'])
        trainer.train_and_evaluate(X_train_transformed, X_test_transformed, y_train, y_test)
        
        # Save models and preprocessor
        if 'save_dir' in self.config:
            save_models(
                trainer.results,
                preprocessor,
                self.config['mlb'],
                self.config['save_dir'],
                model_name
            )
        
        return trainer.results, X_test, y_test


# -------------------- Utility Functions --------------------
def save_models(results: List[Dict], preprocessor: Any, mlb: MultiLabelBinarizer, save_dir: str, vectorizer_name: str):
    """Save trained models and preprocessor to disk"""
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    try:
        # Save preprocessor and MLB
        joblib.dump(preprocessor, f"{save_dir}/preprocessor_{vectorizer_name}_{timestamp}.pkl")
        joblib.dump(mlb, f"{save_dir}/mlb_{vectorizer_name}_{timestamp}.pkl")
        
        # Save models
        for result in results:
            model_name = result['model'].replace(' ', '_').lower()
            filename = f"{save_dir}/{vectorizer_name}_{model_name}_{timestamp}.pkl"
            joblib.dump(result['instance'], filename)
        
        print(f"\nAll resources saved with timestamp: {timestamp}")
    except Exception as e:
        print(f"Error saving models: {str(e)}")

def print_results(results: List[Dict], sort_by: str = 'subset_accuracy'):
    """Print results with pretty formatting and highlight best values."""
    metric_names = {
        'hamming_loss': ('Hamming Loss', 'min'),
        'subset_accuracy': ('Accuracy', 'max'), 
        'micro_f1': ('Micro F1', 'max'),
        'macro_f1': ('Macro F1', 'max'),
        'mae': ('MAE', 'min')
    }

    # Prepare table data
    headers = ["Vectorizer", "Model"] + [name for name, _ in metric_names.values()]
    table_data = []

    # Identify best values for each metric
    best_values = {}
    for metric, (_, direction) in metric_names.items():
        metric_values = [res.get(metric, np.nan) for res in results]
        if direction == 'max':
            best_values[metric] = np.nanmax(metric_values)
        elif direction == 'min':
            best_values[metric] = np.nanmin(metric_values)

    # Populate table rows
    for res in results:
        row = [res['vectorizer'], res['model']]
        for metric, (metric_name, _) in metric_names.items():
            value = res.get(metric, np.nan)
            if isinstance(value, float):
                formatted_value = f"{value:.4f}" if 'Loss' in metric_name or 'MAE' in metric_name else f"{value:.2%}"
                # Highlight if this value is the best
                if value == best_values[metric]:
                    formatted_value = f"\033[1m\033[92m{formatted_value}\033[0m"  # Bold green text
                row.append(formatted_value)
            else:
                row.append("N/A")
        table_data.append(row)

    # Print formatted table
    print("\n" + "="*80)
    print("FINAL RESULTS".center(80))
    print("="*80)
    print(tabulate(table_data, headers=headers, tablefmt="grid", stralign="center"))

def load_resources(model_path: str, preprocessor_path: str, mlb_path: str) -> tuple:
    """Load saved model, preprocessor, and label binarizer"""
    try:
        return (
            joblib.load(model_path),
            joblib.load(preprocessor_path),
            joblib.load(mlb_path)
        )
    except Exception as e:
        raise ValueError(f"Error loading resources: {str(e)}")

def evaluate_loaded_model(model: Any, preprocessor: Any, mlb: MultiLabelBinarizer,
                         X_test: pd.Series, y_test: np.ndarray) -> None:
    """Evaluate and print metrics for a loaded model"""
    # Preprocess test data
    X_test_transformed = preprocessor.transform(X_test)
    
    # Get predictions
    predictions = model.predict(X_test_transformed)
    
    # Calculate metrics
    results = {
        'hamming_loss': hamming_loss(y_test, predictions),
        'subset_accuracy': accuracy_score(y_test, predictions),
        'micro_f1': f1_score(y_test, predictions, average='micro'),
        'macro_f1': f1_score(y_test, predictions, average='macro'),
    }
    
    # Handle MAE separately
    try:
        prob_predictions = model.predict_proba(X_test_transformed)
        results['mae'] = mean_absolute_error(
            y_test, prob_predictions[:, 1] if prob_predictions.shape[1] == 2 else prob_predictions
        )
    except AttributeError:
        results['mae'] = np.nan
    
    # Print results
    print_results([{'model': 'Loaded Model', **results}])

def predict_genre(model: Any, preprocessor: Any, mlb: MultiLabelBinarizer,
                 synopsis: str, threshold: float = 0.5) -> dict:
    """Predict genre for a single synopsis"""
    # Preprocess text
    processed = preprocessor.transform([synopsis])
    
    # Make prediction
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(processed)
        prediction = (probabilities >= threshold).astype(int)
    else:
        prediction = model.predict(processed)
        probabilities = None
    
    # Convert to labels
    labels = mlb.inverse_transform(prediction)
    
    return {
        'prediction': labels[0],
        'probabilities': dict(zip(mlb.classes_, probabilities[0])) if probabilities is not None else None
    }

# -------------------- Configuration --------------------
# Define vectorization methods
VECTORIZERS = {
    'tfidf': {
        'type': 'tfidf',
        'params': {
            'stop_words': 'english',
            'max_features': 5000,
            'min_df': 2,
            'max_df': 0.95
        }
    },
    'count': {
        'type': 'count',
        'params': {
            'stop_words': 'english',
            'max_features': 5000,
            'min_df': 2,
            'max_df': 0.95
        }
    },
    'hashing': {
        'type': 'hashing',
        'params': {
            'n_features': 5000  # HashingVectorizer uses n_features instead of max_features
        }
    }
}



# Define classifiers
CLASSIFIERS = {
    'Naive Bayes': OneVsRestClassifier(MultinomialNB()),
    'Logistic Reg': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    'SVM': OneVsRestClassifier(SVC(kernel='linear', probability=True)),
    'SGD Classifier': OneVsRestClassifier(
        SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, n_jobs=-1)
    ),
    'LightGBM': OneVsRestClassifier(
        LGBMClassifier(n_estimators=100, n_jobs=-1, verbose=-1)
    ),
    'Decision Tree': OneVsRestClassifier(
        DecisionTreeClassifier(max_depth=10, min_samples_split=10)
    ),
    'LinearSVC': OneVsRestClassifier(
        LinearSVC(dual=False, max_iter=10000, class_weight='balanced')
    )
}



# CONFIG = {
#     'test_size': 0.2,
#     'save_dir': "../models/saved_models",
#     'vectorizer': {
#         'type': 'tfidf',  # Switch to TF-IDF vectorization
#         'params': {
#             'stop_words': 'english',
#             'max_features': 5000,
#             'min_df': 2,
#             'max_df': 0.95
#         }
#     },
#     'models': {
#         'Naive Bayes': OneVsRestClassifier(MultinomialNB()),
#         'Logistic Reg': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
#         'SVM': OneVsRestClassifier(SVC(kernel='linear', probability=True)),
#         'SGD Classifier': OneVsRestClassifier(
#             SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, n_jobs=-1)
#         ),
#         'LightGBM': OneVsRestClassifier(
#             LGBMClassifier(n_estimators=100, n_jobs=-1, verbose=-1)
#         ),
#         'Decision Tree': OneVsRestClassifier(
#             DecisionTreeClassifier(max_depth=10, min_samples_split=10)
#         ),
#         'LinearSVC': OneVsRestClassifier(
#             LinearSVC(dual=False, max_iter=10000, class_weight='balanced')
#         )
#     },
#     'metrics': {
#         'hamming_loss': hamming_loss,
#         'subset_accuracy': accuracy_score,
#         'micro_f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro'),
#         'macro_f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
#         'mae': mean_absolute_error
#     }
# }

In [27]:
# -------------------- Experiment Loop --------------------
# Load data
X, y, mlb = DataLoader.load_data("../data/books_dataset.csv")
all_results = []

for vectorizer_name, vectorizer_config in VECTORIZERS.items():
    for classifier_name, classifier in CLASSIFIERS.items():
        print(f"\nTesting {vectorizer_name} with {classifier_name}...")
        
        # Update config dynamically
        CONFIG['vectorizer'] = vectorizer_config
        CONFIG['models'] = {classifier_name: classifier}
        CONFIG['mlb'] = mlb
        
        # Run experiment
        runner = ExperimentRunner(CONFIG)
        results, _, _ = runner.run(X, y, vectorizer_name)
        
        # Append vectorizer info to results
        for result in results:
            result['vectorizer'] = vectorizer_name
            all_results.append(result)

        # Save models with vectorizer name
        save_models(results, runner.config['preprocessor'], mlb, CONFIG['save_dir'], vectorizer_name)

# Print and save final results
print_results(all_results)
joblib.dump(all_results, "all_experiment_results.pkl")
print("All experiments complete. Results saved.")



Testing tfidf with Naive Bayes...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Naive Bayes: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Naive Bayes: 100%|██████████| 1/1 [00:02<00:00,  2.41s/it]



All resources saved with timestamp: 20250130_164949
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing tfidf with Logistic Reg...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Logistic Reg: 100%|██████████| 1/1 [00:03<00:00,  3.72s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Logistic Reg: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]



All resources saved with timestamp: 20250130_164957
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing tfidf with SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SVM: 100%|██████████| 1/1 [01:22<00:00, 82.10s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SVM: 100%|██████████| 1/1 [01:22<00:00, 82.26s/it]



All resources saved with timestamp: 20250130_165241
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing tfidf with SGD Classifier...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SGD Classifier: 100%|██████████| 1/1 [00:02<00:00,  2.95s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SGD Classifier: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]



All resources saved with timestamp: 20250130_165248
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing tfidf with LightGBM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LightGBM: 100%|██████████| 1/1 [01:16<00:00, 76.53s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LightGBM: 100%|██████████| 1/1 [01:21<00:00, 81.68s/it]



All resources saved with timestamp: 20250130_165526
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing tfidf with Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Decision Tree: 100%|██████████| 1/1 [00:05<00:00,  5.19s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Decision Tree: 100%|██████████| 1/1 [00:05<00:00,  5.20s/it]



All resources saved with timestamp: 20250130_165537
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing tfidf with LinearSVC...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 1/1 [00:04<00:00,  4.27s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]



All resources saved with timestamp: 20250130_165546
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with Naive Bayes...


Training Naive Bayes: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it]
Training Naive Bayes: 100%|██████████| 1/1 [00:02<00:00,  2.42s/it]



All resources saved with timestamp: 20250130_165551
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with Logistic Reg...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Logistic Reg: 100%|██████████| 1/1 [00:03<00:00,  3.74s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Logistic Reg: 100%|██████████| 1/1 [00:03<00:00,  3.74s/it]



All resources saved with timestamp: 20250130_165559
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SVM: 100%|██████████| 1/1 [01:36<00:00, 96.49s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SVM: 100%|██████████| 1/1 [01:36<00:00, 96.15s/it]



All resources saved with timestamp: 20250130_165912
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with SGD Classifier...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SGD Classifier: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SGD Classifier: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it]



All resources saved with timestamp: 20250130_165918
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with LightGBM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LightGBM: 100%|██████████| 1/1 [00:59<00:00, 59.21s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LightGBM: 100%|██████████| 1/1 [00:59<00:00, 59.16s/it]



All resources saved with timestamp: 20250130_170116
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Decision Tree: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Decision Tree: 100%|██████████| 1/1 [00:02<00:00,  3.00s/it]



All resources saved with timestamp: 20250130_170123
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing count with LinearSVC...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 1/1 [00:48<00:00, 48.84s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 1/1 [00:48<00:00, 48.33s/it]



All resources saved with timestamp: 20250130_170300
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with Naive Bayes...
Applying absolute value transformation to avoid negative values for MultinomialNB with HashingVectorizer


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Naive Bayes: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Naive Bayes: 100%|██████████| 1/1 [00:02<00:00,  2.52s/it]



All resources saved with timestamp: 20250130_170306
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with Logistic Reg...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Logistic Reg: 100%|██████████| 1/1 [00:04<00:00,  4.31s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Logistic Reg: 100%|██████████| 1/1 [00:04<00:00,  4.16s/it]



All resources saved with timestamp: 20250130_170314
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SVM: 100%|██████████| 1/1 [02:29<00:00, 149.18s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SVM: 100%|██████████| 1/1 [02:29<00:00, 149.33s/it]



All resources saved with timestamp: 20250130_170813
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with SGD Classifier...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SGD Classifier: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training SGD Classifier: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]



All resources saved with timestamp: 20250130_170820
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with LightGBM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LightGBM: 100%|██████████| 1/1 [01:47<00:00, 107.92s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LightGBM: 100%|██████████| 1/1 [01:50<00:00, 110.13s/it]



All resources saved with timestamp: 20250130_171158
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Decision Tree: 100%|██████████| 1/1 [00:08<00:00,  8.42s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training Decision Tree: 100%|██████████| 1/1 [00:08<00:00,  8.37s/it]



All resources saved with timestamp: 20250130_171215
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

Testing hashing with LinearSVC...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 1/1 [00:05<00:00,  5.24s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 1/1 [00:05<00:00,  5.12s/it]



All resources saved with timestamp: 20250130_171226
Error saving models: Can't pickle <class '__main__.Preprocessor'>: it's not the same object as __main__.Preprocessor

                                 FINAL RESULTS                                  
+--------------+----------------+----------------+------------+------------+------------+--------+
|  Vectorizer  |     Model      |   Hamming Loss |  Accuracy  |  Micro F1  |  Macro F1  |    MAE |
|    tfidf     |  Naive Bayes   |         0.0017 |   57.37%   |   67.81%   |   3.69%    | 0.0025 |
+--------------+----------------+----------------+------------+------------+------------+--------+
|    tfidf     |  Logistic Reg  |         0.0016 |   58.23%   |   69.27%   |   1.90%    | 0.0033 |
+--------------+----------------+----------------+------------+------------+------------+--------+
|    tfidf     |      SVM       |         0.0008 |   80.40%   |   86.47%   |   33.11%   | 0.0012 |
+--------------+----------------+----------------+-----

In [1]:
# -------------------- Inference Cell --------------------
from sklearn.preprocessing import MultiLabelBinarizer
from tabulate import tabulate

def load_resources(timestamp: str = "latest") -> list:
    """Load all saved models, preprocessor, and label binarizer for the latest timestamp."""
    try:
        if timestamp == "latest":
            # Find all preprocessor files
            files = [f for f in os.listdir("../models/saved_models") 
                    if f.startswith("preprocessor_") and f.endswith(".pkl")]
            
            if not files:
                raise ValueError("No saved models found")
            
            # Extract full timestamp from filename (preprocessor_YYYYMMDD_HHMMSS.pkl)
            latest_file = sorted(files)[-1]
            timestamp = latest_file.split('_')[1] + '_' + latest_file.split('_')[2].split('.')[0]
        
        # Find matching model files
        model_files = [f for f in os.listdir("../models/saved_models") 
                      if f.endswith(f"{timestamp}.pkl") and not f.startswith(('preprocessor_', 'mlb_'))]
        
        if not model_files:
            raise ValueError(f"No models found for timestamp {timestamp}")
        
        # Load preprocessor and MLB (assumed to be common across models)
        preprocessor = joblib.load(f"../models/saved_models/preprocessor_{timestamp}.pkl")
        mlb = joblib.load(f"../models/saved_models/mlb_{timestamp}.pkl")
        
        # Load all matching models
        models = [joblib.load(f"../models/saved_models/{model_file}") for model_file in model_files]
        
        return models, preprocessor, mlb
    except Exception as e:
        raise ValueError(f"Error loading resources: {str(e)}")

def evaluate_models(models, preprocessor, mlb):
    """Evaluate all models for the latest timestamp and display results in a single table."""
    try:
        # Load original test data
        X_test_raw, y_test = joblib.load("test_data.pkl")
        
        # Transform using the preprocessor
        X_test_transformed = preprocessor.transform(X_test_raw)
        
        # Collect results for all models
        evaluation_results = []

        for model in models:
            predictions = model.predict(X_test_transformed)
            
            results = {
                'Model': type(model).__name__,
                'Hamming Loss': hamming_loss(y_test, predictions),
                'Accuracy': accuracy_score(y_test, predictions),
                'Micro F1': f1_score(y_test, predictions, average='micro'),
                'Macro F1': f1_score(y_test, predictions, average='macro'),
            }
            
            try:
                prob_predictions = model.predict_proba(X_test_transformed)
                results['MAE'] = mean_absolute_error(y_test, prob_predictions)
            except AttributeError:
                results['MAE'] = "N/A"
                
            evaluation_results.append(results)
        
        # Prepare table data
        headers = ["Model", "Hamming Loss", "Accuracy", "Micro F1", "Macro F1", "MAE"]
        table_data = [
            [
                res['Model'],
                f"{res['Hamming Loss']:.4f}",
                f"{res['Accuracy']:.4f}",
                f"{res['Micro F1']:.4f}",
                f"{res['Macro F1']:.4f}",
                res['MAE'] if res['MAE'] == "N/A" else f"{res['MAE']:.4f}"
            ]
            for res in evaluation_results
        ]

        # Print table
        print("\n" + "="*80)
        print("MODEL EVALUATION RESULTS".center(80))
        print("="*80)
        print(tabulate(table_data, headers=headers, tablefmt="grid", stralign="center"))
        
    except FileNotFoundError:
        print("Test data not found. Run training cell first.")

def predict_genre(model, preprocessor, mlb, text: str, threshold: float = 0.5):
    """Interactive prediction function"""
    processed = preprocessor.transform([text])
    
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(processed)
        prediction = (probabilities >= threshold).astype(int)
        prob_dict = dict(zip(mlb.classes_, probabilities[0]))
    else:
        prediction = model.predict(processed)
        prob_dict = None
    
    labels = mlb.inverse_transform(prediction)
    
    print("\nPrediction Results:")
    print(f"Input text: {text[:100]}...")
    print(f"Predicted genres: {', '.join(labels[0]) or 'None'}")
    
    if prob_dict:
        print("\nClass Probabilities:")
        for genre, prob in sorted(prob_dict.items(), key=lambda x: x[1], reverse=True):
            print(f"{genre:<25} {prob:.2%}")



In [7]:

if __name__ == "__main__":
    # Load latest models
    models, preprocessor, mlb = load_resources()
    
    # Evaluate all models
    evaluate_models(models, preprocessor, mlb)
    
    # Make prediction using the first model as an example
    sample_text = "A space opera featuring interstellar politics, alien species, and epic battles between galactic empires"
    predict_genre(models[0], preprocessor, mlb, sample_text)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



                            MODEL EVALUATION RESULTS                            
+---------------------+----------------+------------+------------+------------+--------+
|        Model        |   Hamming Loss |   Accuracy |   Micro F1 |   Macro F1 |  MAE   |
| OneVsRestClassifier |         0.0015 |     0.7625 |     0.7903 |     0.3407 | 0.0019 |
+---------------------+----------------+------------+------------+------------+--------+
| OneVsRestClassifier |         0.0005 |     0.887  |     0.9173 |     0.4674 |  N/A   |
+---------------------+----------------+------------+------------+------------+--------+
| OneVsRestClassifier |         0.0017 |     0.5737 |     0.6781 |     0.0369 | 0.0025 |
+---------------------+----------------+------------+------------+------------+--------+
| OneVsRestClassifier |         0.0016 |     0.5823 |     0.6927 |     0.019  | 0.0033 |
+---------------------+----------------+------------+------------+------------+--------+
| OneVsRestClassifier |     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
def predict_genre_from_path(model_path: str, synopsis: str, threshold: float = 0.5) -> dict:
    """
    Predict genres for a book synopsis using a saved model pipeline.
    
    Args:
        model_path (str): Path to the saved model file (.pkl)
        synopsis (str): Book description text to analyze
        threshold (float): Probability threshold for classification (default: 0.5)
    
    Returns:
        dict: Prediction results containing genres and probabilities
    """
    import os
    import joblib
    
    # Extract directory and filename components
    directory = os.path.dirname(model_path)
    filename = os.path.basename(model_path)
    
    # Extract timestamp from model filename
    try:
        filename_parts = filename.split('_')
        timestamp = '_'.join(filename_parts[-2:]).replace('.pkl', '')
    except:
        raise ValueError("Invalid model filename format. Expected format: 'modelname_YYYYMMDD_HHMMSS.pkl'")
    
    # Construct paths for preprocessor and label binarizer
    preprocessor_path = os.path.join(directory, f'preprocessor_{timestamp}.pkl')
    mlb_path = os.path.join(directory, f'mlb_{timestamp}.pkl')
    
    # Verify all required files exist
    for path in [model_path, preprocessor_path, mlb_path]:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Required file missing: {path}")
    
    # Load resources
    model = joblib.load(model_path)
    preprocessor = joblib.load(preprocessor_path)
    mlb = joblib.load(mlb_path)
    
    # Preprocess and predict
    processed_text = preprocessor.transform([synopsis])
    
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(processed_text)
        prediction = (probabilities >= threshold).astype(int)
    else:
        prediction = model.predict(processed_text)
        probabilities = None
    
    # Convert to human-readable labels
    predicted_labels = mlb.inverse_transform(prediction)
    
    return {
        'genres': list(predicted_labels[0]),
        'probabilities': dict(zip(mlb.classes_, probabilities[0])) if probabilities is not None else None
    }

In [9]:

prediction = predict_genre_from_path(
    model_path= "../models/saved_models/svm_20250127_185148.pkl",
    synopsis="A thrilling mystery about a detective solving crimes in Victorian London"
)
print(prediction['genres'])

['Fiction']
