In [4]:
import pandas as pd
import numpy as np
import os
import joblib
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, f1_score, hamming_loss, 
    mean_absolute_error, precision_score, recall_score
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from typing import Dict, List, Tuple, Any
from tqdm import tqdm
from tabulate import tabulate

from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# -------------------- Core Classes --------------------
class DataLoader:
    @staticmethod
    def load_data(file_path: str) -> Tuple[pd.Series, np.ndarray, MultiLabelBinarizer]:
        """Load and clean data with multi-label support"""
        df = pd.read_csv(file_path)
        
        # Handle missing values
        df = df.dropna(subset=['description', 'categories'])
        df['description'] = df['description'].fillna('').str.strip()
        
        # Convert categories to binary matrix
        df['categories'] = df['categories'].astype(str)
        df['categories'] = df['categories'].str.split(', ')  # Assuming comma-separated labels
        
        # Create MultiLabelBinarizer
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(df['categories'])
        
        return df['description'], y, mlb

class Preprocessor(TransformerMixin, BaseEstimator):
    """Enhanced preprocessor with basic text cleaning"""
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            min_df=2,
            max_df=0.95
        )
        
    def fit(self, X, y=None):
        # Convert to pandas Series for consistent handling
        X = pd.Series(X).astype(str)
        self.vectorizer.fit(X)
        return self
    
    def transform(self, X):
        # Ensure input is always treated as a pandas Series
        X = pd.Series(X).astype(str)
        return self.vectorizer.transform(X)

class ModelTrainer:
    def __init__(self, models: Dict[str, Any], metrics: Dict[str, Any]):
        self.models = models
        self.metrics = metrics
        self.results = []
        
    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Train and evaluate all models with extended metrics"""
        with tqdm(total=len(self.models), desc="Training models") as model_pbar:
            for model_name, model in self.models.items():
                model_pbar.set_description(f"Training {model_name}")
                
                # Training
                model.fit(X_train, y_train)
                
                # Predictions
                predictions = model.predict(X_test)
                
                # Calculate probabilities for MAE (if available)
                prob_predictions = (model.predict_proba(X_test) 
                                    if hasattr(model, 'predict_proba') 
                                    else None)
                
                # Calculate all metrics
                metrics_results = {}
                for metric_name, metric_func in self.metrics.items():
                    try:
                        if metric_name == 'mae' and prob_predictions is not None:
                            metrics_results[metric_name] = mean_absolute_error(
                                y_test, prob_predictions[:, 1] if prob_predictions.shape[1] == 2 else prob_predictions
                            )
                        else:
                            metrics_results[metric_name] = metric_func(y_true=y_test, y_pred=predictions)
                    except Exception as e:
                        print(f"Error calculating {metric_name}: {str(e)}")
                        metrics_results[metric_name] = np.nan
                
                self.results.append({
                    'model': model_name,
                    **metrics_results,
                    'instance': model
                })
                model_pbar.update(1)

class ExperimentRunner:
    def __init__(self, config: Dict):
        self.config = config
        
    def run(self, X, y):
        """Run complete experiment pipeline"""
        # Split data - keep original text data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.config['test_size'], random_state=42
        )
        
        # Preprocess data - create transformed versions
        preprocessor = self.config['preprocessor']()
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)
        
        # Initialize and run trainer with transformed data
        trainer = ModelTrainer(self.config['models'], self.config['metrics'])
        trainer.train_and_evaluate(X_train_transformed, X_test_transformed, y_train, y_test)
        
        # Save models and preprocessor
        if 'save_dir' in self.config:
            save_models(
                trainer.results,
                preprocessor,
                self.config['mlb'],
                self.config['save_dir']
            )
        
        # Return original text data for evaluation
        return trainer.results, X_test, y_test


# -------------------- Utility Functions --------------------
def save_models(results: List[Dict], preprocessor: Any, mlb: MultiLabelBinarizer, save_dir: str):
    """Save trained models and preprocessor to disk"""
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    try:
        # Save preprocessor and MLB
        joblib.dump(preprocessor, f"{save_dir}/preprocessor_{timestamp}.pkl")
        joblib.dump(mlb, f"{save_dir}/mlb_{timestamp}.pkl")
        
        # Save models
        for result in results:
            model_name = result['model'].replace(' ', '_').lower()
            filename = f"{save_dir}/{model_name}_{timestamp}.pkl"
            joblib.dump(result['instance'], filename)
            
        print(f"\nAll resources saved with timestamp: {timestamp}")
    except Exception as e:
        print(f"Error saving models: {str(e)}")

def print_results(results: List[Dict], sort_by: str = 'subset_accuracy'):
    """Print results with pretty formatting and highlight best values."""
    metric_names = {
        'hamming_loss': ('Hamming Loss', 'min'),
        'subset_accuracy': ('Accuracy', 'max'), 
        'micro_f1': ('Micro F1', 'max'),
        'macro_f1': ('Macro F1', 'max'),
        'mae': ('MAE', 'min')
    }

    # Prepare table data
    headers = ["Model"] + [name for name, _ in metric_names.values()]
    table_data = []

    # Identify best values for each metric
    best_values = {}
    for metric, (_, direction) in metric_names.items():
        metric_values = [res.get(metric, np.nan) for res in results]
        if direction == 'max':
            best_values[metric] = np.nanmax(metric_values)
        elif direction == 'min':
            best_values[metric] = np.nanmin(metric_values)

    # Populate table rows
    for res in results:
        row = [res['model']]
        for metric, (metric_name, _) in metric_names.items():
            value = res.get(metric, np.nan)
            if isinstance(value, float):
                formatted_value = f"{value:.4f}" if 'Loss' in metric_name or 'MAE' in metric_name else f"{value:.2%}"

                # Highlight if this value is the best
                if value == best_values[metric]:
                    formatted_value = f"\033[1m\033[92m{formatted_value}\033[0m"  # Bold green text

                row.append(formatted_value)
            else:
                row.append("N/A")
        table_data.append(row)

    # Print formatted table
    print("\n" + "="*80)
    print("FINAL RESULTS".center(80))
    print("="*80)
    print(tabulate(table_data, headers=headers, tablefmt="grid", stralign="center"))

def load_resources(model_path: str, preprocessor_path: str, mlb_path: str) -> tuple:
    """Load saved model, preprocessor, and label binarizer"""
    try:
        return (
            joblib.load(model_path),
            joblib.load(preprocessor_path),
            joblib.load(mlb_path)
        )
    except Exception as e:
        raise ValueError(f"Error loading resources: {str(e)}")

def evaluate_loaded_model(model: Any, preprocessor: Any, mlb: MultiLabelBinarizer,
                         X_test: pd.Series, y_test: np.ndarray) -> None:
    """Evaluate and print metrics for a loaded model"""
    # Preprocess test data
    X_test_transformed = preprocessor.transform(X_test)
    
    # Get predictions
    predictions = model.predict(X_test_transformed)
    
    # Calculate metrics
    results = {
        'hamming_loss': hamming_loss(y_test, predictions),
        'subset_accuracy': accuracy_score(y_test, predictions),
        'micro_f1': f1_score(y_test, predictions, average='micro'),
        'macro_f1': f1_score(y_test, predictions, average='macro'),
    }
    
    # Handle MAE separately
    try:
        prob_predictions = model.predict_proba(X_test_transformed)
        results['mae'] = mean_absolute_error(
            y_test, prob_predictions[:, 1] if prob_predictions.shape[1] == 2 else prob_predictions
        )
    except AttributeError:
        results['mae'] = np.nan
    
    # Print results
    print_results([{'model': 'Loaded Model', **results}])

def predict_genre(model: Any, preprocessor: Any, mlb: MultiLabelBinarizer,
                 synopsis: str, threshold: float = 0.5) -> dict:
    """Predict genre for a single synopsis"""
    # Preprocess text
    processed = preprocessor.transform([synopsis])
    
    # Make prediction
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(processed)
        prediction = (probabilities >= threshold).astype(int)
    else:
        prediction = model.predict(processed)
        probabilities = None
    
    # Convert to labels
    labels = mlb.inverse_transform(prediction)
    
    return {
        'prediction': labels[0],
        'probabilities': dict(zip(mlb.classes_, probabilities[0])) if probabilities is not None else None
    }

# -------------------- Configuration --------------------
CONFIG = {
    'test_size': 0.2,
    'save_dir': "../models/saved_models",
    'preprocessor': None,
    'models': {
        'Naive Bayes': OneVsRestClassifier(MultinomialNB()),
        'Logistic Reg': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
        'SVM': OneVsRestClassifier(SVC(kernel='linear', probability=True)),
        'SGD Classifier': OneVsRestClassifier(
            SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, n_jobs=-1)
        ),
        # 'MLP Neural Net': OneVsRestClassifier(
        #     MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, early_stopping=True)
        # ),
        # 'XGBoost': OneVsRestClassifier(
        #     XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
        # ),
        'LightGBM': OneVsRestClassifier(
            LGBMClassifier(n_estimators=100, n_jobs=-1, verbose=-1)
        ),
        'Decision Tree': OneVsRestClassifier(
            DecisionTreeClassifier(max_depth=10, min_samples_split=10)
        ),
        'LinearSVC': OneVsRestClassifier(
            LinearSVC(dual=False, max_iter=10000, class_weight='balanced')
        )
    },
    'metrics': {
        'hamming_loss': hamming_loss,
        'subset_accuracy': accuracy_score,
        'micro_f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro'),
        'macro_f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
        'mae': mean_absolute_error
    }
}
# -------------------- Main Execution --------------------
if __name__ == "__main__":
    # Load data
    X, y, mlb = DataLoader.load_data("../data/books_dataset.csv")
    CONFIG['preprocessor'] = Preprocessor
    CONFIG['mlb'] = mlb
    
    # Run experiment
    runner = ExperimentRunner(CONFIG)
    results, X_test_raw, y_test = runner.run(X, y)  # Now contains original text data
    
    # Show comparison
    print_results(results)
    
    # Save test data for later evaluation
    joblib.dump((X_test_raw, y_test), "test_data.pkl")
    print("Training complete. Models and test data saved.")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training LinearSVC: 100%|██████████| 7/7 [03:36<00:00, 30.86s/it]



All resources saved with timestamp: 20250127_185148

                                 FINAL RESULTS                                  
+----------------+----------------+------------+------------+------------+--------+
|     Model      |   Hamming Loss |  Accuracy  |  Micro F1  |  Macro F1  |    MAE |
|  Naive Bayes   |         0.0017 |   57.37%   |   67.81%   |   3.69%    | 0.0025 |
+----------------+----------------+------------+------------+------------+--------+
|  Logistic Reg  |         0.0016 |   58.23%   |   69.27%   |   1.90%    | 0.0033 |
+----------------+----------------+------------+------------+------------+--------+
|      SVM       |         0.0008 |   80.40%   |   86.47%   |   33.11%   | 0.0012 |
+----------------+----------------+------------+------------+------------+--------+
| SGD Classifier |         0.0013 |   65.59%   |   75.86%   |   7.85%    | 0.0063 |
+----------------+----------------+------------+------------+------------+--------+
|    LightGBM    |       

In [12]:
# -------------------- Inference Cell --------------------
from sklearn.preprocessing import MultiLabelBinarizer
from tabulate import tabulate

def load_resources(timestamp: str = "latest") -> list:
    """Load all saved models, preprocessor, and label binarizer for the latest timestamp."""
    try:
        if timestamp == "latest":
            # Find all preprocessor files
            files = [f for f in os.listdir("../models/saved_models") 
                    if f.startswith("preprocessor_") and f.endswith(".pkl")]
            
            if not files:
                raise ValueError("No saved models found")
            
            # Extract full timestamp from filename (preprocessor_YYYYMMDD_HHMMSS.pkl)
            latest_file = sorted(files)[-1]
            timestamp = latest_file.split('_')[1] + '_' + latest_file.split('_')[2].split('.')[0]
        
        # Find matching model files
        model_files = [f for f in os.listdir("../models/saved_models") 
                      if f.endswith(f"{timestamp}.pkl") and not f.startswith(('preprocessor_', 'mlb_'))]
        
        if not model_files:
            raise ValueError(f"No models found for timestamp {timestamp}")
        
        # Load preprocessor and MLB (assumed to be common across models)
        preprocessor = joblib.load(f"../models/saved_models/preprocessor_{timestamp}.pkl")
        mlb = joblib.load(f"../models/saved_models/mlb_{timestamp}.pkl")
        
        # Load all matching models
        models = [joblib.load(f"../models/saved_models/{model_file}") for model_file in model_files]
        
        return models, preprocessor, mlb
    except Exception as e:
        raise ValueError(f"Error loading resources: {str(e)}")

def evaluate_models(models, preprocessor, mlb):
    """Evaluate all models for the latest timestamp and display results in a single table."""
    try:
        # Load original test data
        X_test_raw, y_test = joblib.load("test_data.pkl")
        
        # Transform using the preprocessor
        X_test_transformed = preprocessor.transform(X_test_raw)
        
        # Collect results for all models
        evaluation_results = []

        for model in models:
            predictions = model.predict(X_test_transformed)
            
            results = {
                'Model': type(model).__name__,
                'Hamming Loss': hamming_loss(y_test, predictions),
                'Accuracy': accuracy_score(y_test, predictions),
                'Micro F1': f1_score(y_test, predictions, average='micro'),
                'Macro F1': f1_score(y_test, predictions, average='macro'),
            }
            
            try:
                prob_predictions = model.predict_proba(X_test_transformed)
                results['MAE'] = mean_absolute_error(y_test, prob_predictions)
            except AttributeError:
                results['MAE'] = "N/A"
                
            evaluation_results.append(results)
        
        # Prepare table data
        headers = ["Model", "Hamming Loss", "Accuracy", "Micro F1", "Macro F1", "MAE"]
        table_data = [
            [
                res['Model'],
                f"{res['Hamming Loss']:.4f}",
                f"{res['Accuracy']:.4f}",
                f"{res['Micro F1']:.4f}",
                f"{res['Macro F1']:.4f}",
                res['MAE'] if res['MAE'] == "N/A" else f"{res['MAE']:.4f}"
            ]
            for res in evaluation_results
        ]

        # Print table
        print("\n" + "="*80)
        print("MODEL EVALUATION RESULTS".center(80))
        print("="*80)
        print(tabulate(table_data, headers=headers, tablefmt="grid", stralign="center"))
        
    except FileNotFoundError:
        print("Test data not found. Run training cell first.")

def predict_genre(model, preprocessor, mlb, text: str, threshold: float = 0.5):
    """Interactive prediction function"""
    processed = preprocessor.transform([text])
    
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(processed)
        prediction = (probabilities >= threshold).astype(int)
        prob_dict = dict(zip(mlb.classes_, probabilities[0]))
    else:
        prediction = model.predict(processed)
        prob_dict = None
    
    labels = mlb.inverse_transform(prediction)
    
    print("\nPrediction Results:")
    print(f"Input text: {text[:100]}...")
    print(f"Predicted genres: {', '.join(labels[0]) or 'None'}")
    
    if prob_dict:
        print("\nClass Probabilities:")
        for genre, prob in sorted(prob_dict.items(), key=lambda x: x[1], reverse=True):
            print(f"{genre:<25} {prob:.2%}")

if __name__ == "__main__":
    # Load latest models
    models, preprocessor, mlb = load_resources()
    
    # Evaluate all models
    evaluate_models(models, preprocessor, mlb)
    
    # Make prediction using the first model as an example
    sample_text = "A space opera featuring interstellar politics, alien species, and epic battles between galactic empires"
    predict_genre(models[0], preprocessor, mlb, sample_text)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



                            MODEL EVALUATION RESULTS                            
+-----------+----------------+------------+------------+------------+--------+
|   Model   |   Hamming Loss |   Accuracy |   Micro F1 |   Macro F1 |  MAE   |
| Lightgbm  |         0.0015 |     0.7582 |     0.7872 |     0.3407 | 0.0019 |
+-----------+----------------+------------+------------+------------+--------+
| Linearsvc |         0.0005 |     0.887  |     0.9173 |     0.4674 |  N/A   |
+-----------+----------------+------------+------------+------------+--------+
|   Naive   |         0.0017 |     0.5737 |     0.6781 |     0.0369 | 0.0025 |
+-----------+----------------+------------+------------+------------+--------+
| Logistic  |         0.0016 |     0.5823 |     0.6927 |     0.019  | 0.0033 |
+-----------+----------------+------------+------------+------------+--------+
| Decision  |         0.0019 |     0.5787 |     0.7402 |     0.4273 | 0.0021 |
+-----------+----------------+------------+------

