In [None]:
import pandas as pd
import numpy as np
import os
import joblib
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, f1_score, hamming_loss, 
    mean_absolute_error, precision_score, recall_score
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from typing import Dict, List, Tuple, Any
from tqdm import tqdm
from tabulate import tabulate

# -------------------- Core Classes --------------------
class DataLoader:
    """Read the book dataset and turn it into multi-label training inputs."""

    @staticmethod
    def load_data(file_path: str) -> Tuple[pd.Series, np.ndarray, MultiLabelBinarizer]:
        """Load and clean data with multi-label support.

        Args:
            file_path: Path of a CSV file containing at least the
                ``description`` and ``categories`` columns.

        Returns:
            A ``(descriptions, y, mlb)`` tuple: the stripped description
            texts, the binary indicator matrix produced by the binarizer,
            and the fitted ``MultiLabelBinarizer`` itself.
        """
        df = pd.read_csv(file_path)

        # Drop rows missing either column. The explicit copy makes the column
        # assignment below act on an independent frame, which avoids pandas
        # chained-assignment warnings on the filtered result.
        df = df.dropna(subset=['description', 'categories']).copy()
        df['description'] = df['description'].astype(str).str.strip()

        # Split on commas and trim each label so "a, b", "a,b" and "a ,  b"
        # all produce the same label set; empty fragments (e.g. from a
        # trailing comma) are discarded instead of becoming a bogus '' class.
        label_lists = df['categories'].astype(str).map(
            lambda raw: [part.strip() for part in raw.split(',') if part.strip()]
        )

        # Convert the label lists to a binary indicator matrix
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(label_lists)

        return df['description'], y, mlb

class Preprocessor(TransformerMixin, BaseEstimator):
    """Text-to-TF-IDF transformer following the scikit-learn fit/transform API.

    Every input is first wrapped in a pandas Series and cast to ``str`` so
    that lists, arrays and Series are all handled the same way.
    """

    def __init__(self):
        tfidf_options = dict(
            stop_words='english',
            max_features=5000,
            min_df=2,
            max_df=0.95,
        )
        self.vectorizer = TfidfVectorizer(**tfidf_options)

    @staticmethod
    def _as_text(X):
        """Return ``X`` as a pandas Series of strings."""
        return pd.Series(X).astype(str)

    def fit(self, X, y=None):
        """Fit the underlying vectorizer on ``X``; ``y`` is ignored."""
        self.vectorizer.fit(self._as_text(X))
        return self

    def transform(self, X):
        """Return the vectorizer's TF-IDF output for ``X``."""
        return self.vectorizer.transform(self._as_text(X))

class ModelTrainer:
    """Fit every configured model and score it with every configured metric.

    Attributes are plain containers:
      * ``models``  - mapping of display name to estimator instance.
      * ``metrics`` - mapping of metric name to ``f(y_true=..., y_pred=...)``.
      * ``results`` - one dict per trained model, appended to on each call of
        ``train_and_evaluate``.
    """

    def __init__(self, models: Dict[str, Any], metrics: Dict[str, Any]):
        self.models = models
        self.metrics = metrics
        self.results = []

    @staticmethod
    def _scores_for_mae(y_true, probabilities):
        """Return the probability array that lines up with ``y_true``.

        Only a 1-D (single-label, binary) target is compared against the
        positive-class column. A 2-D indicator target keeps the full matrix,
        because a multi-label problem with exactly two labels also yields a
        two-column probability matrix and slicing it would break the shapes.
        """
        probabilities = np.asarray(probabilities)
        binary_single_label = (
            np.ndim(y_true) == 1
            and probabilities.ndim == 2
            and probabilities.shape[1] == 2
        )
        return probabilities[:, 1] if binary_single_label else probabilities

    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Train and evaluate all models with extended metrics.

        Each model is fitted on the training data and scored on the test
        data. A metric that raises is reported on stdout and stored as NaN so
        one failing metric does not abort the whole comparison. Results are
        appended to ``self.results`` as dicts with the keys ``'model'``, one
        key per metric, and ``'instance'`` (the fitted estimator).
        """
        # Probabilities are only needed for the 'mae' metric; skip the
        # predict_proba call entirely when that metric is not requested.
        wants_mae = 'mae' in self.metrics

        with tqdm(total=len(self.models), desc="Training models") as model_pbar:
            for model_name, model in self.models.items():
                model_pbar.set_description(f"Training {model_name}")

                # Training
                model.fit(X_train, y_train)

                # Predictions
                predictions = model.predict(X_test)

                # Calculate probabilities for MAE (if available)
                prob_predictions = None
                if wants_mae and hasattr(model, 'predict_proba'):
                    prob_predictions = model.predict_proba(X_test)

                # Calculate all metrics
                metrics_results = {}
                for metric_name, metric_func in self.metrics.items():
                    try:
                        if metric_name == 'mae' and prob_predictions is not None:
                            metrics_results[metric_name] = mean_absolute_error(
                                y_test,
                                self._scores_for_mae(y_test, prob_predictions),
                            )
                        else:
                            # Models without probabilities fall back to the
                            # configured metric on the hard predictions.
                            metrics_results[metric_name] = metric_func(
                                y_true=y_test, y_pred=predictions
                            )
                    except Exception as e:
                        print(f"Error calculating {metric_name}: {str(e)}")
                        metrics_results[metric_name] = np.nan

                self.results.append({
                    'model': model_name,
                    **metrics_results,
                    'instance': model
                })
                model_pbar.update(1)

class ExperimentRunner:
    """Run the split / vectorize / train / save pipeline described by a config.

    Config keys read by ``run``:
      * ``'test_size'``    - forwarded to ``train_test_split``.
      * ``'random_state'`` - optional split seed, defaults to 42.
      * ``'preprocessor'`` - zero-argument callable returning a transformer.
      * ``'models'`` / ``'metrics'`` - forwarded to ``ModelTrainer``.
      * ``'save_dir'``     - optional; when present, artifacts are saved and
        ``'mlb'`` must be present as well.
    """

    def __init__(self, config: Dict):
        self.config = config

    def run(self, X, y):
        """Run complete experiment pipeline.

        Returns:
            ``(results, X_test, y_test)`` where ``results`` is the trainer's
            result list and ``X_test`` is the *untransformed* test input, so
            it can be re-vectorized later by a reloaded preprocessor.

        Raises:
            TypeError: if ``config['preprocessor']`` is not callable.
            KeyError: if ``'save_dir'`` is configured without ``'mlb'``.
        """
        # Validate up front: both problems would otherwise only surface after
        # the split (or after every model has been trained).
        preprocessor_factory = self.config.get('preprocessor')
        if not callable(preprocessor_factory):
            raise TypeError(
                "config['preprocessor'] must be a callable returning a "
                f"transformer, got {preprocessor_factory!r}"
            )
        if 'save_dir' in self.config and 'mlb' not in self.config:
            raise KeyError('mlb')

        # Split data - keep original text data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.config['test_size'],
            random_state=self.config.get('random_state', 42),
        )

        # Preprocess data - fit on the training split only, then reuse the
        # fitted transformer for the test split
        preprocessor = preprocessor_factory()
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)

        # Initialize and run trainer with transformed data
        trainer = ModelTrainer(self.config['models'], self.config['metrics'])
        trainer.train_and_evaluate(X_train_transformed, X_test_transformed, y_train, y_test)

        # Save models and preprocessor
        if 'save_dir' in self.config:
            save_models(
                trainer.results,
                preprocessor,
                self.config['mlb'],
                self.config['save_dir']
            )

        # Return original text data for evaluation
        return trainer.results, X_test, y_test


# -------------------- Utility Functions --------------------
def save_models(results: List[Dict], preprocessor: Any, mlb: MultiLabelBinarizer, save_dir: str):
    """Persist the preprocessor, label binarizer and every trained model.

    All files written by one call share a single ``YYYYMMDD_HHMMSS`` suffix.
    Saving is best-effort: a failure is reported on stdout and not re-raised.
    """
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    def _target(stem):
        # Every artifact follows the "<save_dir>/<stem>_<timestamp>.pkl" layout.
        return f"{save_dir}/{stem}_{timestamp}.pkl"

    try:
        # Shared artifacts first, then one file per model
        joblib.dump(preprocessor, _target("preprocessor"))
        joblib.dump(mlb, _target("mlb"))

        for entry in results:
            # File stem is the display name, lower-cased with spaces replaced
            stem = entry['model'].replace(' ', '_').lower()
            joblib.dump(entry['instance'], _target(stem))

        print(f"\nAll resources saved with timestamp: {timestamp}")
    except Exception as e:
        print(f"Error saving models: {str(e)}")

def print_results(results: List[Dict], sort_by: str = 'subset_accuracy'):
    """Print a comparison table with one row per model.

    Args:
        results: Result dicts containing ``'model'`` plus any of the metric
            keys ``hamming_loss``, ``subset_accuracy``, ``micro_f1``,
            ``macro_f1`` and ``mae``.
        sort_by: Kept for backward compatibility only; rows are printed in
            the order they are given.

    Missing, non-numeric or NaN metric values are shown as ``N/A``.
    """
    metric_names = {
        'hamming_loss': 'Hamming Loss',
        'subset_accuracy': 'Accuracy',
        'micro_f1': 'Micro F1',
        'macro_f1': 'Macro F1',
        'mae': 'MAE'
    }
    # Error-style metrics are shown as raw decimals, the rest as percentages.
    decimal_metrics = {'hamming_loss', 'mae'}

    def _format_metric(metric, value):
        is_number = (
            isinstance(value, (int, float, np.number))
            and not isinstance(value, bool)
        )
        # NaN is what a failed or absent metric looks like; "nan%" in the
        # table would be misleading, so report it as not available.
        if not is_number or np.isnan(value):
            return "N/A"
        if metric in decimal_metrics:
            return f"{value:.4f}"
        return f"{value:.2%}"

    # Prepare table data
    headers = ["Model"] + list(metric_names.values())
    table_data = [
        [res['model']]
        + [_format_metric(metric, res.get(metric, np.nan)) for metric in metric_names]
        for res in results
    ]

    # Print formatted table
    print("\n" + "="*80)
    print("FINAL RESULTS".center(80))
    print("="*80)
    # Cells are already formatted strings; number parsing is disabled so
    # tabulate does not re-parse them and drop trailing zeros.
    print(tabulate(table_data, headers=headers, tablefmt="grid",
                   stralign="center", disable_numparse=True))

def load_resources(model_path: str, preprocessor_path: str, mlb_path: str) -> tuple:
    """Load saved model, preprocessor, and label binarizer.

    Args:
        model_path: Path of the pickled model.
        preprocessor_path: Path of the pickled preprocessor.
        mlb_path: Path of the pickled label binarizer.

    Returns:
        ``(model, preprocessor, mlb)`` in that order.

    Raises:
        ValueError: if any of the three files cannot be loaded; the original
            exception is attached as ``__cause__``.
    """
    # NOTE: joblib.load unpickles its input, which can execute arbitrary
    # code. Only pass paths to files produced by this project.
    try:
        return (
            joblib.load(model_path),
            joblib.load(preprocessor_path),
            joblib.load(mlb_path)
        )
    except Exception as e:
        raise ValueError(f"Error loading resources: {str(e)}") from e

def evaluate_loaded_model(model: Any, preprocessor: Any, mlb: MultiLabelBinarizer,
                         X_test: pd.Series, y_test: np.ndarray) -> None:
    """Evaluate and print metrics for a loaded model.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        preprocessor: Fitted transformer applied to ``X_test`` first.
        mlb: Not used by the evaluation; kept so the signature matches the
            other helpers that take the ``(model, preprocessor, mlb)`` trio.
        X_test: Raw test inputs.
        y_test: True labels for ``X_test``.
    """
    # Preprocess test data
    X_test_transformed = preprocessor.transform(X_test)

    # Get predictions
    predictions = model.predict(X_test_transformed)

    # Calculate metrics
    results = {
        'hamming_loss': hamming_loss(y_test, predictions),
        'subset_accuracy': accuracy_score(y_test, predictions),
        'micro_f1': f1_score(y_test, predictions, average='micro'),
        'macro_f1': f1_score(y_test, predictions, average='macro'),
    }

    # Handle MAE separately: it needs probabilities, which not every model has
    try:
        prob_predictions = np.asarray(model.predict_proba(X_test_transformed))
    except AttributeError:
        results['mae'] = np.nan
    else:
        # Only a 1-D binary target is compared against the positive-class
        # column. A two-label indicator matrix also has two probability
        # columns and must be compared against the full matrix.
        if (np.ndim(y_test) == 1
                and prob_predictions.ndim == 2
                and prob_predictions.shape[1] == 2):
            prob_predictions = prob_predictions[:, 1]
        results['mae'] = mean_absolute_error(y_test, prob_predictions)

    # Print results
    print_results([{'model': 'Loaded Model', **results}])

def predict_genre(model: Any, preprocessor: Any, mlb: MultiLabelBinarizer,
                 synopsis: str, threshold: float = 0.5) -> dict:
    """Predict the genre labels of one synopsis.

    When the model exposes ``predict_proba``, a label is selected if its
    probability is at least ``threshold``; otherwise the model's own
    ``predict`` output is used and no probabilities are reported.

    Returns:
        A dict with ``'prediction'`` (the decoded labels of the single input)
        and ``'probabilities'`` (class name -> probability, or ``None``).
    """
    features = preprocessor.transform([synopsis])

    # No probabilities available: decode the hard prediction and stop here
    if not hasattr(model, 'predict_proba'):
        decoded = mlb.inverse_transform(model.predict(features))
        return {'prediction': decoded[0], 'probabilities': None}

    # Threshold the per-class probabilities into a 0/1 indicator row
    scores = model.predict_proba(features)
    indicator = (scores >= threshold).astype(int)
    decoded = mlb.inverse_transform(indicator)

    return {
        'prediction': decoded[0],
        'probabilities': dict(zip(mlb.classes_, scores[0])),
    }

# -------------------- Configuration --------------------

def _micro_f1(y_true, y_pred):
    """F1 score using the 'micro' average."""
    return f1_score(y_true, y_pred, average='micro')


def _macro_f1(y_true, y_pred):
    """F1 score using the 'macro' average."""
    return f1_score(y_true, y_pred, average='macro')


# Experiment settings consumed by ExperimentRunner / ModelTrainer.
CONFIG = {
    'test_size': 0.2,
    'save_dir': "saved_models",
    # Placeholder; the main block assigns the Preprocessor class here.
    'preprocessor': None,
    # Display name -> estimator; each base learner is wrapped one-vs-rest.
    'models': {
        'Naive Bayes': OneVsRestClassifier(MultinomialNB()),
        'Logistic Reg': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
        'SVM': OneVsRestClassifier(SVC(kernel='linear', probability=True)),
    },
    # Metric name -> callable invoked as f(y_true=..., y_pred=...).
    'metrics': {
        'hamming_loss': hamming_loss,
        'subset_accuracy': accuracy_score,
        'micro_f1': _micro_f1,
        'macro_f1': _macro_f1,
        'mae': mean_absolute_error,
    },
}
# -------------------- Main Execution --------------------
if __name__ == "__main__":
    # Load the descriptions, the label matrix and the fitted label binarizer
    X, y, mlb = DataLoader.load_data("../data/books_dataset.csv")
    # Fill in the config entries that are only known at run time: the
    # preprocessor class (instantiated by the runner) and the binarizer
    # (saved next to the models).
    CONFIG['preprocessor'] = Preprocessor
    CONFIG['mlb'] = mlb
    
    # Run experiment: split, vectorize, train/evaluate and save artifacts
    runner = ExperimentRunner(CONFIG)
    results, X_test_raw, y_test = runner.run(X, y)  # X_test_raw is the untransformed test text
    
    # Show comparison table for all trained models
    print_results(results)
    
    # Save the raw test split so it can be re-vectorized and evaluated later
    joblib.dump((X_test_raw, y_test), "test_data.pkl")
    print("Training complete. Models and test data saved.")

Training SVM: 100%|██████████| 3/3 [01:28<00:00, 29.48s/it]         



All resources saved with timestamp: 20250126_222502

                                 FINAL RESULTS                                  
+--------------+----------------+------------+------------+------------+--------+
|    Model     |   Hamming Loss |  Accuracy  |  Micro F1  |  Macro F1  |    MAE |
| Naive Bayes  |         0.0017 |   57.37%   |   67.81%   |   3.69%    | 0.0025 |
+--------------+----------------+------------+------------+------------+--------+
| Logistic Reg |         0.0016 |   58.23%   |   69.27%   |   1.90%    | 0.0033 |
+--------------+----------------+------------+------------+------------+--------+
|     SVM      |         0.0008 |   80.40%   |   86.47%   |   33.11%   | 0.0012 |
+--------------+----------------+------------+------------+------------+--------+
Training complete. Models and test data saved.


In [38]:
# -------------------- Inference Cell --------------------
from sklearn.preprocessing import MultiLabelBinarizer
from tabulate import tabulate

def load_resources(timestamp: str = "latest", save_dir: str = "saved_models",
                   model_name: str = "") -> tuple:
    """Load saved model, preprocessor, and label binarizer.

    Note: this redefines the path-based ``load_resources`` from the training
    cell with a different signature.

    Args:
        timestamp: ``YYYYMMDD_HHMMSS`` suffix of the run to load, or
            ``"latest"`` to use the newest ``preprocessor_*.pkl`` in
            ``save_dir``.
        save_dir: Directory containing the saved artifacts.
        model_name: Display name of the model to load (for example
            ``"Logistic Reg"``). It is normalised the same way as when the
            file was written (spaces -> underscores, lower-cased). When empty,
            the alphabetically first model file of the run is loaded.

    Returns:
        ``(model, preprocessor, mlb)``.

    Raises:
        ValueError: if the directory, the run or the model cannot be found or
            loaded; the underlying exception is attached as ``__cause__``.
    """
    try:
        # Sorted once so the choice of "first matching model" is deterministic
        # rather than dependent on directory listing order.
        saved_files = sorted(os.listdir(save_dir))

        if timestamp == "latest":
            prefix, suffix = "preprocessor_", ".pkl"
            stamps = [
                name[len(prefix):-len(suffix)]
                for name in saved_files
                if name.startswith(prefix) and name.endswith(suffix)
            ]

            if not stamps:
                raise ValueError("No saved models found")

            # YYYYMMDD_HHMMSS stamps sort chronologically as plain text
            timestamp = max(stamps)

        # Everything ending in this timestamp that is not one of the two
        # shared artifacts is a model file (there might be multiple models)
        model_files = [
            name for name in saved_files
            if name.endswith(f"{timestamp}.pkl")
            and not name.startswith(('preprocessor_', 'mlb_'))
        ]

        if model_name:
            wanted = f"{model_name.replace(' ', '_').lower()}_{timestamp}.pkl"
            model_files = [name for name in model_files if name == wanted]

        if not model_files:
            raise ValueError(f"No model found for timestamp {timestamp}")

        # NOTE: joblib.load unpickles its input, which can execute arbitrary
        # code. Only point this at directories written by this project.
        return (
            joblib.load(os.path.join(save_dir, model_files[0])),
            joblib.load(os.path.join(save_dir, f"preprocessor_{timestamp}.pkl")),
            joblib.load(os.path.join(save_dir, f"mlb_{timestamp}.pkl"))
        )
    except Exception as e:
        raise ValueError(f"Error loading resources: {str(e)}") from e

def print_evaluation(results: dict):
    """Pretty print evaluation metrics as a two-column table.

    Args:
        results: Mapping that may contain ``hamming_loss``,
            ``subset_accuracy``, ``micro_f1``, ``macro_f1`` and ``mae``.
            Numeric values are shown with four decimals; a missing metric is
            shown as ``N/A`` and any other value is printed unchanged.
    """
    headings = {
        'hamming_loss': 'Hamming Loss',
        'subset_accuracy': 'Accuracy',
        'micro_f1': 'Micro F1',
        'macro_f1': 'Macro F1',
        'mae': 'MAE',
    }

    table = []
    for key, heading in headings.items():
        value = results.get(key, "N/A")
        is_number = (
            isinstance(value, (int, float, np.number))
            and not isinstance(value, bool)
        )
        table.append([heading, f"{value:.4f}" if is_number else value])

    # Number parsing is disabled so tabulate prints the formatted strings
    # verbatim; otherwise "0.0190" is re-parsed and shown as "0.019".
    print(tabulate(table, headers=["Metric", "Value"], tablefmt="grid",
                   disable_numparse=True, colalign=("left", "right")))

def evaluate_model(model, preprocessor, mlb):
    """Score ``model`` on the test split stored in ``test_data.pkl``.

    The stored raw inputs are passed through ``preprocessor`` before
    prediction, and the metrics are printed as a table. ``mlb`` is accepted
    but not used. If a ``FileNotFoundError`` is raised, a hint to run the
    training cell is printed instead.
    """
    try:
        # The pickle holds the (raw inputs, true labels) pair from training
        raw_texts, true_labels = joblib.load("test_data.pkl")

        features = preprocessor.transform(raw_texts)
        predicted = model.predict(features)

        results = dict(
            hamming_loss=hamming_loss(true_labels, predicted),
            subset_accuracy=accuracy_score(true_labels, predicted),
            micro_f1=f1_score(true_labels, predicted, average='micro'),
            macro_f1=f1_score(true_labels, predicted, average='macro'),
        )

        # MAE needs probabilities; models without predict_proba get "N/A"
        try:
            results['mae'] = mean_absolute_error(
                true_labels, model.predict_proba(features)
            )
        except AttributeError:
            results['mae'] = "N/A"

        print("Model Evaluation Results:")
        print_evaluation(results)

    except FileNotFoundError:
        print("Test data not found. Run training cell first.")

def predict_genre(model, preprocessor, mlb, text: str, threshold: float = 0.2):
    """Print the predicted genres (and class probabilities) for ``text``.

    With a ``predict_proba``-capable model, every class whose probability is
    at least ``threshold`` is selected and all class probabilities are listed
    in descending order. Otherwise the model's ``predict`` output is decoded
    and no probabilities are printed. Nothing is returned.
    """
    features = preprocessor.transform([text])

    prob_dict = None
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(features)
        indicator = (scores >= threshold).astype(int)
        prob_dict = dict(zip(mlb.classes_, scores[0]))
    else:
        indicator = model.predict(features)

    decoded = mlb.inverse_transform(indicator)
    genre_text = ', '.join(decoded[0]) or 'None'

    print("\nPrediction Results:")
    print(f"Input text: {text[:100]}...")
    print(f"Predicted genres: {genre_text}")

    if not prob_dict:
        return

    print("\nClass Probabilities:")
    ranked = sorted(prob_dict.items(), key=lambda pair: pair[1], reverse=True)
    for genre, prob in ranked:
        print(f"{genre:<25} {prob:.2%}")

if __name__ == "__main__":
    # Load latest model together with its matching preprocessor and binarizer
    # (with several models saved for one run, only the first match is loaded)
    model, preprocessor, mlb = load_resources()
    
    # Evaluate on the test split saved by the training cell (test_data.pkl)
    evaluate_model(model, preprocessor, mlb)
    
    # Make prediction for a single hand-written synopsis
    sample_text = "A space opera featuring interstellar politics, alien species, and epic battles between galactic empires"
    predict_genre(model, preprocessor, mlb, sample_text)

Model Evaluation Results:
+--------------+---------+
| Metric       |   Value |
| Hamming Loss |  0.0016 |
+--------------+---------+
| Accuracy     |  0.5823 |
+--------------+---------+
| Micro F1     |  0.6927 |
+--------------+---------+
| Macro F1     |  0.019  |
+--------------+---------+
| MAE          |  0.0033 |
+--------------+---------+


AttributeError: 'list' object has no attribute 'astype'