# Model Training

This notebook handles model training using the project's pipeline system.
It includes:
- Loading preprocessed data
- Model training with the training pipeline
- Training visualization
- Model checkpointing

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from datetime import datetime

# Add project root to path
sys.path.append('..')

# Import project modules
from src.pipeline.train_pipeline import TrainPipeline
from src.logger import logger
from src.exception import CustomException

# Record the start of this notebook run in the project log.
# NOTE(review): no logging configuration happens here; it is presumably done
# inside src.logger when `logger` is imported — confirm.
logger.info("Starting model training notebook")

## 1. Load Preprocessed Data

In [None]:
def load_preprocessed_data(data_dir='artifacts'):
    """
    Read the four preprocessed NumPy arrays from disk.

    The files X_train.npy, y_train.npy, X_test.npy and y_test.npy are loaded,
    in that order, from directly inside ``data_dir``.

    Args:
        data_dir (str): Directory containing preprocessed data files

    Returns:
        tuple: X_train, y_train, X_test, y_test

    Raises:
        CustomException: If loading any file (or logging the success
            message) fails
    """
    # Order matters: it defines the order of the returned tuple
    filenames = ('X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy')

    try:
        arrays = tuple(
            np.load(os.path.join(data_dir, filename)) for filename in filenames
        )
        logger.info("Preprocessed data loaded successfully")
        return arrays

    except Exception as error:
        logger.error(f"Failed to load preprocessed data: {error}")
        raise CustomException(error, sys)

# Run the loader and bind the four arrays to notebook-level names
X_train, y_train, X_test, y_test = load_preprocessed_data()

# Report each array's shape, one entry per line
print(
    "Data shapes:",
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"X_test: {X_test.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

## 2. Initialize Training Pipeline

In [None]:
def configure_gpu_memory():
    """
    Turn on memory growth for every GPU that TensorFlow lists.

    Returns without doing anything when no GPU is listed. A RuntimeError
    raised while applying the setting is printed and logged rather than
    re-raised, so the notebook keeps running.
    """
    detected = tf.config.list_physical_devices('GPU')
    if not detected:
        return

    gpu_count = len(detected)
    try:
        for device in detected:
            tf.config.experimental.set_memory_growth(device, True)
        print(f"Found {gpu_count} GPU(s), memory growth enabled")
        logger.info(f"GPU configuration completed: {gpu_count} GPUs detected")
    except RuntimeError as e:
        print(f"Memory growth setting failed: {str(e)}")
        logger.error(f"GPU memory configuration failed: {e}")

def create_training_run_dir(base_dir='artifacts'):
    """
    Create a timestamped directory for this training run

    The directory is named ``training_run_<YYYYmmdd_HHMMSS>`` using the
    current local time and is created under ``base_dir`` (missing parent
    directories are created as well).

    Args:
        base_dir (str): Parent directory for the run directory. Defaults to
            'artifacts', the location that was previously hard-coded.

    Returns:
        str: Path to training run directory
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_dir = os.path.join(base_dir, f'training_run_{timestamp}')
    # exist_ok=True: the timestamp has one-second resolution, so a second
    # call within the same second reuses the directory instead of failing.
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

# Apply the GPU memory-growth setting before the pipeline is built
configure_gpu_memory()

# Fresh timestamped directory that the later cells write their outputs to
training_run_dir = create_training_run_dir()

# Build the pipeline with its log_dir pointed at the run directory
trainer = TrainPipeline(
    log_dir=training_run_dir,
    batch_size=4,  # Adjust based on GPU memory
)

print(f"Training run directory: {training_run_dir}")

## 3. Train Model

In [None]:
def train_model(trainer, X_train, y_train, X_test, y_test):
    """
    Run training through the pipeline and hand back its results.

    Thin wrapper around ``trainer.initiate_training`` that logs the start
    and end of training and wraps any failure in CustomException.

    Args:
        trainer (TrainPipeline): Training pipeline instance
        X_train, y_train: Training data
        X_test, y_test: Test data

    Returns:
        tuple: Model metrics and training history

    Raises:
        CustomException: If the pipeline raises, or its result cannot be
            unpacked into exactly two values
    """
    try:
        logger.info("Starting model training")

        # The pipeline is expected to return a (metrics, history) pair
        results = trainer.initiate_training(X_train, y_train, X_test, y_test)
        metrics, history = results

        logger.info("Model training completed successfully")
        return metrics, history

    except Exception as error:
        logger.error(f"Model training failed: {error}")
        raise CustomException(error, sys)

# Kick off training; metrics and history are reused by the cells below
metrics, history = train_model(trainer, X_train, y_train, X_test, y_test)

# Headers first, then one line per metric formatted to four decimals
print("\nTraining completed!", "\nTest Set Metrics:", sep="\n")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")

## 4. Training Visualization

In [None]:
def plot_training_history(history, output_dir):
    """
    Visualize training and validation metrics

    Draws three side-by-side panels (loss, MAE, RMSE), each showing the
    training curve and the validation curve, and saves the figure as
    'training_metrics_plot.png' inside ``output_dir``. The figure is always
    closed afterwards, including when plotting or saving fails.

    Args:
        history (tf.keras.callbacks.History): Training history. Its
            ``history`` dict must contain the keys 'loss', 'mae', 'rmse'
            and the matching 'val_loss', 'val_mae', 'val_rmse'.
        output_dir (str): Directory to save plot

    Raises:
        CustomException: If plotting or saving fails, for example when one
            of the required keys is missing from ``history.history``
    """
    # (history key, axis/legend label, panel title), left to right
    panels = (
        ('loss', 'Loss', 'Model Loss'),
        ('mae', 'MAE', 'Mean Absolute Error'),
        ('rmse', 'RMSE', 'Root Mean Square Error'),
    )

    try:
        fig = plt.figure(figsize=(15, 5))
        try:
            for position, (key, label, title) in enumerate(panels, start=1):
                plt.subplot(1, len(panels), position)
                plt.plot(history.history[key], label=f'Training {label}')
                plt.plot(
                    history.history[f'val_{key}'],
                    label=f'Validation {label}',
                )
                plt.title(title)
                plt.ylabel(label)
                plt.xlabel('Epoch')
                plt.legend()

            plt.tight_layout()

            # Save plot
            plot_path = os.path.join(output_dir, 'training_metrics_plot.png')
            plt.savefig(plot_path)
        finally:
            # Close the figure on failure too; otherwise a failed call
            # leaves an open figure behind in the notebook session.
            plt.close(fig)

        logger.info(f"Training history plot saved to {plot_path}")

    except Exception as e:
        logger.error(f"Failed to plot training history: {e}")
        raise CustomException(e, sys)

# Plot the training history; the figure is saved into this run's directory
plot_training_history(history, training_run_dir)

## 5. Save Training Results

In [ ]:
def save_training_results(metrics, history, output_dir):
    """
    Write the evaluation metrics and the training history as CSV files.

    Produces 'training_metrics.csv' (a single row built from ``metrics``)
    and 'training_history.csv' (built from ``history.history``) inside
    ``output_dir``, in that order, both without an index column.

    Args:
        metrics (dict): Model evaluation metrics
        history (tf.keras.callbacks.History): Training history
        output_dir (str): Directory to save results

    Raises:
        CustomException: If building or writing either file fails
    """
    def _write_csv(frame, filename):
        # Write one frame into output_dir without the index column
        frame.to_csv(os.path.join(output_dir, filename), index=False)

    try:
        # Metrics become a one-row table (index=[0])
        _write_csv(pd.DataFrame(metrics, index=[0]), 'training_metrics.csv')

        # History is built only after the metrics file has been written
        _write_csv(pd.DataFrame(history.history), 'training_history.csv')

        logger.info(f"Training results saved in {output_dir}")
        print(f"Training results saved in {output_dir}")

    except Exception as error:
        logger.error(f"Failed to save training results: {error}")
        raise CustomException(error, sys)

# Save the metrics and training history as CSV files in this run's directory
save_training_results(metrics, history, training_run_dir)