In [1]:
# Log in to the Hugging Face Hub from within the notebook
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [8]:
import torch
from transformers import (
    BlenderbotTokenizer, 
    BlenderbotForConditionalGeneration,
    Trainer, 
    TrainingArguments,
    BitsAndBytesConfig 
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)
from datasets import load_dataset
import logging
from pathlib import Path
from huggingface_hub import notebook_login, HfApi
import torch
from torch.utils.tensorboard import SummaryWriter
import bitsandbytes as bnb
from datetime import datetime
torch.cuda.empty_cache()  # Ask PyTorch to release unused cached GPU memory before any model is loaded

# Configure root logging once for the whole notebook: INFO level, with
# timestamp, logger name and severity prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every function defined below.
logger = logging.getLogger(__name__)



def prepare_dataset(data_dir: str):
    """Load the train/validation JSON splits stored under ``data_dir``.

    Args:
        data_dir: Directory expected to contain ``train.json`` and ``val.json``.

    Returns:
        Whatever ``load_dataset`` returns for the two named splits
        (``'train'`` and ``'validation'``).
    """
    # Map each split name to the JSON file backing it.
    split_files = {
        'train': f'{data_dir}/train.json',
        'validation': f'{data_dir}/val.json',
    }
    return load_dataset('json', data_files=split_files)

def prepare_qlora_model(model_name: str):
    """Load a Blenderbot checkpoint in 4-bit and wrap it with LoRA adapters.

    The base model is quantized with bitsandbytes (NF4, double quantization,
    fp16 compute), prepared for k-bit training with gradient checkpointing,
    and then wrapped by PEFT so that only the LoRA matrices are trainable.

    Args:
        model_name: Hub id or local path of the Blenderbot checkpoint.

    Returns:
        The PEFT-wrapped model, ready to be handed to a ``Trainer``.

    Raises:
        Exception: Any error raised while loading or wrapping the model is
            logged and re-raised unchanged.
    """
    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            # NOTE(review): the library default storage dtype is uint8; float32
            # was chosen by the original author for stability -- confirm it is
            # still wanted.
            bnb_4bit_quant_storage=torch.float32
        )

        # The bitsandbytes integration places the quantized weights on the
        # GPU during loading. Do NOT call ``model.to(device)`` afterwards:
        # moving a 4-bit model is rejected by many transformers/bitsandbytes
        # versions and is redundant on the ones that allow it.
        model = BlenderbotForConditionalGeneration.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            torch_dtype=torch.float16,
            use_cache=False  # KV cache is useless (and incompatible with checkpointing) in training
        )

        # Casts norm layers for stability, enables input grads and turns on
        # gradient checkpointing -- no separate gradient_checkpointing_enable()
        # call is needed.
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=True
        )

        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "k_proj"],
            lora_dropout=0.1,
            bias="none",
            # Blenderbot is an encoder-decoder model, so it must be wrapped as
            # a seq2seq LM rather than a causal LM.
            task_type=TaskType.SEQ_2_SEQ_LM
        )

        model = get_peft_model(model, lora_config)

        # Report how many parameters will actually receive gradients.
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        logger.info(f"Trainable parameters: {trainable_params:,}")

        return model

    except Exception as e:
        logger.error(f"Error in model preparation: {e}")
        raise

def setup_tensorboard(output_dir: str):
    """Create a TensorBoard ``SummaryWriter`` for a new, timestamped run.

    Args:
        output_dir: Base directory; the writer logs to
            ``<output_dir>/runs/<timestamp>``.

    Returns:
        The ``SummaryWriter`` bound to that run directory.
    """
    # Timestamp such as "Dec25_15-06-09" keeps successive runs apart.
    run_stamp = datetime.now().strftime('%b%d_%H-%M-%S')
    run_dir = f'{output_dir}/runs/{run_stamp}'
    return SummaryWriter(run_dir)

def train_model(
    model_name: str = "facebook/blenderbot-400M-distill",
    data_dir: str = "/home/zahemen/projects/dl-lib/DocAnalyzerAI/finetune_data",
    output_dir: str = "../models/financial_chatbot",
    num_epochs: int = 3,
    batch_size: int = 1,  # Reduced batch size
    gradient_accumulation_steps: int = 8,  # Increased gradient accumulation
    push_to_hub: bool = True,
    hub_model_id: str = None,
    max_seq_length: int = None
):
    """Fine-tune BlenderBot with QLoRA and memory-saving settings.

    Args:
        model_name: Hub id or path of the Blenderbot checkpoint to fine-tune.
        data_dir: Directory holding ``train.json`` and ``val.json``.
        output_dir: Where checkpoints, logs and the final model are written.
        num_epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        gradient_accumulation_steps: Micro-batches accumulated per optimizer step.
        push_to_hub: Whether to log in and push the result to the Hub.
        hub_model_id: Target Hub repository; the explicit final upload only
            happens when this is set.
        max_seq_length: Token length that inputs and labels are padded or
            truncated to. Defaults to the smallest of 512, the tokenizer limit
            and the model's ``max_position_embeddings``.

    Raises:
        ValueError: If ``max_seq_length`` exceeds the model's position table.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    writer = None
    try:
        # Clear CUDA cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Initialize tensorboard
        writer = setup_tensorboard(output_dir)

        # Load tokenizer and model
        tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
        model = prepare_qlora_model(model_name)

        # Sequences longer than the model's positional-embedding table index
        # out of bounds on the GPU, which surfaces as an opaque
        # "device-side assert triggered" error. Cap the length up front.
        position_limit = getattr(model.config, "max_position_embeddings", 512)
        if max_seq_length is None:
            max_seq_length = min(512, position_limit, tokenizer.model_max_length)
        elif max_seq_length > position_limit:
            raise ValueError(
                f"max_seq_length={max_seq_length} exceeds the model's "
                f"max_position_embeddings={position_limit}"
            )
        logger.info(f"Using max sequence length: {max_seq_length}")

        # Load dataset
        dataset = prepare_dataset(data_dir)

        def _first_text(value):
            """Return the first item of a list field (or '' if empty), else the value itself."""
            if isinstance(value, list):
                return value[0] if value else ""
            return value

        def preprocess_function(examples):
            """Tokenize a batch into encoder inputs and loss-masked labels."""
            # Combine context and message with the separator used by the dataset.
            inputs = [
                f"{ctx} <sep> {_first_text(msg)}"
                for ctx, msg in zip(examples['additional_context'], examples['free_messages'])
            ]
            targets = [_first_text(msg) for msg in examples['guided_messages']]

            model_inputs = tokenizer(
                inputs,
                max_length=max_seq_length,
                padding='max_length',
                truncation=True
            )

            # ``text_target`` replaces the deprecated as_target_tokenizer() context.
            labels = tokenizer(
                text_target=targets,
                max_length=max_seq_length,
                padding='max_length',
                truncation=True
            )

            # Mask padding with -100 so the loss ignores padded positions.
            pad_id = tokenizer.pad_token_id
            model_inputs['labels'] = [
                [token if token != pad_id else -100 for token in sequence]
                for sequence in labels['input_ids']
            ]

            # No device transfer here: the tokenizer returns plain lists and
            # the Trainer moves each collated batch to the model's device.
            return model_inputs

        # Process datasets with new preprocessing function
        processed_datasets = dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=dataset['train'].column_names,
            desc="Processing dataset"
        )

        # Modified training arguments for stability
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            evaluation_strategy="steps",
            eval_steps=100,  # More frequent evaluation
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            logging_dir=f"{output_dir}/logs",
            logging_steps=10,
            report_to=["tensorboard"],
            push_to_hub=push_to_hub,
            hub_model_id=hub_model_id,
            # Added stability settings
            fp16=True,
            optim="adamw_torch",
            learning_rate=2e-5,
            warmup_ratio=0.1,
            max_grad_norm=0.3,
            gradient_checkpointing=True
        )

        class CustomTrainer(Trainer):
            def log(self, logs: dict, *args, **kwargs) -> None:
                """Mirror numeric log entries to the notebook's own SummaryWriter."""
                # Forward extra arguments untouched: newer Trainer versions
                # pass a ``start_time`` argument to log().
                super().log(logs, *args, **kwargs)
                if self.state.global_step % self.args.logging_steps == 0:
                    for key, value in logs.items():
                        # add_scalar only accepts numbers; skip anything else.
                        if isinstance(value, (int, float)):
                            writer.add_scalar(key, value, self.state.global_step)

        # Authenticate before the Trainer is built, because constructing it
        # with push_to_hub=True may already contact the Hub.
        if push_to_hub:
            notebook_login()

        # Initialize trainer
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=processed_datasets['train'],
            eval_dataset=processed_datasets['validation'],
        )

        # Train model
        logger.info("Starting training...")
        trainer.train()

        # Save final model and tokenizer
        trainer.save_model()
        tokenizer.save_pretrained(output_dir)

        # Push to HuggingFace Hub if specified
        if push_to_hub and hub_model_id:
            logger.info(f"Pushing model to HuggingFace Hub: {hub_model_id}")
            api = HfApi()
            api.upload_folder(
                folder_path=output_dir,
                repo_id=hub_model_id,
                repo_type="model"
            )

        logger.info(f"Model saved to {output_dir}")

    except Exception as e:
        logger.error(f"Training error: {e}")
        raise
    finally:
        # Always flush and release the TensorBoard writer, even on failure.
        if writer is not None:
            writer.close()



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [11]:
def main():
    """Report the available compute device, then launch fine-tuning.

    Logs the GPU name and total memory when CUDA is available (a warning
    otherwise) and calls ``train_model`` with this notebook's run settings.
    Any exception is logged and re-raised.
    """
    try:
        # Device report
        if not torch.cuda.is_available():
            logger.warning("No GPU available, using CPU")
        else:
            logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
            total_bytes = torch.cuda.get_device_properties(0).total_memory
            logger.info(f"Available memory: {total_bytes / 1e9:.2f} GB")

        # Run settings for this experiment, gathered in one place.
        run_settings = dict(
            push_to_hub=True,
            hub_model_id="zahemen9900/blenderbot-400m-financial",
            num_epochs=5,
            batch_size=1,  # Start with batch size 1
            gradient_accumulation_steps=8,
        )
        train_model(**run_settings)
    except Exception as err:
        logger.error(f"Training failed: {err}")
        raise

if __name__ == "__main__":
    import os

    # Debugging aid: request synchronous CUDA kernel launches so errors are
    # reported at the call that caused them.
    # NOTE(review): presumably only effective if set before CUDA is first
    # initialized in this kernel -- confirm, and restart the kernel if needed.
    os.environ.update(CUDA_LAUNCH_BLOCKING="1")
    main()


2024-12-25 15:06:09,752 - __main__ - INFO - Using GPU: NVIDIA GeForce RTX 3050 Laptop GPU
2024-12-25 15:06:09,753 - __main__ - INFO - Available memory: 4.29 GB
2024-12-25 15:06:09,755 - __main__ - ERROR - Training error: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

2024-12-25 15:06:09,756 - __main__ - ERROR - Training failed: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
