<a href="https://colab.research.google.com/github/vishnuy/AI-ML/blob/main/TrainMistralSentiAnalysisSmall4bit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers datasets
%pip install peft transformers trl accelerate
!pip install -U bitsandbytes



In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import os

class MistralFineTuner:
    """LoRA fine-tuning and inference helper for a causal language model.

    Typical flow: ``setup_model_and_tokenizer()`` -> ``prepare_dataset()`` ->
    ``train()`` -> ``inference()``.
    """

    def __init__(self, model_name="mistralai/Mistral-7B-v0.1", max_length=256, use_4bit=True):
        """
        Initialize the fine-tuner with Mistral model

        Args:
            model_name: Hugging Face model identifier
            max_length: Maximum sequence length for tokenization (reduced for memory)
            use_4bit: Use 4-bit quantization to reduce memory usage
        """
        self.model_name = model_name
        self.max_length = max_length
        self.use_4bit = use_4bit
        # Populated by setup_model_and_tokenizer().
        self.tokenizer = None
        self.model = None
        self.peft_model = None

    def setup_model_and_tokenizer(self):
        """Load the tokenizer and base model, then wrap the model with LoRA adapters.

        On a CUDA machine the model is loaded 4-bit quantized (``use_4bit=True``)
        or in float16; without CUDA it is loaded in float32. The results are
        stored on ``self.tokenizer``, ``self.model`` and ``self.peft_model``.
        """
        print(f"Loading {self.model_name} with memory optimizations...")

        # Load tokenizer; fall back to EOS as the padding token when none is defined.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Configure model loading based on available hardware
        model_kwargs = {
            # NOTE(review): this lets model repositories execute their own Python
            # code at load time; only use model names you trust.
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,  # Reduce CPU memory usage during loading
        }

        quantized = False
        if torch.cuda.is_available():
            if self.use_4bit:
                # 4-bit quantization for maximum memory savings
                from transformers import BitsAndBytesConfig
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16
                )
                model_kwargs.update({
                    "quantization_config": bnb_config,
                    "device_map": "auto"
                })
                quantized = True
                print("Using 4-bit quantization for memory efficiency")
            else:
                model_kwargs.update({
                    "torch_dtype": torch.float16,
                    "device_map": "auto"
                })
        else:
            model_kwargs.update({
                "torch_dtype": torch.float32,
                "device_map": None
            })

        # Load model with memory-efficient settings
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            **model_kwargs
        )

        if quantized:
            # Quantized base models need PEFT's k-bit preparation before adapters
            # are attached; it also turns on gradient checkpointing for us.
            from peft import prepare_model_for_kbit_training
            self.model = prepare_model_for_kbit_training(
                self.model,
                use_gradient_checkpointing=True,
            )
        else:
            # Enable gradient checkpointing to trade compute for memory
            self.model.gradient_checkpointing_enable()
            # With frozen base weights the checkpointed segments only receive
            # gradients if the embedding outputs require grad.
            self.model.enable_input_require_grads()

        # Setup LoRA configuration for efficient fine-tuning
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=8,  # Reduced rank for less memory usage
            lora_alpha=16,  # Proportionally reduced
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            bias="none",
        )

        # Apply LoRA to model
        self.peft_model = get_peft_model(self.model, lora_config)
        print("\nTrainable parameters:")
        self.peft_model.print_trainable_parameters()

        # Clear cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def create_sample_dataset(self):
        """Create a sample dataset for demonstration (sentiment analysis)

        Returns:
            A pandas DataFrame with ``text`` and ``label`` columns holding 15
            hand-written reviews, five for each of positive/negative/neutral.
        """
        data = [
            {"text": "I love this product! It's amazing and works perfectly.", "label": "positive"},
            {"text": "This is the worst purchase I've ever made. Completely disappointed.", "label": "negative"},
            {"text": "The item is okay, nothing special but does the job.", "label": "neutral"},
            {"text": "Fantastic quality! Highly recommend to everyone.", "label": "positive"},
            {"text": "Terrible customer service and poor product quality.", "label": "negative"},
            {"text": "It's an average product, meets basic expectations.", "label": "neutral"},
            {"text": "Outstanding! Exceeded all my expectations.", "label": "positive"},
            {"text": "Waste of money. Doesn't work as advertised.", "label": "negative"},
            {"text": "Decent product for the price point.", "label": "neutral"},
            {"text": "Absolutely perfect! Will buy again.", "label": "positive"},
            {"text": "Poor build quality and arrived damaged.", "label": "negative"},
            {"text": "It's fine, does what it's supposed to do.", "label": "neutral"},
            {"text": "Incredible value for money! Love it!", "label": "positive"},
            {"text": "Horrible experience, would not recommend.", "label": "negative"},
            {"text": "Standard quality, no complaints but nothing exciting.", "label": "neutral"},
        ]

        return pd.DataFrame(data)

    def format_prompt(self, text, label=None):
        """Format input text into a prompt for the model

        Args:
            text: The text whose sentiment is to be analysed.
            label: The target sentiment. When given (and non-empty) it is
                appended after ``Sentiment:`` to form a training example;
                otherwise the prompt ends at ``Sentiment:`` for the model
                to complete.

        Returns:
            The prompt string.
        """
        if label:
            # Training format
            return f"Analyze the sentiment of this text: {text}\nSentiment: {label}"
        else:
            # Inference format
            return f"Analyze the sentiment of this text: {text}\nSentiment:"

    def _tokenize_batch(self, examples):
        """Tokenize a batch of prompts and attach causal-LM labels.

        Args:
            examples: Mapping with a ``"text"`` entry holding a list of prompts.

        Returns:
            The tokenizer output with an added ``"labels"`` entry that is an
            independent copy of ``"input_ids"``.
        """
        # No return_tensors here: the values stay plain Python lists, which is
        # what Dataset.map stores and what makes the copy below well defined
        # (torch tensors have no .copy() method).
        tokenized = self.tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
        return tokenized

    def prepare_dataset(self, df):
        """Prepare dataset for training

        Args:
            df: DataFrame with ``text`` and ``label`` columns.

        Returns:
            A tokenized ``datasets.Dataset`` in which the original text column
            has been replaced by the tokenizer outputs plus ``labels``.
        """
        # Format prompts
        formatted_data = [
            {"text": self.format_prompt(row['text'], row['label'])}
            for _, row in df.iterrows()
        ]

        # Create HuggingFace dataset
        dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

        # Tokenize dataset, dropping the raw text column afterwards
        return dataset.map(
            self._tokenize_batch,
            batched=True,
            remove_columns=dataset.column_names
        )

    @staticmethod
    def _capped_warmup_steps(num_examples, batch_size, grad_accum_steps, num_epochs, cap=50):
        """Return a warm-up length of at most ``cap`` steps and ~10% of the run.

        A fixed warm-up longer than the whole run would keep the learning rate
        close to zero for every optimizer step, which is what happens with the
        tiny demonstration dataset.
        """
        import math

        steps_per_epoch = max(1, math.ceil(num_examples / (batch_size * grad_accum_steps)))
        total_steps = steps_per_epoch * num_epochs
        return min(cap, total_steps // 10)

    def train(self, train_dataset, eval_dataset=None, output_dir="./mistral-finetuned"):
        """Fine-tune the model with memory-efficient settings

        Args:
            train_dataset: Tokenized dataset produced by ``prepare_dataset``.
            eval_dataset: Optional tokenized dataset evaluated once per epoch.
            output_dir: Directory for checkpoints, the final adapter and tokenizer.

        Raises:
            RuntimeError: Re-raised from the trainer; for out-of-memory errors a
                list of suggestions is printed first.
        """
        import inspect

        # Check device capabilities
        device_has_gpu = torch.cuda.is_available()

        num_epochs = 2
        batch_size = 1
        grad_accum_steps = 8

        # Scale warm-up to the run length; keep the original 50 when the
        # dataset has no length (e.g. a streaming dataset).
        try:
            num_examples = len(train_dataset)
        except TypeError:
            warmup_steps = 50
        else:
            warmup_steps = self._capped_warmup_steps(
                num_examples, batch_size, grad_accum_steps, num_epochs
            )

        # Newer transformers releases call this argument ``eval_strategy``;
        # older ones only know ``evaluation_strategy``. Use whichever exists.
        accepted_args = inspect.signature(TrainingArguments.__init__).parameters
        eval_key = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"
        has_eval = eval_dataset is not None and len(eval_dataset) > 0

        # Memory-optimized training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,  # Reduced epochs for faster completion
            per_device_train_batch_size=batch_size,  # Keep batch size at 1
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=grad_accum_steps,  # Simulate a larger batch
            warmup_steps=warmup_steps,
            learning_rate=2e-4,  # Slightly higher LR for fewer epochs
            fp16=device_has_gpu,  # Only use fp16 if GPU is available
            logging_steps=5,
            save_strategy="epoch",
            load_best_model_at_end=False,  # Disable to save memory
            report_to="none",  # The string "none" is what disables wandb/tensorboard
            dataloader_pin_memory=device_has_gpu,  # Only pin memory if GPU available
            dataloader_num_workers=0,  # Disable multiprocessing to save memory
            remove_unused_columns=False,
            gradient_checkpointing=True,  # Enable gradient checkpointing
            optim="adamw_torch",
            max_grad_norm=1.0,  # Gradient clipping
            save_total_limit=1,  # Only keep latest checkpoint
            **{eval_key: "epoch" if has_eval else "no"},
        )

        # Causal-LM collator (mlm=False)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
            pad_to_multiple_of=8,  # Efficient padding
        )

        # Create trainer with memory optimizations
        trainer = Trainer(
            model=self.peft_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        # Clear cache before training
        if device_has_gpu:
            torch.cuda.empty_cache()

        # Start training with error handling
        print("Starting memory-efficient training...")
        try:
            trainer.train()
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print("\n❌ OUT OF MEMORY ERROR!")
                print("Try these solutions:")
                print("1. Reduce max_length further (current: {})".format(self.max_length))
                print("2. Use a smaller model (e.g., 'microsoft/DialoGPT-small')")
                print("3. Enable 4-bit quantization if not already used")
                print("4. Use Google Colab with GPU")
            # Bare raise keeps the original traceback intact.
            raise

        # Save the fine-tuned model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        print(f"Model saved to {output_dir}")

        # Clear cache after training
        if device_has_gpu:
            torch.cuda.empty_cache()

    def inference(self, text, max_new_tokens=50):
        """Run inference on the fine-tuned model

        Args:
            text: The text to classify.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated continuation of the prompt, stripped of surrounding
            whitespace (the prompt itself is not included).

        Raises:
            RuntimeError: If ``setup_model_and_tokenizer()`` has not been called.
        """
        model = self.peft_model if self.peft_model is not None else self.model
        if model is None or self.tokenizer is None:
            raise RuntimeError("Call setup_model_and_tokenizer() before inference()")

        prompt = self.format_prompt(text)

        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        )
        # The tokenizer returns CPU tensors; move them to wherever the model's
        # first parameters live so generate() does not hit a device mismatch.
        device = next(model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        # Generate in eval mode so dropout is off, then restore the previous mode.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )
        finally:
            if was_training:
                model.train()

        # Keep only the newly generated tokens. Slicing by token count is exact,
        # whereas slicing the decoded string by len(prompt) breaks whenever
        # decoding does not reproduce the prompt character for character.
        generated_ids = outputs[0][prompt_length:]
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

def main():
    """Run the demo end to end: load the model, fine-tune it on the built-in
    sample reviews, then print predictions for a few unseen sentences."""
    tuner = MistralFineTuner()
    tuner.setup_model_and_tokenizer()

    # Built-in demonstration data; swap in your own DataFrame here.
    samples = tuner.create_sample_dataset()
    print(f"Dataset size: {len(samples)}")
    print("\nSample data:")
    print(samples.head())

    # Hold out 20% of the rows for evaluation (fixed seed for repeatability).
    train_part, eval_part = train_test_split(samples, test_size=0.2, random_state=42)

    train_tokens = tuner.prepare_dataset(train_part)
    eval_tokens = tuner.prepare_dataset(eval_part)
    tuner.train(train_tokens, eval_tokens)

    # Smoke-test the tuned model.
    banner = "=" * 50
    print("\n" + banner)
    print("TESTING FINE-TUNED MODEL")
    print(banner)

    demo_inputs = (
        "This product is absolutely wonderful!",
        "I hate this thing, it's broken.",
        "It's an okay product, nothing special.",
    )
    for sentence in demo_inputs:
        prediction = tuner.inference(sentence)
        print(f"\nInput: {sentence}")
        print(f"Predicted sentiment: {prediction}")

# Additional utility functions for custom datasets

def load_custom_dataset(file_path, text_column, label_column):
    """Read a CSV and return only its text and label columns, renamed.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts.
        text_column: Name of the CSV column holding the input text.
        label_column: Name of the CSV column holding the target label.

    Returns:
        A DataFrame with the two selected columns renamed to ``text`` and
        ``label``.
    """
    new_names = {text_column: 'text', label_column: 'label'}
    frame = pd.read_csv(file_path)
    selected = frame[[text_column, label_column]]
    return selected.rename(columns=new_names)

def prepare_instruction_dataset(data, instruction_template=None):
    """Render instruction-style records into single training strings.

    Args:
        data: Iterable of dicts that may contain ``instruction``, ``input``
            and ``output`` keys; a missing key is rendered as an empty string.
        instruction_template: ``str.format`` template using the
            ``{instruction}``, ``{input}`` and ``{output}`` fields. A default
            template is used when this is ``None``.

    Returns:
        A list of ``{"text": rendered_string}`` dicts, one per input record.
    """
    template = instruction_template
    if template is None:
        template = "Complete the following task: {instruction}\n\nInput: {input}\n\nOutput: {output}"

    fields = ('instruction', 'input', 'output')
    return [
        {"text": template.format(**{field: record.get(field, '') for field in fields})}
        for record in data
    ]

if __name__ == "__main__":
    # Probe the hardware once, report what the run will use, then start.
    gpu_present = torch.cuda.is_available()

    if not gpu_present:
        print("No GPU available, using CPU")
        print("Warning: Training will be significantly slower on CPU")
        print("Consider using Google Colab or other GPU platforms for faster training")

        # CPU-only tuning.
        # NOTE(review): PYTORCH_CUDA_ALLOC_CONF is a CUDA allocator setting, so it
        # presumably has no effect on this branch - confirm whether it was meant
        # for the GPU path instead.
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
        thread_cap = min(4, torch.get_num_threads())
        torch.set_num_threads(thread_cap)  # use at most 4 CPU threads
    else:
        device_name = torch.cuda.get_device_name()
        print(f"GPU available: {device_name}")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU memory: {total_gb:.1f} GB")
        print("Using GPU acceleration with optimized settings")

    # Run the main fine-tuning process
    main()

GPU available: Tesla T4
GPU memory: 15.8 GB
Using GPU acceleration with optimized settings
Loading mistralai/Mistral-7B-v0.1 with memory optimizations...
Using 4-bit quantization for memory efficiency


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Trainable parameters:
trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940
Dataset size: 15

Sample data:
                                                text     label
0  I love this product! It's amazing and works pe...  positive
1  This is the worst purchase I've ever made. Com...  negative
2  The item is okay, nothing special but does the...   neutral
3   Fantastic quality! Highly recommend to everyone.  positive
4  Terrible customer service and poor product qua...  negative
