<a href="https://colab.research.google.com/github/vishnuy/AI-ML/blob/main/TrainMistralSentiAnalysis2Large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers datasets
%pip install peft transformers trl accelerate

Collecting trl
  Downloading trl-0.22.1-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.22.1-py3-none-any.whl (544 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.22.1


In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import os

class MistralFineTuner:
    def __init__(self, model_name="mistralai/Mistral-7B-v0.1", max_length=512):
        """
        Initialize the fine-tuner with Mistral model

        Args:
            model_name: Hugging Face model identifier
            max_length: Maximum sequence length for tokenization
        """
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = None
        self.model = None
        self.peft_model = None

    def setup_model_and_tokenizer(self):
        """Load and configure the model and tokenizer"""
        print(f"Loading {self.model_name}...")

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Configure model loading based on available hardware
        model_kwargs = {
            "trust_remote_code": True
        }

        # Check device capabilities
        device_has_gpu = torch.cuda.is_available()

        if device_has_gpu:
            print(f"GPU available: {torch.cuda.get_device_name()}")
            print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
            print("Using GPU acceleration with optimized settings")
            model_kwargs.update({
                "torch_dtype": torch.float16,
                "device_map": "auto"
            })
        else:
            print("No GPU available, using CPU")
            print("Warning: Training will be significantly slower on CPU")
            print("Consider using Google Colab or other GPU platforms for faster training")
            model_kwargs.update({
                "torch_dtype": torch.float32,  # Use float32 for CPU
                "device_map": None # Explicitly set device_map to None for CPU
            })


        # Load model with appropriate settings for fine-tuning
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            **model_kwargs
        )

        # Setup LoRA configuration for efficient fine-tuning
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=16,  # Rank
            lora_alpha=32,  # LoRA scaling parameter
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
        )

        # Apply LoRA to model
        self.peft_model = get_peft_model(self.model, lora_config)
        print("\nTrainable parameters:")
        self.peft_model.print_trainable_parameters()

    def create_sample_dataset(self):
        """Create a sample dataset for demonstration (sentiment analysis)"""
        data = [
            {"text": "I love this product! It's amazing and works perfectly.", "label": "positive"},
            {"text": "This is the worst purchase I've ever made. Completely disappointed.", "label": "negative"},
            {"text": "The item is okay, nothing special but does the job.", "label": "neutral"},
            {"text": "Fantastic quality! Highly recommend to everyone.", "label": "positive"},
            {"text": "Terrible customer service and poor product quality.", "label": "negative"},
            {"text": "It's an average product, meets basic expectations.", "label": "neutral"},
            {"text": "Outstanding! Exceeded all my expectations.", "label": "positive"},
            {"text": "Waste of money. Doesn't work as advertised.", "label": "negative"},
            {"text": "Decent product for the price point.", "label": "neutral"},
            {"text": "Absolutely perfect! Will buy again.", "label": "positive"},
            {"text": "Poor build quality and arrived damaged.", "label": "negative"},
            {"text": "It's fine, does what it's supposed to do.", "label": "neutral"},
            {"text": "Incredible value for money! Love it!", "label": "positive"},
            {"text": "Horrible experience, would not recommend.", "label": "negative"},
            {"text": "Standard quality, no complaints but nothing exciting.", "label": "neutral"},
        ]

        return pd.DataFrame(data)

    def format_prompt(self, text, label=None):
        """Format input text into a prompt for the model"""
        if label:
            # Training format
            return f"Analyze the sentiment of this text: {text}\nSentiment: {label}"
        else:
            # Inference format
            return f"Analyze the sentiment of this text: {text}\nSentiment:"

    def prepare_dataset(self, df):
        """Prepare dataset for training"""
        # Format prompts
        formatted_data = []
        for _, row in df.iterrows():
            prompt = self.format_prompt(row['text'], row['label'])
            formatted_data.append({"text": prompt})

        # Create HuggingFace dataset
        dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

        # Tokenize dataset
        def tokenize_function(examples):
            tokenized = self.tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt"
            )
            tokenized["labels"] = tokenized["input_ids"].clone()
            return tokenized

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )

        return tokenized_dataset

    def train(self, train_dataset, eval_dataset=None, output_dir="./mistral-finetuned"):
        """Fine-tune the model"""
        # Check device capabilities
        device_has_gpu = torch.cuda.is_available()

        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=1,  # Small batch size for limited resources
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_steps=100,
            learning_rate=5e-5,
            fp16=device_has_gpu,  # Only use fp16 if GPU is available
            logging_steps=10,
            save_strategy="epoch",
            evaluation_strategy="epoch" if eval_dataset else "no",
            load_best_model_at_end=True if eval_dataset else False,
            metric_for_best_model="eval_loss" if eval_dataset else None,
            report_to=None,  # Disable wandb/tensorboard logging
            dataloader_pin_memory=device_has_gpu,  # Only pin memory if GPU available
            remove_unused_columns=False,  # Prevent column removal issues
        )

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,  # Causal LM, not masked LM
        )

        # Create trainer
        trainer = Trainer(
            model=self.peft_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        # Start training
        print("Starting training...")
        trainer.train()

        # Save the fine-tuned model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        print(f"Model saved to {output_dir}")

    def inference(self, text, max_new_tokens=50):
        """Run inference on the fine-tuned model"""
        prompt = self.format_prompt(text)

        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        )

        # Ensure model is on the correct device for inference
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract just the generated part
        generated_text = response[len(prompt):].strip()
        return generated_text

def main():
    # Initialize fine-tuner
    fine_tuner = MistralFineTuner()

    # Setup model and tokenizer
    fine_tuner.setup_model_and_tokenizer()

    # Create or load your dataset
    df = fine_tuner.create_sample_dataset()
    print(f"Dataset size: {len(df)}")
    print("\nSample data:")
    print(df.head())

    # Split data for training and validation
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

    # Prepare datasets
    train_dataset = fine_tuner.prepare_dataset(train_df)
    eval_dataset = fine_tuner.prepare_dataset(eval_df)

    # Fine-tune the model
    fine_tuner.train(train_dataset, eval_dataset)

    # Test inference
    test_texts = [
        "This product is absolutely wonderful!",
        "I hate this thing, it's broken.",
        "It's an okay product, nothing special."
    ]

    print("\n" + "="*50)
    print("TESTING FINE-TUNED MODEL")
    print("="*50)

    for text in test_texts:
        result = fine_tuner.inference(text)
        print(f"\nInput: {text}")
        print(f"Predicted sentiment: {result}")

# Additional utility functions for custom datasets

def load_custom_dataset(file_path, text_column, label_column):
    """Load custom dataset from CSV file"""
    df = pd.read_csv(file_path)
    return df[[text_column, label_column]].rename(
        columns={text_column: 'text', label_column: 'label'}
    )

def prepare_instruction_dataset(data, instruction_template=None):
    """Prepare dataset in instruction-following format"""
    if instruction_template is None:
        instruction_template = "Complete the following task: {instruction}\n\nInput: {input}\n\nOutput: {output}"

    formatted_data = []
    for item in data:
        formatted_text = instruction_template.format(
            instruction=item.get('instruction', ''),
            input=item.get('input', ''),
            output=item.get('output', '')
        )
        formatted_data.append({"text": formatted_text})

    return formatted_data

if __name__ == "__main__":
    # Check if GPU is available and configure accordingly
    if torch.cuda.is_available():
        print(f"GPU available: {torch.cuda.get_device_name()}")
        print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        print("Using GPU acceleration with optimized settings")
    else:
        print("No GPU available, using CPU")
        print("Warning: Training will be significantly slower on CPU")
        print("Consider using Google Colab or other GPU platforms for faster training")

    # Additional memory optimization for CPU-only setups
    if not torch.cuda.is_available():
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
        torch.set_num_threads(min(4, torch.get_num_threads()))  # Limit CPU threads

    # Run the main fine-tuning process
    main()

GPU available: Tesla T4
GPU memory: 15.8 GB
Using GPU acceleration with optimized settings
Loading mistralai/Mistral-7B-v0.1...


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

GPU available: Tesla T4
GPU memory: 15.8 GB
Using GPU acceleration with optimized settings


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]




Trainable parameters:
trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879
Dataset size: 15

Sample data:
                                                text     label
0  I love this product! It's amazing and works pe...  positive
1  This is the worst purchase I've ever made. Com...  negative
2  The item is okay, nothing special but does the...   neutral
3   Fantastic quality! Highly recommend to everyone.  positive
4  Terrible customer service and poor product qua...  negative




Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'