In [1]:
!python main_mistral_injection.py > main_misral_injection.out 2>&1

In [1]:
print("Hello")

Hello


In [2]:
import os 
os.listdir()

['requirements.txt',
 'wandb',
 'eval_results_injection',
 'test.ipynb',
 'eval_results_injection_sft_not_sure',
 'dataset',
 'qwen_chartqa.py',
 'runner',
 '__init__.py',
 'main_mistral_sft_injection.py',
 'vlm_adapter_training.zip',
 'config',
 'adapter',
 'main_GQA.py',
 'eval_results_injection_1',
 'test_sc.py',
 'new.ipynb',
 'eval_results_injection_22_05_2025_b',
 '.git',
 'train_and_eval_mistral_dct.py',
 'main_mistral_injection.py',
 'eval_results_injection_22_05_2025',
 'questions',
 'models',
 'main.py',
 '.gitignore',
 'main_chartQA.py',
 'evaluator',
 'eval_results_injection_2_new']

In [3]:
import argparse
import json
import os
import yaml
import logging
import re
from typing import Dict, Any, List, Optional

import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Assuming evaluator scripts are in a subdirectory or accessible in PYTHONPATH
# If they are in 'evaluator' subdirectory:
from evaluator.generic_evaluator import GenericLLMEvaluator
# from evaluator.cascade_evaluator import CascadeEvaluator # If needed

# Import for Adapter
from adapter.mistral_adapter import DCTAdapter # Assuming DCTAdapter is in adapter/my_adapter.py
from runner.train import train_model, freeze_model_except_adapters, train_model_mistral

try:
    from mmengine.config import ConfigDict
except ImportError:
    logging.warning("mmengine.config.ConfigDict not found. Using a basic dict wrapper.")
    class ConfigDict(dict):
        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError:
                raise AttributeError(name)
        def __setattr__(self, name, value):
            self[name] = value

# Setup basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Adapter Helper Functions (from main_chartQA.py) ---
def get_parent_module(model: torch.nn.Module, name: str) -> torch.nn.Module:
    names = name.split('.')
    parent = model
    for n in names[:-1]:
        parent = getattr(parent, n)
    return parent

def inject_adapters(
    model: torch.nn.Module,
    adapter_cls: type,
    base_adapter_args: dict, # Renamed from adapter_args for clarity
    layers_config: List[Dict[str, str]] # Expected format: [{'name': 'layer_name_pattern_to_match'}]
) -> torch.nn.Module:
    logger.info(f"Starting adapter injection with {adapter_cls.__name__}...")
    for name, module in model.named_modules(): # Iterate over all module names in the model
        for layer_conf in layers_config:
            # Check if the current module's name matches the configuration name
            # The original code used 'in', which allows pattern matching.
            # If exact names are always provided in layer_conf, '==' could be used.
            # Sticking to 'in' to maintain original flexibility if patterns are used.
            if layer_conf['name'] in name:
                # Ensure we are matching the exact module intended, not a submodule containing the name
                # This check assumes layer_conf['name'] is the full name of the target module
                if name == layer_conf['name']:
                    logger.info(f"Matched target layer for injection: {name}")
                    try:
                        parent = get_parent_module(model, name)
                        original_module = getattr(parent, name.split('.')[-1])
                        
                        current_adapter_args = base_adapter_args.copy()

                        if hasattr(original_module, 'out_features') and isinstance(getattr(original_module, 'out_features'), int):
                            actual_in_features = original_module.out_features
                            logger.info(f"Dynamically setting adapter 'in_features' for {name} to {actual_in_features} (derived from original_module.out_features).")
                            current_adapter_args['in_features'] = actual_in_features
                        else:
                            logger.warning(f"Original module {name} (type: {type(original_module)}) does not have an integer 'out_features' attribute. "
                                           f"Using 'in_features' from base_adapter_args: {current_adapter_args.get('in_features')}. "
                                           f"This might lead to errors if incorrect for this layer.")

                        adapter_instance = adapter_cls(**current_adapter_args)
                        setattr(parent, name.split('.')[-1], torch.nn.Sequential(original_module, adapter_instance))
                        logger.info(f"Successfully injected adapter after {name} with args: {current_adapter_args}")
                    except Exception as e:
                        logger.error(f"Failed to inject adapter into {name}: {e}", exc_info=True)
    return model



# --- Helper Functions (adapted from evaluation.py) ---

def load_openai_config(config_path: str) -> dict:
    if os.path.exists(config_path):
        with open(config_path, 'r') as f:
            return yaml.safe_load(f)
    logger.warning(f"OpenAI config file not found at {config_path}")
    return {}

def simple_text_summarizer_postprocessor(judge_response_text: str) -> Dict[str, Any]:
    """Postprocessor to extract a score from judge response text using regex."""
    score = None
    lines = judge_response_text.strip().split('\n')
    score_keyword_pattern = r"(?:score|评[分价测]|得分)[:：]?\s*(\d+(?:\.\d+)?)(?:/\d+)?"
    standalone_score_pattern = r"(?<![a-zA-Z0-9\._-])(\b\d+(?:\.\d+)?\b)(?![a-zA-Z0-9\._-])"

    for line in reversed(lines):
        line_cleaned = line.strip()
        if not line_cleaned:
            continue
        match = re.search(score_keyword_pattern, line_cleaned, re.IGNORECASE)
        if match:
            score_str = match.group(1)
            if score_str:
                try:
                    score = float(score_str)
                    break
                except ValueError:
                    logger.warning(f"Found score-like text \"{score_str}\" with keyword but failed to parse as float.")
                    pass
        if score is not None:
            break
        if re.fullmatch(r"\d+(?:\.\d+)?", line_cleaned):
            try:
                potential_score = float(line_cleaned)
                if 0 <= potential_score <= 10:
                    score = potential_score
                    break
            except ValueError:
                pass
        if score is not None:
            break
        else:
            all_standalone_matches = list(re.finditer(standalone_score_pattern, line_cleaned))
            if all_standalone_matches:
                last_match_str = all_standalone_matches[-1].group(1)
                try:
                    potential_score = float(last_match_str)
                    if 0 <= potential_score <= 10:
                        score = potential_score
                        break
                except ValueError:
                    pass
        if score is not None:
            break
    return {"score": score, "raw_judge_response": judge_response_text}

def generate_predictions(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    dataset_split: Dataset,
    device: str,
    max_new_tokens: int = 512
) -> List[Dict[str, Any]]:
    logger.info(f"Generating predictions for {len(dataset_split)} samples...")
    predictions_data = []
    for example in tqdm(dataset_split, desc="Generating Predictions"):
        conv_id = example.get("id", "unknown_id")
        # Assuming dataset has 'query' and 'reference' (optional)
        # For mtbench, 'history' is used. We need the last user turn as query.
        history = example.get("history")
        if not history or not isinstance(history, list) or not history[-1].get("user"):
            current_prompt_text = example.get("query", "") # Fallback if history is not as expected
            if not current_prompt_text:
                 logger.warning(f"Skipping item {conv_id} due to missing user prompt in history or query field.")
                 predictions_data.append({
                    "id": conv_id, "task_category": example.get("task_category", "N/A"),
                    "model_input": "Error: Missing prompt", "prediction": "Error: Missing prompt",
                    "reference_answer": example.get("reference", "N/A"), "full_history": history
                 })
                 continue
        else:
            current_prompt_text = history[-1]["user"]

        reference_answer = history[-1].get("bot", example.get("reference", "N/A"))
        
        model_input_text = current_prompt_text

        try:
            inputs = tokenizer(model_input_text, return_tensors="pt", truncation=True, max_length=2048).to(device) # Added truncation
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True, # Consistent with evaluation.py
                    pad_token_id=tokenizer.eos_token_id # Add pad_token_id for open-ended generation
                )
            # Ensure decoding handles cases where input is part of the output
            # For instruct models, often the prompt is not repeated.
            # If prompt is repeated, use: result = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
            result = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            # A simple way to remove prompt if it's there, more robust methods might be needed
            if result.startswith(model_input_text):
                parsed_answer = result[len(model_input_text):].strip()
            else:
                parsed_answer = result.strip()

            predictions_data.append({
                "id": conv_id,
                "task_category": example.get("task_category", "N/A"),
                "model_input": model_input_text,
                "prediction": parsed_answer,
                "reference_answer": reference_answer,
                "full_history": history
            })
        except Exception as e:
            logger.warning(f"Error generating prediction for ID {conv_id}: {e}")
            predictions_data.append({
                "id": conv_id, "task_category": example.get("task_category", "N/A"),
                "model_input": model_input_text, "prediction": f"Error: {e}",
                "reference_answer": reference_answer, "full_history": history
            })
    return predictions_data

def run_evaluation_pipeline(
    model_name_or_path: str,
    model_to_evaluate: AutoModelForCausalLM, # Pass the loaded model
    tokenizer: AutoTokenizer, # Pass the loaded tokenizer
    eval_dataset: Dataset,
    args: argparse.Namespace,
    output_suffix: str = ""
) -> Optional[float]:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_to_evaluate.to(device)
    model_to_evaluate.eval()

    # 1. Generate Predictions
    predictions_output_filename = f"predictions_{output_suffix}.jsonl"
    predictions_output_filepath = os.path.join(args.eval_output_dir, predictions_output_filename)
    
    generated_preds_list = generate_predictions(model_to_evaluate, tokenizer, eval_dataset, device, args.max_new_tokens)
    
    with open(predictions_output_filepath, 'w') as f:
        for item in generated_preds_list:
            f.write(json.dumps(item) + '\n')
    logger.info(f"Predictions for {output_suffix} saved to {predictions_output_filepath}")

    # 2. Prepare dataset for evaluator
    dataset_for_eval_dict = {
        "query": [], "prediction": [], "reference": [], "id": [], "task_category": []
    }
    valid_predictions_count = 0
    for item in generated_preds_list:
        if not item["prediction"].startswith("Error:"):
            dataset_for_eval_dict["query"].append(item["model_input"])
            dataset_for_eval_dict["prediction"].append(item["prediction"])
            ref = item["reference_answer"]
            if isinstance(ref, list): # Ensure reference is a string
                ref = " ".join(r for r in ref if isinstance(r, str)) if all(isinstance(r, str) for r in ref) else str(ref)
            dataset_for_eval_dict["reference"].append(ref)
            dataset_for_eval_dict["id"].append(item["id"])
            dataset_for_eval_dict["task_category"].append(item.get("task_category", "N/A"))
            valid_predictions_count += 1
    
    if valid_predictions_count == 0:
        logger.warning(f"No successful predictions to evaluate for {output_suffix}. Skipping evaluation.")
        final_score_value = "N/A (No valid predictions)"
    else:
        eval_hf_dataset = Dataset.from_dict(dataset_for_eval_dict)
        logger.info(f"Prepared {len(eval_hf_dataset)} samples for the evaluator for {output_suffix}.")

        # 3. Configure and Run Evaluator
        openai_params = load_openai_config(args.openai_config_path)
        judge_cfg_dict = {
            "model": args.judge_model_name,
            "key": openai_params.get("api_key"),
            "openai_api_base": openai_params.get("base_url"),
            "temperature": 0.0, "max_out_len": 1024, "query_per_second": 1,
            "system_prompt_content": args.judge_system_prompt
        }
        prompt_template_dict = {
            "template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. "
                         "Your evaluation should consider factors such as helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. "
                         "Begin your evaluation by providing a short explanation. Be as objective as possible. "
                         "After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly outputting a single line with only the score. "
                         "Do not output any other text after the score. "
                         "\n\n[Question]\n{query}\n\n[The Start of Assistant's Answer]\n{prediction}\n[The End of Assistant's Answer]"
                         "\n\n[Reference Answer (if available)]\n{reference}\n[The End of Reference Answer]",
            "input_columns": ["query", "prediction", "reference"],
        }
        evaluator_results_filename = f"{args.evaluator_type}_results_{output_suffix}.json"
        evaluator_output_path = os.path.join(args.eval_output_dir, evaluator_results_filename)

        judge_config = ConfigDict(judge_cfg_dict)
        prompt_template_config = ConfigDict(prompt_template_dict)

        evaluator = GenericLLMEvaluator(
            judge_cfg=judge_config,
            prompt_template=prompt_template_config,
            dict_postprocessor=simple_text_summarizer_postprocessor,
            output_path=evaluator_output_path
        )

        logger.info(f"Running evaluation with {args.evaluator_type} for {output_suffix}...")
        evaluation_results = evaluator.score(
            predictions=list(eval_hf_dataset["prediction"]),
            test_set=eval_hf_dataset
        )
        logger.info(f"Raw Evaluation Results for {output_suffix}:")
        try:
            logger.info(json.dumps(evaluation_results, indent=4))
        except TypeError:
            logger.info(str(evaluation_results))

        final_score_value = "N/A"
        if isinstance(evaluation_results, dict) and "average_score" in evaluation_results:
            final_score_value = evaluation_results["average_score"]
            num_scored = evaluation_results.get('num_scored', 'N/A')
            logger.info(f"Average Judge Score for {output_suffix}: {final_score_value:.2f} (Scored items: {num_scored})")
        else:
            logger.warning(f"Could not determine average_score from evaluation results for {output_suffix}.")


    # 4. Save Score File
    score_file_name = f"{model_name_or_path.replace('/', '_')}_{output_suffix}_score.txt"
    score_file_path = os.path.join(args.eval_output_dir, score_file_name)
    try:
        with open(score_file_path, 'w') as f:
            f.write(f"Model: {model_name_or_path} ({output_suffix})\n")
            f.write(f"Training Dataset: {args.train_dataset_path}\n")
            f.write(f"Evaluation Dataset: {args.eval_dataset_path}\n")
            f.write(f"Evaluator: {args.evaluator_type}\n")
            f.write(f"Judge Model: {args.judge_model_name}\n")
            f.write(f"Evaluation Split Size: {len(eval_dataset) if eval_dataset is not None else 'N/A'}\n")
            f.write(f"Final Score: {final_score_value}\n")
        logger.info(f"Evaluation score for {output_suffix} saved to {score_file_path}")
    except Exception as e:
        logger.error(f"Failed to write score to file {score_file_path}: {e}", exc_info=True)
    
    if isinstance(final_score_value, (float, int)):
        return float(final_score_value)
    return None

# --- Main Script Logic ---


In [4]:
args = ConfigDict()
args.model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.2"
args.train_dataset_path = "evaluator/benchmark_datasets/new_datasamples.jsonl"
args.eval_dataset_path = "evaluator/benchmark_datasets/mtbench101_original.jsonl"
args.num_train_samples = -1 # -1 for all the samples
args.num_eval_samples = -1
args.max_new_tokens = 512
args.eval_output_dir = "./eval_results_injection"
args.openai_config_path = "evaluator/openai_config.yaml"
args.evaluator_type = "GenericLLMEvaluator"
args.judge_model_name = "gpt-4"
args.judge_system_prompt = None
args.batch_size = 5
args.num_epochs = 10 # TODO: Change this later

# Arguments for Adapter Injection
args.do_adapter_injection = True
args.adapter_layers_json = """[{"name": "model.layers.15.mlp.gate_proj"}, {"name": "model.layers.15.mlp.up_proj"}]"""
args.adapter_params_json = '{"in_features": 4096, "reduction_factor": 16}'
args.perform_adapter_training = True
args.adapter_lr = 1e-4
args.adapter_epochs = 1

# New argument for full fine-tuning
args.perform_full_finetune = True


In [5]:
args.train_dataset_path

'evaluator/benchmark_datasets/new_datasamples.jsonl'

In [6]:


os.makedirs(args.eval_output_dir, exist_ok=True)

with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Load and prepare train_dataset
logger.info(f"Loading training dataset from {args.train_dataset_path}...")
try:
    train_dataset_list = []
    with open(args.train_dataset_path, 'r') as f:
        for line in f:
            train_dataset_list.append(json.loads(line))
    
    if args.num_train_samples != -1 and args.num_train_samples < len(train_dataset_list):
        train_dataset_list = train_dataset_list[:args.num_train_samples]
    
    train_dataset = Dataset.from_list(train_dataset_list)
    logger.info(f"Training dataset loaded. Samples: {len(train_dataset)}")

except Exception as e:
    logger.error(f"Error loading training dataset: {e}", exc_info=True)

# Load and prepare eval_dataset
logger.info(f"Loading evaluation dataset from {args.eval_dataset_path}...")

INFO:__main__:Loading training dataset from evaluator/benchmark_datasets/new_datasamples.jsonl...
INFO:__main__:Training dataset loaded. Samples: 50
INFO:__main__:Loading evaluation dataset from evaluator/benchmark_datasets/mtbench101_original.jsonl...


In [7]:
try:
    eval_dataset_list = []
    with open(args.eval_dataset_path, 'r') as f:
        for line in f:
            eval_dataset_list.append(json.loads(line))

    if args.num_eval_samples != -1 and args.num_eval_samples < len(eval_dataset_list):
        eval_dataset_list = eval_dataset_list[:args.num_eval_samples]

    eval_dataset = Dataset.from_list(eval_dataset_list)
    logger.info(f"Evaluation dataset loaded. Samples: {len(eval_dataset)}")

except Exception as e:
    logger.error(f"Error loading evaluation dataset: {e}", exc_info=True)
    

# 2. Load Tokenizer (shared for both models)
try:
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token # Set pad token if not present
        logger.info(f"Tokenizer pad_token was None, set to eos_token: {tokenizer.eos_token}")
except Exception as e:
    logger.error(f"Failed to load tokenizer for {args.model_name_or_path}: {e}", exc_info=True)

INFO:__main__:Evaluation dataset loaded. Samples: 1388
INFO:__main__:Tokenizer pad_token was None, set to eos_token: </s>


In [8]:
eval_dataset[0]

{'task': 'GR',
 'id': 1,
 'history': [{'bot': 'Based on the given information, A is the tallest among the three people.',
   'user': 'Now there are three people A, B and C. I currently know that A is taller than B and B is taller than C. Who is the tallest currently?'},
  {'bot': 'Based on the additional information, it is not possible to determine who the tallest person is at the moment. The given information only states the relative heights between the individuals but does not provide any specific comparison between A, C, D, and E.',
   'user': 'Now there are two more people, D and E. D is higher than B and E is higher than D. Who is the tallest at the moment?'},
  {'bot': 'Based on the updated information, if D is higher than A and A is already known to be taller than B and B is taller than C, E is higher than D. then E would be the tallest person at the moment.',
   'user': 'Now, I know that D is higher than A. Who is the highest now?'}]}

In [9]:
args.model_name_or_path

'mistralai/Mistral-7B-Instruct-v0.2'

In [10]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def format_and_tokenize_dataset(dataset_hf, tokenizer, max_seq_length):
    """
    Formats the conversational history and tokenizes it for Mistral instruction fine-tuning.
    Masks prompt tokens in the labels.

    Args:
        dataset_hf (datasets.Dataset): The Hugging Face dataset.
                                       Expected to have a 'history' column.
        tokenizer: The tokenizer.
        max_seq_length (int): Maximum sequence length for truncation.

    Returns:
        dict: A dictionary containing 'input_ids', 'attention_mask', and 'labels'.
    """
    processed_examples = {'input_ids': [], 'attention_mask': [], 'labels': []}

    # Determine BOS and EOS tokens from the tokenizer
    bos = tokenizer.bos_token if tokenizer.bos_token else "<s>"
    eos = tokenizer.eos_token if tokenizer.eos_token else "</s>"
    inst_open = "[INST]"
    inst_close = "[/INST]"

    logger.info(f"Using BOS: '{bos}', EOS: '{eos}' for formatting.")

    for item_idx, item_history in enumerate(dataset_hf['history']):
        full_concatenated_input_ids = []
        full_concatenated_labels = []

        if not isinstance(item_history, list):
            logger.warning(f"Item at index {item_idx} has history of type {type(item_history)}, expected list. Skipping.")
            continue

        for turn_idx, turn in enumerate(item_history):
            if not isinstance(turn, dict) or 'user' not in turn or 'bot' not in turn:
                logger.warning(f"Turn {turn_idx} in item {item_idx} is malformed: {turn}. Skipping turn.")
                continue
            
            user_query = str(turn['user'])
            bot_response = str(turn['bot'])

            # Format for Mistral: <s>[INST] User Query [/INST] Bot Response</s>
            # Note: A space is often added before the bot_response if not handled by tokenizer.
            prompt_str = f"{bos}{inst_open} {user_query} {inst_close}"
            answer_str = f" {bot_response}{eos}" # Leading space for the answer part

            # Tokenize prompt and answer parts separately to correctly create labels
            # add_special_tokens=False because we are manually adding BOS/EOS per turn segment
            prompt_tokens = tokenizer.encode(prompt_str, add_special_tokens=False)
            answer_tokens = tokenizer.encode(answer_str, add_special_tokens=False)
            
            current_turn_input_ids = prompt_tokens + answer_tokens
            # For labels, mask the prompt part by setting tokens to -100
            current_turn_labels = [-100] * len(prompt_tokens) + answer_tokens
            
            full_concatenated_input_ids.extend(current_turn_input_ids)
            full_concatenated_labels.extend(current_turn_labels)

        # Truncate if the full concatenated history exceeds max_seq_length
        if len(full_concatenated_input_ids) > max_seq_length:
            full_concatenated_input_ids = full_concatenated_input_ids[:max_seq_length]
            full_concatenated_labels = full_concatenated_labels[:max_seq_length]
        elif len(full_concatenated_input_ids) == 0: # Handle empty history cases
            logger.warning(f"Item at index {item_idx} resulted in empty tokenized output. Skipping.")
            continue
            
        # Create attention mask (1 for real tokens, 0 for padding - padding handled by collator)
        attention_mask = [1] * len(full_concatenated_input_ids)

        processed_examples['input_ids'].append(full_concatenated_input_ids)
        processed_examples['attention_mask'].append(attention_mask)
        processed_examples['labels'].append(full_concatenated_labels)
        
    return processed_examples


class ConversationDataset(Dataset):
    """PyTorch Dataset for conversational data."""
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = tokenized_data['labels']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

def collate_fn_conversations(batch, tokenizer):
    """Collate function to pad batch elements to the same length."""
    input_ids_list = [item['input_ids'] for item in batch]
    attention_mask_list = [item['attention_mask'] for item in batch]
    labels_list = [item['labels'] for item in batch]

    # Determine max length in this batch for padding
    max_len = max(len(ids) for ids in input_ids_list)
    if max_len == 0: # Should not happen if empty examples are filtered
        return None 

    padded_input_ids = []
    padded_attention_mask = []
    padded_labels = []

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Fallback if pad_token_id is not set, common to use eos_token_id
        pad_token_id = tokenizer.eos_token_id 
        logger.warning(f"tokenizer.pad_token_id is None. Using eos_token_id ({pad_token_id}) for padding.")
        if pad_token_id is None: # Critical error if no pad token can be determined
             raise ValueError("Tokenizer has no pad_token_id and no eos_token_id to use as fallback for padding.")


    for i in range(len(batch)):
        input_ids = input_ids_list[i]
        attention_mask = attention_mask_list[i]
        labels = labels_list[i]
        
        padding_length = max_len - len(input_ids)
        
        # Pad right
        padded_input_ids.append(torch.cat([input_ids, torch.full((padding_length,), pad_token_id, dtype=torch.long)]))
        padded_attention_mask.append(torch.cat([attention_mask, torch.full((padding_length,), 0, dtype=torch.long)])) # Pad attention mask with 0
        padded_labels.append(torch.cat([labels, torch.full((padding_length,), -100, dtype=torch.long)])) # Pad labels with -100 (ignore index)

    return {
        "input_ids": torch.stack(padded_input_ids),
        "attention_mask": torch.stack(padded_attention_mask),
        "labels": torch.stack(padded_labels)
    }




## Function train mistral model 

In [11]:
original_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

original_model.to(device)
original_model.train() # Set model to training mode

# Ensure tokenizer has a pad token. This is crucial for batching.
if tokenizer.pad_token is None:
    logger.warning("Tokenizer does not have a pad_token. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token
    # Important: If you add a new token or change pad_token such that vocab size changes,
    # you might need to resize model token embeddings:
    # original_model.resize_token_embeddings(len(tokenizer))
    # However, just setting pad_token = eos_token usually means using an existing token.
if tokenizer.pad_token_id is None: # Ensure pad_token_id is also set
    tokenizer.pad_token_id = tokenizer.eos_token_id

INFO:__main__:Using device: cuda


In [13]:
logger.info(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")

train_dataset_hf = train_dataset




INFO:__main__:Tokenizer pad token ID: 2


In [34]:
type(train_dataset)

datasets.arrow_dataset.Dataset

In [14]:
args.max_seq_length = 1024

In [15]:
format_and_tokenize_dataset(train_dataset_hf, tokenizer, args.max_seq_length)

INFO:__main__:Using BOS: '<s>', EOS: '</s>' for formatting.


{'input_ids': [[1,
   28792,
   16289,
   28793,
   315,
   506,
   1712,
   18683,
   28747,
   264,
   5014,
   16544,
   28725,
   264,
   9746,
   16544,
   28725,
   304,
   264,
   20067,
   16544,
   28723,
   315,
   3395,
   272,
   5014,
   16544,
   298,
   586,
   5510,
   28723,
   9595,
   18683,
   511,
   315,
   506,
   1749,
   28804,
   733,
   28748,
   16289,
   28793,
   2530,
   5239,
   272,
   5014,
   16544,
   298,
   574,
   5510,
   28725,
   368,
   506,
   272,
   9746,
   16544,
   304,
   272,
   20067,
   16544,
   1749,
   28723,
   2,
   1,
   28792,
   16289,
   28793,
   1047,
   315,
   868,
   1300,
   1698,
   20067,
   16544,
   304,
   1840,
   378,
   28725,
   910,
   1287,
   20067,
   18683,
   511,
   315,
   506,
   1055,
   28804,
   733,
   28748,
   16289,
   28793,
   4577,
   368,
   4558,
   272,
   3493,
   20067,
   16544,
   304,
   1419,
   1698,
   624,
   28725,
   368,
   1055,
   506,
   989,
   20067,
   18683,
   28723,
 

In [16]:
# 1. Preprocess and tokenize the dataset
logger.info("Preprocessing and tokenizing dataset...")
tokenized_data_dict = format_and_tokenize_dataset(train_dataset_hf, tokenizer, args.max_seq_length)

if not tokenized_data_dict['input_ids']:
    logger.error("Tokenization resulted in an empty dataset. Please check your data and formatting.")
    

# Create a PyTorch Dataset
pytorch_train_dataset = ConversationDataset(tokenized_data_dict)
logger.info(f"Created PyTorch Dataset with {len(pytorch_train_dataset)} examples.")

INFO:__main__:Preprocessing and tokenizing dataset...
INFO:__main__:Using BOS: '<s>', EOS: '</s>' for formatting.
INFO:__main__:Created PyTorch Dataset with 50 examples.


In [17]:
len(pytorch_train_dataset)

50

In [18]:
# 2. Create DataLoader
logger.info(f"Creating DataLoader with batch size: {args.batch_size}...")
train_dataloader = DataLoader(
    pytorch_train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    collate_fn=lambda batch: collate_fn_conversations(batch, tokenizer)
)


INFO:__main__:Creating DataLoader with batch size: 5...


In [19]:
next(iter(train_dataloader))['input_ids'].shape

torch.Size([5, 304])

In [20]:
args.learning_rate = 0.0001

In [21]:
# 3. Set up Optimizer and Scheduler
logger.info(f"Setting up optimizer with learning rate: {args.learning_rate}...")
optimizer = AdamW(original_model.parameters(), lr=args.learning_rate, eps=1e-8) # Added eps for stability

gradient_accumulation_steps = getattr(args, 'gradient_accumulation_steps', 1)
num_training_steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
if len(train_dataloader) % gradient_accumulation_steps != 0:
    num_training_steps_per_epoch +=1 # account for the last partial step

total_training_steps = num_training_steps_per_epoch * args.num_epochs

num_warmup_steps = getattr(args, 'warmup_steps', 0)
if isinstance(num_warmup_steps, float): # if warmup_steps is a ratio
    num_warmup_steps = int(total_training_steps * num_warmup_steps)

logger.info(f"Total training steps: {total_training_steps}, Warmup steps: {num_warmup_steps}")

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_training_steps
)

logging_steps = getattr(args, 'logging_steps', 10)

INFO:__main__:Setting up optimizer with learning rate: 0.0001...
INFO:__main__:Total training steps: 100, Warmup steps: 0


In [22]:
def train_model_mistral(original_model, tokenizer, train_dataset_hf, args):
    """
    Trains a Mistral model using the provided dataset and arguments.

    Args:
        original_model: The pre-trained Mistral model (e.g., from AutoModelForCausalLM.from_pretrained).
        tokenizer: The tokenizer for the model (e.g., from AutoTokenizer.from_pretrained).
        train_dataset_hf (datasets.Dataset): The Hugging Face training dataset.
                                            Must contain a 'history' column, where each item is a list of turns,
                                            and each turn is a dict {'user': str, 'bot': str}.
        args: An object or Namespace containing training arguments:
              - num_epochs (int): Number of training epochs.
              - model_save_path (str): Path to save the fine-tuned model and tokenizer.
              - learning_rate (float): Optimizer learning rate (e.g., 2e-5, 5e-5).
              - batch_size (int): Training batch size (e.g., 1, 2, 4, adjust based on GPU memory).
              - max_seq_length (int): Maximum sequence length for tokenization and padding (e.g., 512, 1024, 2048).
              - gradient_accumulation_steps (int, optional): Number of steps to accumulate gradients before an optimizer update. Defaults to 1.
              - warmup_steps (int, optional): Number of warmup steps for the learning rate scheduler. Defaults to 0.
              - logging_steps (int, optional): Log training loss every X steps. Defaults to 10.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    
    original_model.to(device)
    original_model.train() # Set model to training mode

    # Ensure tokenizer has a pad token. This is crucial for batching.
    if tokenizer.pad_token is None:
        logger.warning("Tokenizer does not have a pad_token. Setting pad_token to eos_token.")
        tokenizer.pad_token = tokenizer.eos_token
        # Important: If you add a new token or change pad_token such that vocab size changes,
        # you might need to resize model token embeddings:
        # original_model.resize_token_embeddings(len(tokenizer))
        # However, just setting pad_token = eos_token usually means using an existing token.
    if tokenizer.pad_token_id is None: # Ensure pad_token_id is also set
        tokenizer.pad_token_id = tokenizer.eos_token_id

    logger.info(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")


    # 1. Preprocess and tokenize the dataset
    logger.info("Preprocessing and tokenizing dataset...")
    tokenized_data_dict = format_and_tokenize_dataset(train_dataset_hf, tokenizer, args.max_seq_length)
    
    if not tokenized_data_dict['input_ids']:
        logger.error("Tokenization resulted in an empty dataset. Please check your data and formatting.")
        return None

    # Create a PyTorch Dataset
    pytorch_train_dataset = ConversationDataset(tokenized_data_dict)
    logger.info(f"Created PyTorch Dataset with {len(pytorch_train_dataset)} examples.")


    # 2. Create DataLoader
    logger.info(f"Creating DataLoader with batch size: {args.batch_size}...")
    train_dataloader = DataLoader(
        pytorch_train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        collate_fn=lambda batch: collate_fn_conversations(batch, tokenizer)
    )

    # 3. Set up Optimizer and Scheduler
    logger.info(f"Setting up optimizer with learning rate: {args.learning_rate}...")

    # Freeze all parameters
    for name, param in original_model.named_parameters():
        param.requires_grad = False

    # Unfreeze only adapter layers based on name match
    adapter_layer_prefixes = [layer['name'] for layer in args.adapter_layers_json]

    for name, param in original_model.named_parameters():
        for prefix in adapter_layer_prefixes:
            if name.startswith(prefix):
                param.requires_grad = True
                print(name)
                break

    # Only pass trainable parameters to the optimizer
    optimizer = AdamW(filter(lambda p: p.requires_grad, original_model.parameters()), lr=args.learning_rate, eps=1e-8) # Added eps for stability

    gradient_accumulation_steps = getattr(args, 'gradient_accumulation_steps', 1)
    num_training_steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
    if len(train_dataloader) % gradient_accumulation_steps != 0:
        num_training_steps_per_epoch +=1 # account for the last partial step

    total_training_steps = num_training_steps_per_epoch * args.num_epochs
    
    num_warmup_steps = getattr(args, 'warmup_steps', 0)
    if isinstance(num_warmup_steps, float): # if warmup_steps is a ratio
        num_warmup_steps = int(total_training_steps * num_warmup_steps)

    logger.info(f"Total training steps: {total_training_steps}, Warmup steps: {num_warmup_steps}")
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_training_steps
    )
    
    logging_steps = getattr(args, 'logging_steps', 10)

    # 4. Training Loop
    logger.info(f"Starting training for {args.num_epochs} epochs...")
    original_model.zero_grad() # Clear gradients before starting

    for epoch in range(args.num_epochs):
        logger.info(f"--- Epoch {epoch+1}/{args.num_epochs} ---")
        epoch_total_loss = 0.0
        
        for step, batch in enumerate(train_dataloader):
            if batch is None: # Skip if collate_fn returned None (e.g. empty batch after filtering)
                continue

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = original_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps # Normalize loss
            
            # Backward pass
            loss.backward()
            
            # Optimizer step (with gradient accumulation)
            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                torch.nn.utils.clip_grad_norm_(original_model.parameters(), 1.0) # Gradient clipping
                optimizer.step()
                scheduler.step() # Update learning rate
                optimizer.zero_grad() # Clear gradients for the next accumulation

            epoch_total_loss += loss.item() * gradient_accumulation_steps # De-normalize for logging

            

            start_time = time.time()

            if (step + 1) % (logging_steps * gradient_accumulation_steps) == 0:
                current_lr = scheduler.get_last_lr()[0]
                elapsed = time.time() - start_time
                step_time = elapsed * 1000 / logging_steps  # in ms
                tokens_per_second = int(input_ids.numel() * logging_steps / elapsed)

                grad_norm = 0.0
                for p in original_model.parameters():
                    if p.requires_grad and p.grad is not None:
                        grad_norm += p.grad.data.norm(2).item() ** 2
                grad_norm = grad_norm ** 0.5

                print(
                    f"step {step+1}/{len(train_dataloader)} | "
                    f"loss {loss.item() * gradient_accumulation_steps:.6f} (+nanz)| "
                    f"norm {grad_norm:.4f} (+nanz)| "
                    f"lr {current_lr:.2e} | "
                    f"{step_time:.2f} ms | "
                    f"{tokens_per_second} tok/s",
                    flush=True
                )

                start_time = time.time()


        avg_epoch_loss = epoch_total_loss / len(train_dataloader)
        logger.info(f"--- End of Epoch {epoch+1}, Average Loss: {avg_epoch_loss:.4f} ---")

    # 5. Save Model
    logger.info(f"Training finished. Saving model to {args.model_save_path}...")
    if not os.path.exists(args.model_save_path):
        os.makedirs(args.model_save_path)
        
    original_model.save_pretrained(args.model_save_path)
    tokenizer.save_pretrained(args.model_save_path)
    logger.info(f"Model and tokenizer saved successfully to {args.model_save_path}")

    return original_model


In [23]:
import yaml

config_file_path = "config/config.yaml"

with open(config_file_path, 'r') as f:
    config_data = yaml.safe_load(f)

args.adapter_layers_json = config_data.get('adapter', {}).get('layers')

print(f"Loaded adapter layers from {config_file_path}:")
print(args.adapter_layers_json)

Loaded adapter layers from config/config.yaml:
[{'name': 'model.layers.31.self_attn.q_proj'}, {'name': 'model.layers.31.self_attn.k_proj'}, {'name': 'model.layers.31.self_attn.v_proj'}, {'name': 'model.layers.31.self_attn.o_proj'}]


In [24]:
adapter_layer_prefixes = [layer['name'] for layer in args.adapter_layers_json]

for name, param in original_model.named_parameters():
    for prefix in adapter_layer_prefixes:
        if name.startswith(prefix):
            param.requires_grad = True
            print(name)
            break


model.layers.31.self_attn.q_proj.weight
model.layers.31.self_attn.k_proj.weight
model.layers.31.self_attn.v_proj.weight
model.layers.31.self_attn.o_proj.weight


In [26]:
args.adapter_layers_json

[{'name': 'model.layers.31.self_attn.q_proj'},
 {'name': 'model.layers.31.self_attn.k_proj'},
 {'name': 'model.layers.31.self_attn.v_proj'},
 {'name': 'model.layers.31.self_attn.o_proj'}]

In [28]:
import time

In [31]:
args.model_save_path = "/common/home/users/s/sasikayw/scratchDirectory/VLM_DCT_Adapter/output/"

In [32]:
train_model_mistral(original_model, tokenizer, train_dataset_hf, args)

INFO:__main__:Using device: cuda
INFO:__main__:Tokenizer pad token ID: 2
INFO:__main__:Preprocessing and tokenizing dataset...
INFO:__main__:Using BOS: '<s>', EOS: '</s>' for formatting.
INFO:__main__:Created PyTorch Dataset with 50 examples.
INFO:__main__:Creating DataLoader with batch size: 5...
INFO:__main__:Setting up optimizer with learning rate: 0.0001...
INFO:__main__:Total training steps: 100, Warmup steps: 0
INFO:__main__:Starting training for 10 epochs...
INFO:__main__:--- Epoch 1/10 ---


model.layers.31.self_attn.q_proj.weight
model.layers.31.self_attn.k_proj.weight
model.layers.31.self_attn.v_proj.weight
model.layers.31.self_attn.o_proj.weight
step 10/10 | loss 0.110779 (+nanz)| norm 0.0000 (+nanz)| lr 9.00e-05 | 0.00 ms | 3407872000 tok/s


INFO:__main__:--- End of Epoch 1, Average Loss: 0.0616 ---
INFO:__main__:--- Epoch 2/10 ---


step 10/10 | loss 0.090800 (+nanz)| norm 0.0000 (+nanz)| lr 8.00e-05 | 0.00 ms | 3924670171 tok/s


INFO:__main__:--- End of Epoch 2, Average Loss: 0.1032 ---
INFO:__main__:--- Epoch 3/10 ---


step 10/10 | loss 0.031128 (+nanz)| norm 0.0000 (+nanz)| lr 7.00e-05 | 0.00 ms | 3836554541 tok/s


INFO:__main__:--- End of Epoch 3, Average Loss: 0.0579 ---
INFO:__main__:--- Epoch 4/10 ---


step 10/10 | loss 0.059750 (+nanz)| norm 0.0000 (+nanz)| lr 6.00e-05 | 0.00 ms | 1468006400 tok/s


INFO:__main__:--- End of Epoch 4, Average Loss: 0.0331 ---
INFO:__main__:--- Epoch 5/10 ---


step 10/10 | loss 0.023130 (+nanz)| norm 0.0000 (+nanz)| lr 5.00e-05 | 0.00 ms | 3207408941 tok/s


INFO:__main__:--- End of Epoch 5, Average Loss: 0.0191 ---
INFO:__main__:--- Epoch 6/10 ---


step 10/10 | loss 0.006734 (+nanz)| norm 0.0000 (+nanz)| lr 4.00e-05 | 0.00 ms | 2115656282 tok/s


INFO:__main__:--- End of Epoch 6, Average Loss: 0.0064 ---
INFO:__main__:--- Epoch 7/10 ---


step 10/10 | loss 0.001225 (+nanz)| norm 0.0000 (+nanz)| lr 3.00e-05 | 0.00 ms | 1408855958 tok/s


INFO:__main__:--- End of Epoch 7, Average Loss: 0.0039 ---
INFO:__main__:--- Epoch 8/10 ---


step 10/10 | loss 0.003810 (+nanz)| norm 0.0000 (+nanz)| lr 2.00e-05 | 0.00 ms | 940441600 tok/s


INFO:__main__:--- End of Epoch 8, Average Loss: 0.0059 ---
INFO:__main__:--- Epoch 9/10 ---


step 10/10 | loss 0.000429 (+nanz)| norm 0.0000 (+nanz)| lr 1.00e-05 | 0.00 ms | 1628376847 tok/s


INFO:__main__:--- End of Epoch 9, Average Loss: 0.0006 ---
INFO:__main__:--- Epoch 10/10 ---


step 10/10 | loss 0.000583 (+nanz)| norm 0.0000 (+nanz)| lr 0.00e+00 | 0.00 ms | 1717043200 tok/s


INFO:__main__:--- End of Epoch 10, Average Loss: 0.0006 ---
INFO:__main__:Training finished. Saving model to /common/home/users/s/sasikayw/scratchDirectory/VLM_DCT_Adapter/output/...


KeyboardInterrupt: 

---

In [None]:
# --- Full Fine-tuning of Original Model ---
if args.perform_full_finetune:
    logger.info("Performing full fine-tuning of the original model...")
    original_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
    # Use your existing train_model function for full fine-tuning
    original_model = train_model_mistral(original_model, tokenizer, train_dataset, args)
    # Save the fine-tuned original model
    finetuned_original_dir = os.path.join(args.eval_output_dir, "finetuned_original_model")
    os.makedirs(finetuned_original_dir, exist_ok=True)
    original_model.save_pretrained(finetuned_original_dir)
    tokenizer.save_pretrained(finetuned_original_dir)
    logger.info(f"Full fine-tuning finished. Model saved to {finetuned_original_dir}")
else:
    original_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)

In [None]:
train_dataset

In [None]:
train_dataset[0]

In [18]:
tokenizer

In [19]:
args.perform_full_finetune

In [20]:
isinstance(train_dataset, dict)

In [21]:
import torch.optim as optim

In [22]:
lambda p: p.requires_grad, original_model.parameters()

In [23]:
eval_dataset[0]

In [27]:
args.max_length = 512

In [29]:
eval_dataset[-1]['history']

In [24]:
from transformers import AutoTokenizer

def preprocess(sample):
    history = sample['history']
    
    # Build the conversation prompt by combining previous turns
    conversation = ""
    for turn in history:
        conversation += f"User: {turn['user']}\nBot: {turn['bot']}\n"

    # Optionally truncate to the last N turns
    input_text = conversation.strip()
    
    # Tokenize
    tokenized = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=args.max_length,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()  # if you're training in autoregressive setup

    return {k: v.squeeze(0) for k, v in tokenized.items()}


In [25]:
original_model.parameters

In [26]:
optim.Adam()

In [43]:
print("hello")

In [None]:
args.per_device_train_batch_size = 5

In [None]:
args.learning_rate = 0.0001

In [None]:
args.num_train_epochs = 5

## train_model_mistral function

In [17]:

device = "cuda" if torch.cuda.is_available() else "cpu"
original_model.to(device)
original_model.train()

# Optimizer
optimizer = optim.Adam(filter(lambda p: p.requires_grad, original_model.parameters()), lr=args.learning_rate)

batch_size = args.per_device_train_batch_size
epochs = int(args.num_train_epochs)

In [None]:
epoch = 0
pbar = tqdm(range(0, len(train_dataset), batch_size), desc=f"Mistral Training Epoch {epoch+1}/{epochs}")

In [56]:
device = "cuda" if torch.cuda.is_available() else "cpu"
original_model.to(device)
original_model.train()

# Optimizer
optimizer = optim.Adam(filter(lambda p: p.requires_grad, original_model.parameters()), lr=args.learning_rate)


In [57]:
batch_size = args.per_device_train_batch_size
epochs = int(args.num_train_epochs)

In [None]:
eval_dataset = 

In [55]:
train_dataset[0]

In [62]:
for i in pbar:
    batch_samples = train_dataset[i:i+batch_size]
    print(batch_samples)
    break

In [64]:
train_dataset[:2]

In [None]:
import torch

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def format_and_tokenize_dataset(dataset_hf, tokenizer, max_seq_length):
    """
    Formats the conversational history and tokenizes it for Mistral instruction fine-tuning.
    Masks prompt tokens in the labels.

    Args:
        dataset_hf (datasets.Dataset): The Hugging Face dataset.
                                       Expected to have a 'history' column.
        tokenizer: The tokenizer.
        max_seq_length (int): Maximum sequence length for truncation.

    Returns:
        dict: A dictionary containing 'input_ids', 'attention_mask', and 'labels'.
    """
    processed_examples = {'input_ids': [], 'attention_mask': [], 'labels': []}

    # Determine BOS and EOS tokens from the tokenizer
    bos = tokenizer.bos_token if tokenizer.bos_token else "<s>"
    eos = tokenizer.eos_token if tokenizer.eos_token else "</s>"
    inst_open = "[INST]"
    inst_close = "[/INST]"

    logger.info(f"Using BOS: '{bos}', EOS: '{eos}' for formatting.")

    for item_idx, item_history in enumerate(dataset_hf['history']):
        full_concatenated_input_ids = []
        full_concatenated_labels = []

        if not isinstance(item_history, list):
            logger.warning(f"Item at index {item_idx} has history of type {type(item_history)}, expected list. Skipping.")
            continue

        for turn_idx, turn in enumerate(item_history):
            if not isinstance(turn, dict) or 'user' not in turn or 'bot' not in turn:
                logger.warning(f"Turn {turn_idx} in item {item_idx} is malformed: {turn}. Skipping turn.")
                continue
            
            user_query = str(turn['user'])
            bot_response = str(turn['bot'])

            # Format for Mistral: <s>[INST] User Query [/INST] Bot Response</s>
            # Note: A space is often added before the bot_response if not handled by tokenizer.
            prompt_str = f"{bos}{inst_open} {user_query} {inst_close}"
            answer_str = f" {bot_response}{eos}" # Leading space for the answer part

            # Tokenize prompt and answer parts separately to correctly create labels
            # add_special_tokens=False because we are manually adding BOS/EOS per turn segment
            prompt_tokens = tokenizer.encode(prompt_str, add_special_tokens=False)
            answer_tokens = tokenizer.encode(answer_str, add_special_tokens=False)
            
            current_turn_input_ids = prompt_tokens + answer_tokens
            # For labels, mask the prompt part by setting tokens to -100
            current_turn_labels = [-100] * len(prompt_tokens) + answer_tokens
            
            full_concatenated_input_ids.extend(current_turn_input_ids)
            full_concatenated_labels.extend(current_turn_labels)

        # Truncate if the full concatenated history exceeds max_seq_length
        if len(full_concatenated_input_ids) > max_seq_length:
            full_concatenated_input_ids = full_concatenated_input_ids[:max_seq_length]
            full_concatenated_labels = full_concatenated_labels[:max_seq_length]
        elif len(full_concatenated_input_ids) == 0: # Handle empty history cases
            logger.warning(f"Item at index {item_idx} resulted in empty tokenized output. Skipping.")
            continue
            
        # Create attention mask (1 for real tokens, 0 for padding - padding handled by collator)
        attention_mask = [1] * len(full_concatenated_input_ids)

        processed_examples['input_ids'].append(full_concatenated_input_ids)
        processed_examples['attention_mask'].append(attention_mask)
        processed_examples['labels'].append(full_concatenated_labels)
        
    return processed_examples


class ConversationDataset(Dataset):
    """PyTorch Dataset for conversational data."""
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = tokenized_data['labels']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

def collate_fn_conversations(batch, tokenizer):
    """Collate function to pad batch elements to the same length."""
    input_ids_list = [item['input_ids'] for item in batch]
    attention_mask_list = [item['attention_mask'] for item in batch]
    labels_list = [item['labels'] for item in batch]

    # Determine max length in this batch for padding
    max_len = max(len(ids) for ids in input_ids_list)
    if max_len == 0: # Should not happen if empty examples are filtered
        return None 

    padded_input_ids = []
    padded_attention_mask = []
    padded_labels = []

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Fallback if pad_token_id is not set, common to use eos_token_id
        pad_token_id = tokenizer.eos_token_id 
        logger.warning(f"tokenizer.pad_token_id is None. Using eos_token_id ({pad_token_id}) for padding.")
        if pad_token_id is None: # Critical error if no pad token can be determined
             raise ValueError("Tokenizer has no pad_token_id and no eos_token_id to use as fallback for padding.")


    for i in range(len(batch)):
        input_ids = input_ids_list[i]
        attention_mask = attention_mask_list[i]
        labels = labels_list[i]
        
        padding_length = max_len - len(input_ids)
        
        # Pad right
        padded_input_ids.append(torch.cat([input_ids, torch.full((padding_length,), pad_token_id, dtype=torch.long)]))
        padded_attention_mask.append(torch.cat([attention_mask, torch.full((padding_length,), 0, dtype=torch.long)])) # Pad attention mask with 0
        padded_labels.append(torch.cat([labels, torch.full((padding_length,), -100, dtype=torch.long)])) # Pad labels with -100 (ignore index)

    return {
        "input_ids": torch.stack(padded_input_ids),
        "attention_mask": torch.stack(padded_attention_mask),
        "labels": torch.stack(padded_labels)
    }


def train_model_mistral(original_model, tokenizer, train_dataset_hf, args):
    """
    Trains a Mistral model using the provided dataset and arguments.

    Args:
        original_model: The pre-trained Mistral model (e.g., from AutoModelForCausalLM.from_pretrained).
        tokenizer: The tokenizer for the model (e.g., from AutoTokenizer.from_pretrained).
        train_dataset_hf (datasets.Dataset): The Hugging Face training dataset.
                                            Must contain a 'history' column, where each item is a list of turns,
                                            and each turn is a dict {'user': str, 'bot': str}.
        args: An object or Namespace containing training arguments:
              - num_epochs (int): Number of training epochs.
              - model_save_path (str): Path to save the fine-tuned model and tokenizer.
              - learning_rate (float): Optimizer learning rate (e.g., 2e-5, 5e-5).
              - batch_size (int): Training batch size (e.g., 1, 2, 4, adjust based on GPU memory).
              - max_seq_length (int): Maximum sequence length for tokenization and padding (e.g., 512, 1024, 2048).
              - gradient_accumulation_steps (int, optional): Number of steps to accumulate gradients before an optimizer update. Defaults to 1.
              - warmup_steps (int, optional): Number of warmup steps for the learning rate scheduler. Defaults to 0.
              - logging_steps (int, optional): Log training loss every X steps. Defaults to 10.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    
    original_model.to(device)
    original_model.train() # Set model to training mode

    # Ensure tokenizer has a pad token. This is crucial for batching.
    if tokenizer.pad_token is None:
        logger.warning("Tokenizer does not have a pad_token. Setting pad_token to eos_token.")
        tokenizer.pad_token = tokenizer.eos_token
        # Important: If you add a new token or change pad_token such that vocab size changes,
        # you might need to resize model token embeddings:
        # original_model.resize_token_embeddings(len(tokenizer))
        # However, just setting pad_token = eos_token usually means using an existing token.
    if tokenizer.pad_token_id is None: # Ensure pad_token_id is also set
        tokenizer.pad_token_id = tokenizer.eos_token_id

    logger.info(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")


    # 1. Preprocess and tokenize the dataset
    logger.info("Preprocessing and tokenizing dataset...")
    tokenized_data_dict = format_and_tokenize_dataset(train_dataset_hf, tokenizer, args.max_seq_length)
    
    if not tokenized_data_dict['input_ids']:
        logger.error("Tokenization resulted in an empty dataset. Please check your data and formatting.")
        return None

    # Create a PyTorch Dataset
    pytorch_train_dataset = ConversationDataset(tokenized_data_dict)
    logger.info(f"Created PyTorch Dataset with {len(pytorch_train_dataset)} examples.")


    # 2. Create DataLoader
    logger.info(f"Creating DataLoader with batch size: {args.batch_size}...")
    train_dataloader = DataLoader(
        pytorch_train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        collate_fn=lambda batch: collate_fn_conversations(batch, tokenizer)
    )

    # 3. Set up Optimizer and Scheduler
    logger.info(f"Setting up optimizer with learning rate: {args.learning_rate}...")
    optimizer = AdamW(original_model.parameters(), lr=args.learning_rate, eps=1e-8) # Added eps for stability
    
    gradient_accumulation_steps = getattr(args, 'gradient_accumulation_steps', 1)
    num_training_steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
    if len(train_dataloader) % gradient_accumulation_steps != 0:
        num_training_steps_per_epoch +=1 # account for the last partial step

    total_training_steps = num_training_steps_per_epoch * args.num_epochs
    
    num_warmup_steps = getattr(args, 'warmup_steps', 0)
    if isinstance(num_warmup_steps, float): # if warmup_steps is a ratio
        num_warmup_steps = int(total_training_steps * num_warmup_steps)

    logger.info(f"Total training steps: {total_training_steps}, Warmup steps: {num_warmup_steps}")
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_training_steps
    )
    
    logging_steps = getattr(args, 'logging_steps', 10)

    # 4. Training Loop
    logger.info(f"Starting training for {args.num_epochs} epochs...")
    original_model.zero_grad() # Clear gradients before starting

    for epoch in range(args.num_epochs):
        logger.info(f"--- Epoch {epoch+1}/{args.num_epochs} ---")
        epoch_total_loss = 0.0
        
        for step, batch in enumerate(train_dataloader):
            if batch is None: # Skip if collate_fn returned None (e.g. empty batch after filtering)
                continue

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = original_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps # Normalize loss
            
            # Backward pass
            loss.backward()
            
            # Optimizer step (with gradient accumulation)
            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                torch.nn.utils.clip_grad_norm_(original_model.parameters(), 1.0) # Gradient clipping
                optimizer.step()
                scheduler.step() # Update learning rate
                optimizer.zero_grad() # Clear gradients for the next accumulation

            epoch_total_loss += loss.item() * gradient_accumulation_steps # De-normalize for logging

            if (step + 1) % (logging_steps * gradient_accumulation_steps) == 0:
                current_lr = scheduler.get_last_lr()[0]
                logger.info(f"  Epoch {epoch+1}, Step {step+1}/{len(train_dataloader)}, Loss: {loss.item() * gradient_accumulation_steps:.4f}, LR: {current_lr:.2e}")

        avg_epoch_loss = epoch_total_loss / len(train_dataloader)
        logger.info(f"--- End of Epoch {epoch+1}, Average Loss: {avg_epoch_loss:.4f} ---")

    # 5. Save Model
    logger.info(f"Training finished. Saving model to {args.model_save_path}...")
    if not os.path.exists(args.model_save_path):
        os.makedirs(args.model_save_path)
        
    original_model.save_pretrained(args.model_save_path)
    tokenizer.save_pretrained(args.model_save_path)
    logger.info(f"Model and tokenizer saved successfully to {args.model_save_path}")

    return original_model


In [30]:
import datasets
def train_model_mistral(model, tokenizer, train_dataset:datasets.arrow_dataset.Dataset, args):
    """
    model : PyTorch model 
    tokenizer : Mistral Model tokenizer 
    train_dataset : (HuggingFace Dataset Class)
    args : 
    """ 

    pass

In [None]:
args.system_message = "You are a model that corresponds to training a model " # TODO: Change this later

In [25]:
train_dataset

In [27]:
type(train_dataset)

In [32]:
args.lr = 0.0001

In [33]:
args

In [26]:
train_dataset[0]

In [19]:
# --- Evaluate Fine-tuned Original Model ---
run_evaluation_pipeline(
    model_name_or_path=args.model_name_or_path,
    model_to_evaluate=original_model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    args=args,
    output_suffix="original"
)
del original_model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# --- Adapter Injection and Evaluation for Adapted Model ---
logger.info(f"--- Starting Evaluation for Adapted Model: {args.model_name_or_path} + Adapter ---")
adapted_model = None 
model_for_adapted_eval_name = args.model_name_or_path 
args.do_adapter_injection = True
try:
    if args.do_adapter_injection:
        logger.info(f"Loading base model ({args.model_name_or_path}) for adapter injection...")
        adapted_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
        model_for_adapted_eval_name = f"{args.model_name_or_path}_adapted"
        model_arch_path = os.path.join(args.eval_output_dir, "model_archi.txt")
        try:
            with open(model_arch_path, 'w') as f:
                f.write(str(adapted_model))
            logger.info(f"Model architecture written to {model_arch_path}")
        except Exception as e:
            logger.error(f"Failed to write model architecture: {e}")
        print("Hello")
        try: 
            adapted_model = inject_adapters(adapted_model, DCTAdapter, config['adapter']['params'], config['adapter']['layers'])
            logger.info("Adapter injection process finished.")
            freeze_model_except_adapters(adapted_model)
            print("ADAPTER MODEL ARCHITECTURE")
            print(adapted_model)
        except Exception as e:
            print(e)
        if args.perform_adapter_training:
            if train_dataset is None or len(train_dataset) == 0:
                logger.warning("Adapter training requested, but train_dataset is empty or None. Skipping training.")
            else:
                logger.info(f"Performing adapter training using {len(train_dataset)} samples...")
                device = "cuda" if torch.cuda.is_available() else "cpu"
                adapted_model.to(device)
                adapted_model = train_model_with_adapter(adapted_model, tokenizer, train_dataset, args)
                logger.info("Adapter training finished.")
                # Save the adapter-injected model (adapter weights)
                finetuned_adapter_dir = os.path.join(args.eval_output_dir, "finetuned_adapter_model")
                os.makedirs(finetuned_adapter_dir, exist_ok=True)
                # Save the full model (including adapters)
                adapted_model.save_pretrained(finetuned_adapter_dir)
                tokenizer.save_pretrained(finetuned_adapter_dir)
                logger.info(f"Adapter-injected model saved to {finetuned_adapter_dir}")
        else:
            logger.info("Adapter training not requested (perform_adapter_training=False).")
    else:
        logger.info("Adapter injection not requested (do_adapter_injection=False). Evaluating base model as 'adapted' model.")
    if adapted_model: 
        print("Evaluation of adapted model--------------------")
        run_evaluation_pipeline(
            model_name_or_path=model_for_adapted_eval_name, 
            model_to_evaluate=adapted_model,
            tokenizer=tokenizer,
            eval_dataset=eval_dataset,
            args=args,
            output_suffix="adapted" 
        )
    else:
        logger.error("Adapted model was not loaded or created. Skipping evaluation for adapted model.")
    del adapted_model 
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
except Exception as e:
    logger.error(f"Error during adapted model setup or evaluation: {e}", exc_info=True)

logger.info("--- Main Mistral Injection Script Finished ---")
