# In [None]:
# I want to fine-tune a meta-llama/Llama-3.2-1B model to do credit card fraud detection. Log everything to wandb.
# The Hugging Face dataset for legitimate transactions is called "yunfan-y/fraud-detection-legitimate"
# The Hugging Face dataset for fraudulent transactions is called "yunfan-y/fraud-detection-fraud"
# All datasets have been split into train, validation, and test sets.
# All datasets have columns "conversation" and "response"
# The response is either "LEGITIMATE" or "FRAUD"

# Here is a sample record:

# conversation: Transaction Details: - Date/Time: 2019-05-26 05:20:36 - Merchant: fraud_Romaguera, Cruickshank and Greenholt - Amount: $104.9 - Category: shopping_net - Gender: M - State: OR
# response: LEGITIMATE

# After the model is trained, I want to evaluate the model on the test set.

import wandb
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets

# Import PEFT and LoRA
from peft import get_peft_model, LoraConfig, TaskType

# Start the Weights & Biases run for this fine-tuning job
wandb.init(
    project='llama-fraud-detection',
    name='fine-tuning-lora',
)

# Fetch the two single-class datasets (one per label)
legitimate_dataset = load_dataset('yunfan-y/fraud-detection-legitimate')
fraudulent_dataset = load_dataset('yunfan-y/fraud-detection-fraud')

# Merge the legitimate and fraudulent datasets
def merge_datasets(legit, fraud):
    """Combine the two single-class splits into one shuffled dataset.

    Args:
        legit: Split containing the legitimate transactions.
        fraud: Split containing the fraudulent transactions.

    Returns:
        The concatenation of both splits, shuffled with a fixed seed so the
        result is reproducible.
    """
    # Shuffle AFTER concatenating. Shuffling each class on its own and then
    # concatenating still leaves every legitimate row ahead of every fraud
    # row, so any consumer that reads the data in order (or takes a prefix)
    # would see a single class.
    return concatenate_datasets([legit, fraud]).shuffle(seed=42)

# Build one merged dataset per split, in train / validation / test order
train_dataset, validation_dataset, test_dataset = (
    merge_datasets(legitimate_dataset[split], fraudulent_dataset[split])
    for split in ('train', 'validation', 'test')
)

# Checkpoint shared by the tokenizer and the base model
model_name = 'meta-llama/Llama-3.2-1B'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA settings: rank-16 update matrices on the q/v projection modules
# (adjust target_modules if the model architecture changes)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the base model with the LoRA configuration
model = get_peft_model(model, lora_config)

# Preprocessing function
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of examples for causal language-model fine-tuning.

    Each example becomes one prompt made of the conversation, a newline, and
    ``Response: <response>``. The prompt is tokenized, padded/truncated to
    ``max_length`` tokens, and given a ``labels`` sequence that copies the
    input ids but carries -100 at every padding position.

    Args:
        examples: Batch with parallel ``conversation`` and ``response`` lists.
        max_length: Fixed token length every sequence is padded/truncated to.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    # Combine conversation and response into a single training text
    prompts = [
        f"{conv}\nResponse: {resp}"
        for conv, resp in zip(examples['conversation'], examples['response'])
    ]

    tokenized = tokenizer(
        prompts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    # Mask padding via the attention mask rather than by token id: the pad
    # token is aliased to the EOS token, so comparing ids against
    # pad_token_id would also drop any genuine EOS token from the labels.
    tokenized["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

def _tokenize_split(split):
    """Run the batched preprocessing over one split, dropping its raw columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
        num_proc=4,  # Number of processes for multiprocessing
    )


# Tokenize every split with identical settings
tokenized_train = _tokenize_split(train_dataset)
tokenized_validation = _tokenize_split(validation_dataset)
tokenized_test = _tokenize_split(test_dataset)

# Switch all three splits to torch format
for tokenized_split in (tokenized_train, tokenized_validation, tokenized_test):
    tokenized_split.set_format("torch")

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword,
    # which newer transformers releases no longer accept. Any release able to
    # load Llama-3.2 already supports the new name.
    eval_strategy='epoch',
    save_strategy='epoch',  # Must match the eval strategy for load_best_model_at_end
    logging_strategy='steps',
    logging_steps=50,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to='wandb',  # Enable logging to wandb
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    fp16=True,  # Enable mixed precision training
)

# Initialize Trainer with the training and validation splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Evaluate the model on the test set. The explicit prefix reports the metrics
# as `test_*` instead of the default `eval_*`, so they are not mixed into the
# validation metrics logged during training.
eval_results = trainer.evaluate(tokenized_test, metric_key_prefix='test')
print(eval_results)

# Save the LoRA adapters
model.push_to_hub("yunfan-y/fraud-detection-fine-tune-lora")