In [1]:
import random
import wandb
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Authenticate with Weights & Biases and open the run for this experiment
wandb.login()
wandb.init(project="fraud_detection_llama")

# Legitimate and fraudulent transactions live in two separate Hub datasets
legit_datasets = load_dataset("yunfan-y/fraud-detection-legitimate")
fraud_datasets = load_dataset("yunfan-y/fraud-detection-fraud")

# For each split, stack the legitimate rows followed by the fraudulent rows
train_dataset, validation_dataset, test_dataset = (
    concatenate_datasets([legit_datasets[split], fraud_datasets[split]])
    for split in ("train", "validation", "test")
)

  from .autonotebook import tqdm as notebook_tqdm





[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maidenyang66[0m ([33myyfsss[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:

# Fixed seed so the same half of the training set is drawn on every run.
# A dedicated RNG is used so the module-level `random` state is left untouched.
SUBSAMPLE_SEED = 42

train_size = len(train_dataset)
half_train_size = train_size // 2

# Randomly select half of the indices (without replacement), reproducibly
random_indices = random.Random(SUBSAMPLE_SEED).sample(range(train_size), half_train_size)

# Select the subset of the training dataset.
# NOTE: this rebinds `train_dataset`, so re-running this cell halves the data again.
train_dataset = train_dataset.select(random_indices)

In [3]:

from transformers import AutoTokenizer, LlamaForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType

# Base checkpoint that is fine-tuned below
model_name = "meta-llama/Llama-3.2-1B"

# Load the tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Fall back to the EOS token for padding only when the tokenizer has no pad
# token of its own, so an existing pad token is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with a sequence classification head (two classes)
model = LlamaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification",
)

# Set the pad_token_id in the model configuration so it matches the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# Resize the token embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))

# Define the LoRA configuration: adapters on the attention projections only
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,  # Set the task type for sequence classification
)

# Apply LoRA to the model; `model` now refers to the PEFT-wrapped model
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 3,411,968 || all params: 1,239,230,464 || trainable%: 0.2753


In [4]:

def preprocess_function(examples):
    """Tokenize the 'conversation' field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows share the same length. Relies on the notebook-level
    `tokenizer`.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',  # pad every row up to max_length
        'max_length': 512,
    }
    conversations = examples['conversation']
    return tokenizer(conversations, **tokenizer_options)

# Run the batched tokenizer over the train, validation and test splits in turn
train_dataset, validation_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)

Map: 100%|██████████| 43002/43002 [00:05<00:00, 7671.50 examples/s]


In [5]:

# Text response -> integer class id used by the classifier
label_mapping = {
    "LEGITIMATE": 0,
    "FRAUD": 1
}

def encode_labels(example):
    """Add an integer 'label' field derived from the example's 'response'.

    The response is matched against `label_mapping` after stripping
    surrounding whitespace and upper-casing, so values such as " fraud\n"
    are accepted. The 'response' field itself is left unchanged.

    Args:
        example: mapping with a 'response' entry; it is updated in place.

    Returns:
        The same `example` object with 'label' set to 0 or 1.

    Raises:
        KeyError: if 'response' is missing, or its value is not one of the
            keys of `label_mapping`.
    """
    response = example['response']
    # Normalise only for the lookup; non-string values are looked up as-is
    key = response.strip().upper() if isinstance(response, str) else response
    if key not in label_mapping:
        raise KeyError(
            f"Unexpected response {response!r}; expected one of {sorted(label_mapping)}"
        )
    example['label'] = label_mapping[key]
    return example

def _with_torch_labels(dataset):
    """Encode labels, rename 'label' to 'labels', and expose torch tensors."""
    prepared = dataset.map(encode_labels).rename_column("label", "labels")
    # Only the model inputs and the target are exposed, as PyTorch tensors
    prepared.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return prepared

# Apply the same label encoding and formatting to every split
train_dataset = _with_torch_labels(train_dataset)
validation_dataset = _with_torch_labels(validation_dataset)
test_dataset = _with_torch_labels(test_dataset)

Map: 100%|██████████| 43002/43002 [00:01<00:00, 25366.80 examples/s]


In [6]:

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    logits = pred.predictions
    # Some models return extra outputs alongside the logits; keep the logits only
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)
    # zero_division=0 reports 0.0 (without a warning) when a metric is undefined,
    # e.g. when no example is predicted as the positive class
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [7]:

# Hyperparameters for the LoRA fine-tuning run
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy` argument
    eval_steps=5000,
    save_steps=5000,
    logging_steps=100,
    num_train_epochs=1,  # Adjust the number of epochs as needed
    per_device_train_batch_size=8,  # Increase batch size due to reduced memory usage with LoRA
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-4,  # Adjust learning rate for LoRA
    logging_dir='./logs',
    report_to="wandb",
)



In [8]:

# Gather everything the Trainer needs, then build it in one call
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,  # evaluated periodically during training
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [9]:

# Fine-tune the model
trainer.train()

# Evaluate on the held-out test split. The "test" prefix keeps these metrics
# separate from the validation metrics, which use the default "eval" prefix.
eval_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print(f"Test Evaluation results: {eval_results}")

# Upload the fine-tuned model to the Hugging Face Hub
model.push_to_hub("yunfan-y/fraud-detection-fine-tune-with-lora")

# Record the final test metrics on the W&B run, then close the run
wandb.log(eval_results)
wandb.finish()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 83/21501 [00:22<1:33:30,  3.82it/s]

KeyboardInterrupt: 