In [1]:
# i want to finetune a model to do credit card fraud detection. A sample input is given below.

# the huggingface dataset for legitimate transactions is called "yunfan-y/fraud-detection-legitimate"

# the huggingface dataset for fraudulent transactions is called "yunfan-y/fraud-detection-fraud"

# all datasets have columns "conversation" and "response" 
# the response is either "LEGITIMATE" or "FRAUD"

# here is a sample data: 

# conversation: Transaction Details: - Date/Time: 2019-05-26 05:20:36 - Merchant: fraud_Romaguera, Cruickshank and Greenholt - Amount: $104.9 - Category: shopping_net - Gender: M - State: OR

# response: LEGITIMATE

In [2]:
# Import libraries
import datasets
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import wandb

# Import PEFT libraries for LoRA
from peft import get_peft_model, LoraConfig, TaskType

wandb.init(project="fraud_detection")

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maidenyang66[0m ([33myyfsss[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Load legitimate transactions dataset
legitimate_dataset = load_dataset("yunfan-y/fraud-detection-legitimate")

# Load fraudulent transactions dataset
fraudulent_dataset = load_dataset("yunfan-y/fraud-detection-fraud")

In [4]:
# Assign label 0 to legitimate transactions
legitimate_dataset = legitimate_dataset.map(lambda x: {'label': 0})

# Assign label 1 to fraudulent transactions
fraudulent_dataset = fraudulent_dataset.map(lambda x: {'label': 1})


# Combine the datasets
train_dataset = concatenate_datasets([legitimate_dataset['train'], fraudulent_dataset['train']])
train_dataset = train_dataset.shuffle(seed=42)
eval_dataset = concatenate_datasets([legitimate_dataset['validation'], fraudulent_dataset['validation']])
eval_dataset = eval_dataset.shuffle(seed=42)
test_dataset = concatenate_datasets([legitimate_dataset['test'], fraudulent_dataset['test']])
test_dataset = test_dataset.shuffle(seed=42)


In [5]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(example):
    return tokenizer(example['conversation'], padding='max_length', truncation=True)

# Apply the tokenizer to the datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

In [6]:
# Load a pre-trained model for sequence classification
base_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # For sequence classification
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

# Wrap the base model with LoRA
model = get_peft_model(base_model, lora_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [8]:
training_args = TrainingArguments(
    output_dir='./results',            # Output directory
    evaluation_strategy='steps',       # Evaluate every N steps
    save_strategy='steps',             # Save the model every N steps
    eval_steps=2500,                    # Evaluation interval
    save_steps=5000,                    # Save interval
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=32,    # Batch size for training
    per_device_eval_batch_size=32,     # Batch size for evaluation
    logging_dir='./logs',              # Directory for logs
    logging_steps=10,
    load_best_model_at_end=False,       # Load the best model when finished training
    report_to="wandb",                 
    
)



In [9]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  
)

# Train the model
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
2500,0.0613,0.086611,0.973677,0.884868,0.716378,0.791759
5000,0.0559,0.061815,0.981583,0.930016,0.796272,0.857963
7500,0.023,0.052821,0.983071,0.902405,0.849534,0.875171


TrainOutput(global_step=8064, training_loss=0.09648835975565903, metrics={'train_runtime': 4477.4091, 'train_samples_per_second': 57.625, 'train_steps_per_second': 1.801, 'total_flos': 6.812077969907712e+16, 'train_loss': 0.09648835975565903, 'epoch': 3.0})

In [10]:
# Evaluate the model
evaluation_results = trainer.evaluate()

# Print evaluation results
print(evaluation_results)

{'eval_loss': 0.05265700817108154, 'eval_accuracy': 0.9834434006138963, 'eval_precision': 0.9075391180654339, 'eval_recall': 0.8495339547270306, 'eval_f1': 0.8775790921595599, 'eval_runtime': 73.1581, 'eval_samples_per_second': 146.956, 'eval_steps_per_second': 4.593, 'epoch': 3.0}


In [11]:


# upload model to huggingface
model.push_to_hub("yunfan-y/fraud-detection-model-lora-origin")


adapter_model.safetensors: 100%|██████████| 1.19M/1.19M [00:00<00:00, 2.60MB/s]


CommitInfo(commit_url='https://huggingface.co/yunfan-y/fraud-detection-model-lora-origin/commit/bc8db9c8cd041fe696301db077654fa30cd355b7', commit_message='Upload model', commit_description='', oid='bc8db9c8cd041fe696301db077654fa30cd355b7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yunfan-y/fraud-detection-model-lora-origin', endpoint='https://huggingface.co', repo_type='model', repo_id='yunfan-y/fraud-detection-model-lora-origin'), pr_revision=None, pr_num=None)