In [18]:
# i want to finetune a model to do credit card fraud detection. A sample input is given below.

# the huggingface dataset for legitimate transactions is called "LouisXO/fraud-detection-legitimate"

# the huggingface dataset for fraudulent transactions is called "LouisXO/fraud-detection-all-fraud"

# all datasets have columns "conversation" and "response" 
# the response is either "LEGITIMATE" or "FRAUD"

# here is a sample data: 

# conversation: Transaction Details: - Date/Time: 2019-05-26 05:20:36 - Merchant: fraud_Romaguera, Cruickshank and Greenholt - Amount: $104.9 - Category: shopping_net - Gender: M - State: OR

# response: LEGITIMATE



In [19]:
# Import libraries
import datasets
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [20]:
# Load legitimate transactions dataset
legitimate_dataset = load_dataset("LouisXO/fraud-detection-legitimate")

# Load fraudulent transactions dataset
fraudulent_dataset = load_dataset("LouisXO/fraud-detection-all-fraud")

In [21]:
# Assign label 0 to legitimate transactions
legitimate_dataset = legitimate_dataset.map(lambda x: {'label': 0})

# Assign label 1 to fraudulent transactions
fraudulent_dataset = fraudulent_dataset.map(lambda x: {'label': 1})

# Combine the datasets
full_dataset = concatenate_datasets([legitimate_dataset['train'], fraudulent_dataset['train']])

Map: 100%|██████████| 100000/100000 [00:02<00:00, 42319.23 examples/s]
Map: 100%|██████████| 7506/7506 [00:00<00:00, 41555.01 examples/s]


In [22]:
# Shuffle the combined dataset
full_dataset = full_dataset.shuffle(seed=42)

# Split into training and validation sets (e.g., 90% train, 10% validation)
split_dataset = full_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [23]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(example):
    return tokenizer(example['conversation'], padding='max_length', truncation=True)

# Apply the tokenizer to the datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 96755/96755 [00:16<00:00, 5707.94 examples/s]
Map: 100%|██████████| 10751/10751 [00:01<00:00, 6117.09 examples/s]


In [24]:
# Load a pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args = TrainingArguments(
    output_dir='./results',            # Output directory
    evaluation_strategy='steps',       # Evaluate every N steps
    save_strategy='steps',             # Save the model every N steps
    eval_steps=500,                    # Evaluation interval
    save_steps=500,                    # Save interval
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=16,    # Batch size for training
    per_device_eval_batch_size=64,     # Batch size for evaluation
    logging_dir='./logs',              # Directory for logs
    logging_steps=10,
    load_best_model_at_end=True,       # Load the best model when finished training
)

In [27]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

  0%|          | 0/20160 [17:50<?, ?it/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 10/18144 [00:03<1:42:25,  2.95it/s]

{'loss': 0.4914, 'grad_norm': 1.5420668125152588, 'learning_rate': 4.997244268077602e-05, 'epoch': 0.0}


  0%|          | 20/18144 [00:07<1:40:04,  3.02it/s]

{'loss': 0.1912, 'grad_norm': 0.4340767562389374, 'learning_rate': 4.994488536155203e-05, 'epoch': 0.0}


  0%|          | 30/18144 [00:10<1:40:13,  3.01it/s]

{'loss': 0.1793, 'grad_norm': 2.67506742477417, 'learning_rate': 4.9917328042328046e-05, 'epoch': 0.0}


  0%|          | 40/18144 [00:13<1:39:43,  3.03it/s]

{'loss': 0.188, 'grad_norm': 1.2345649003982544, 'learning_rate': 4.9889770723104054e-05, 'epoch': 0.01}


  0%|          | 50/18144 [00:17<1:39:43,  3.02it/s]

{'loss': 0.2346, 'grad_norm': 4.998570442199707, 'learning_rate': 4.986221340388007e-05, 'epoch': 0.01}


  0%|          | 60/18144 [00:20<1:40:25,  3.00it/s]

{'loss': 0.0766, 'grad_norm': 0.380318820476532, 'learning_rate': 4.983465608465609e-05, 'epoch': 0.01}


  0%|          | 70/18144 [00:23<1:39:55,  3.01it/s]

{'loss': 0.1738, 'grad_norm': 0.9798763394355774, 'learning_rate': 4.9807098765432105e-05, 'epoch': 0.01}


  0%|          | 80/18144 [00:27<1:40:14,  3.00it/s]

{'loss': 0.1949, 'grad_norm': 1.4682722091674805, 'learning_rate': 4.977954144620812e-05, 'epoch': 0.01}


  0%|          | 90/18144 [00:30<1:40:36,  2.99it/s]

{'loss': 0.1424, 'grad_norm': 1.1334693431854248, 'learning_rate': 4.975198412698413e-05, 'epoch': 0.01}


  1%|          | 100/18144 [00:33<1:40:05,  3.00it/s]

{'loss': 0.1652, 'grad_norm': 0.591254711151123, 'learning_rate': 4.972442680776014e-05, 'epoch': 0.02}


  1%|          | 110/18144 [00:37<1:39:30,  3.02it/s]

{'loss': 0.1594, 'grad_norm': 1.7899677753448486, 'learning_rate': 4.9696869488536156e-05, 'epoch': 0.02}


  1%|          | 120/18144 [00:40<1:39:42,  3.01it/s]

{'loss': 0.1595, 'grad_norm': 0.5316903591156006, 'learning_rate': 4.966931216931217e-05, 'epoch': 0.02}


  1%|          | 130/18144 [00:43<1:38:39,  3.04it/s]

{'loss': 0.1202, 'grad_norm': 0.4756911098957062, 'learning_rate': 4.9641754850088185e-05, 'epoch': 0.02}


  1%|          | 140/18144 [00:47<1:39:48,  3.01it/s]

{'loss': 0.189, 'grad_norm': 1.52963125705719, 'learning_rate': 4.96141975308642e-05, 'epoch': 0.02}


  1%|          | 150/18144 [00:50<1:39:33,  3.01it/s]

{'loss': 0.1142, 'grad_norm': 1.2816309928894043, 'learning_rate': 4.9586640211640215e-05, 'epoch': 0.02}


  1%|          | 160/18144 [00:53<1:39:23,  3.02it/s]

{'loss': 0.1017, 'grad_norm': 0.3544687032699585, 'learning_rate': 4.955908289241622e-05, 'epoch': 0.03}


  1%|          | 170/18144 [00:57<1:39:08,  3.02it/s]

{'loss': 0.0733, 'grad_norm': 1.8661670684814453, 'learning_rate': 4.9531525573192244e-05, 'epoch': 0.03}


  1%|          | 180/18144 [01:00<1:39:55,  3.00it/s]

{'loss': 0.1252, 'grad_norm': 3.476837158203125, 'learning_rate': 4.950396825396826e-05, 'epoch': 0.03}


  1%|          | 190/18144 [01:03<1:39:56,  2.99it/s]

{'loss': 0.1172, 'grad_norm': 0.9506072998046875, 'learning_rate': 4.947641093474427e-05, 'epoch': 0.03}


  1%|          | 200/18144 [01:07<1:39:35,  3.00it/s]

{'loss': 0.1226, 'grad_norm': 0.3627109229564667, 'learning_rate': 4.944885361552029e-05, 'epoch': 0.03}


  1%|          | 210/18144 [01:10<1:39:02,  3.02it/s]

{'loss': 0.1251, 'grad_norm': 0.6595692038536072, 'learning_rate': 4.94212962962963e-05, 'epoch': 0.03}


  1%|          | 220/18144 [01:13<1:39:11,  3.01it/s]

{'loss': 0.1673, 'grad_norm': 0.7772043943405151, 'learning_rate': 4.939373897707231e-05, 'epoch': 0.04}


  1%|▏         | 230/18144 [01:17<1:36:02,  3.11it/s]

{'loss': 0.1158, 'grad_norm': 1.5617953538894653, 'learning_rate': 4.9366181657848325e-05, 'epoch': 0.04}


  1%|▏         | 240/18144 [01:20<1:36:08,  3.10it/s]

{'loss': 0.166, 'grad_norm': 0.7459725141525269, 'learning_rate': 4.933862433862434e-05, 'epoch': 0.04}


  1%|▏         | 250/18144 [01:23<1:37:07,  3.07it/s]

{'loss': 0.0946, 'grad_norm': 1.7090694904327393, 'learning_rate': 4.9311067019400354e-05, 'epoch': 0.04}


  1%|▏         | 260/18144 [01:26<1:38:08,  3.04it/s]

{'loss': 0.1744, 'grad_norm': 3.598007917404175, 'learning_rate': 4.928350970017637e-05, 'epoch': 0.04}


  1%|▏         | 270/18144 [01:30<1:37:47,  3.05it/s]

{'loss': 0.165, 'grad_norm': 0.7249752879142761, 'learning_rate': 4.925595238095238e-05, 'epoch': 0.04}


  2%|▏         | 280/18144 [01:33<1:40:11,  2.97it/s]

{'loss': 0.1242, 'grad_norm': 1.4531028270721436, 'learning_rate': 4.92283950617284e-05, 'epoch': 0.05}


  2%|▏         | 290/18144 [01:37<1:42:23,  2.91it/s]

{'loss': 0.1545, 'grad_norm': 1.647534728050232, 'learning_rate': 4.920083774250441e-05, 'epoch': 0.05}


  2%|▏         | 300/18144 [01:40<1:38:43,  3.01it/s]

{'loss': 0.2106, 'grad_norm': 6.304158687591553, 'learning_rate': 4.917328042328043e-05, 'epoch': 0.05}


  2%|▏         | 310/18144 [01:43<1:41:06,  2.94it/s]

{'loss': 0.1205, 'grad_norm': 1.8017269372940063, 'learning_rate': 4.914572310405644e-05, 'epoch': 0.05}


  2%|▏         | 320/18144 [01:47<1:42:56,  2.89it/s]

{'loss': 0.154, 'grad_norm': 0.6446854472160339, 'learning_rate': 4.9118165784832456e-05, 'epoch': 0.05}


  2%|▏         | 330/18144 [01:50<1:37:52,  3.03it/s]

{'loss': 0.1655, 'grad_norm': 0.5440697073936462, 'learning_rate': 4.909060846560847e-05, 'epoch': 0.05}


  2%|▏         | 340/18144 [01:53<1:37:45,  3.04it/s]

{'loss': 0.1649, 'grad_norm': 0.47195595502853394, 'learning_rate': 4.906305114638448e-05, 'epoch': 0.06}


  2%|▏         | 350/18144 [01:57<1:39:37,  2.98it/s]

{'loss': 0.1739, 'grad_norm': 1.1471905708312988, 'learning_rate': 4.903549382716049e-05, 'epoch': 0.06}


  2%|▏         | 360/18144 [02:00<1:37:55,  3.03it/s]

{'loss': 0.1177, 'grad_norm': 0.5536867380142212, 'learning_rate': 4.900793650793651e-05, 'epoch': 0.06}


  2%|▏         | 370/18144 [02:03<1:40:46,  2.94it/s]

{'loss': 0.1406, 'grad_norm': 0.528218686580658, 'learning_rate': 4.898037918871252e-05, 'epoch': 0.06}


  2%|▏         | 380/18144 [02:07<1:38:32,  3.00it/s]

{'loss': 0.0868, 'grad_norm': 0.3698132336139679, 'learning_rate': 4.8952821869488544e-05, 'epoch': 0.06}


  2%|▏         | 390/18144 [02:10<1:37:41,  3.03it/s]

{'loss': 0.2198, 'grad_norm': 3.347745895385742, 'learning_rate': 4.892526455026455e-05, 'epoch': 0.06}


  2%|▏         | 400/18144 [02:13<1:41:09,  2.92it/s]

{'loss': 0.1971, 'grad_norm': 2.578880548477173, 'learning_rate': 4.8897707231040566e-05, 'epoch': 0.07}


  2%|▏         | 410/18144 [02:17<1:35:30,  3.09it/s]

{'loss': 0.1278, 'grad_norm': 1.1271575689315796, 'learning_rate': 4.887014991181658e-05, 'epoch': 0.07}


  2%|▏         | 420/18144 [02:20<1:36:55,  3.05it/s]

{'loss': 0.2289, 'grad_norm': 2.975691080093384, 'learning_rate': 4.8842592592592595e-05, 'epoch': 0.07}


  2%|▏         | 430/18144 [02:23<1:35:33,  3.09it/s]

{'loss': 0.1914, 'grad_norm': 2.6774182319641113, 'learning_rate': 4.881503527336861e-05, 'epoch': 0.07}


  2%|▏         | 440/18144 [02:26<1:36:05,  3.07it/s]

{'loss': 0.0669, 'grad_norm': 0.48994380235671997, 'learning_rate': 4.8787477954144624e-05, 'epoch': 0.07}


  2%|▏         | 450/18144 [02:30<1:36:17,  3.06it/s]

{'loss': 0.0986, 'grad_norm': 1.7143583297729492, 'learning_rate': 4.875992063492064e-05, 'epoch': 0.07}


  3%|▎         | 460/18144 [02:33<1:36:01,  3.07it/s]

{'loss': 0.1741, 'grad_norm': 1.5850952863693237, 'learning_rate': 4.873236331569665e-05, 'epoch': 0.08}


  3%|▎         | 470/18144 [02:36<1:34:54,  3.10it/s]

{'loss': 0.1449, 'grad_norm': 1.416663408279419, 'learning_rate': 4.870480599647266e-05, 'epoch': 0.08}


  3%|▎         | 480/18144 [02:40<1:36:06,  3.06it/s]

{'loss': 0.1405, 'grad_norm': 1.4757053852081299, 'learning_rate': 4.8677248677248676e-05, 'epoch': 0.08}


  3%|▎         | 490/18144 [02:43<1:35:12,  3.09it/s]

{'loss': 0.0882, 'grad_norm': 0.40558701753616333, 'learning_rate': 4.86496913580247e-05, 'epoch': 0.08}


  3%|▎         | 500/18144 [02:46<1:37:03,  3.03it/s]

{'loss': 0.0806, 'grad_norm': 0.472676157951355, 'learning_rate': 4.862213403880071e-05, 'epoch': 0.08}



  3%|▎         | 500/18144 [03:55<1:37:03,  3.03it/s]

{'eval_loss': 0.1468181610107422, 'eval_runtime': 68.9891, 'eval_samples_per_second': 155.836, 'eval_steps_per_second': 2.435, 'epoch': 0.08}


In [None]:
# Evaluate the model
evaluation_results = trainer.evaluate()

# Print evaluation results
print(evaluation_results)