In [None]:
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW

import warnings
from sklearn.exceptions import UndefinedMetricWarning

In [None]:
# Load the dataset
data = pd.read_csv('txsUnique_1_2_balanced_dataset.csv')  # Replace with the path to your dataset file

# Extract the input texts (column 'X') and the class labels (column 'y2')
texts = data['X'].tolist()
labels = data['y2'].tolist()

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Derive the number of classes from the data instead of hard-coding it, so the
# value stays in sync with the classification head created below.
num_labels = len(set(labels))

# The classification loss indexes the logits by label id, so every label must be
# an integer in [0, num_labels). Fail early with a clear message rather than
# with an out-of-bounds error in the middle of training.
if min(labels) < 0 or max(labels) >= num_labels:
    raise ValueError(
        f"Labels must be contiguous integer ids in [0, {num_labels}); "
        f"found values in [{min(labels)}, {max(labels)}]. Re-encode the labels first."
    )

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the texts and convert them to input tensors.
# padding=True pads every sequence to the longest one in the call; sequences
# longer than max_length tokens are truncated.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Convert the labels to tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create the DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Create a DataLoader for the training data (batches are reshuffled every epoch)
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

# Number of passes over the training data. Adjust as needed; the scheduler and
# the training loop below both read this single value so they cannot drift apart.
num_epochs = 10

# Set up the optimizer and the learning rate scheduler.
# The scheduler is stepped once per batch, so its total step count must equal
# (batches per epoch) * (number of epochs) for the linear decay to end exactly
# when training does.
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels), in the order the
        # TensorDataset was built.
        input_ids, attention_mask, batch_labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch+1} - Average Loss: {total_loss / len(train_loader)}")

# Save the trained model weights
torch.save(model.state_dict(), 'model_weights.pth')

# Evaluation
model.eval()

# Run inference in mini-batches: pushing the whole test set through the model
# in a single forward pass can exhaust device memory on larger test sets.
eval_batch_size = 64
eval_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=eval_batch_size, shuffle=False)

logit_chunks = []
with torch.no_grad():
    for eval_input_ids, eval_attention_mask in eval_loader:
        outputs = model(
            input_ids=eval_input_ids.to(device),
            attention_mask=eval_attention_mask.to(device),
        )
        # Move each batch's logits to the CPU right away so only one batch at
        # a time lives on the device.
        logit_chunks.append(outputs.logits.cpu())

# shuffle=False above keeps the rows in the same order as test_labels.
logits = torch.cat(logit_chunks, dim=0)
predictions = torch.argmax(logits, dim=1)

# Convert the predicted labels and ground truth labels to NumPy arrays
predictions = predictions.numpy()
test_labels = test_labels.numpy()

# Ignore UndefinedMetricWarning when calling classification_report
# NOTE(review): zero_division=1 reports 1.0 for metrics that are undefined
# (e.g. precision of a class that was never predicted), which can flatter the
# averages — confirm this is intended.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    report = classification_report(test_labels, predictions, zero_division=1)

print(report)
