<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/SentenceLabelClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install tqdm
!pip install pytorchtools



In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import json
from sklearn.model_selection import train_test_split

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained BERT model and tokenizer and move them to GPU
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=9).to(device)

# Load your JSON data
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/COBS_ParagraphsPhrasesTags.json', 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

sentences = [item["text"] for item in data]
phrases = [item["phrase"] for item in data]
labels = [item["t.ttype"] for item in data]

# Initialize a dictionary to store unique values and their corresponding numbers
unique_values_dict = {}
current_number = 0

# Iterate through the list to assign numbers to unique values
unique_values = []

for value in labels:
    if value not in unique_values_dict:
        unique_values_dict[value] = current_number
        current_number += 1
    unique_values.append(unique_values_dict[value])

# Print the unique values and their corresponding numbers
for value, number in unique_values_dict.items():
    print(f"Value: {value}, Number: {number}")

numerical_labels = [unique_values_dict[label] for label in labels]

# Tokenize and convert data to model input format
input_ids = []
attention_masks = []

for sentence, phrase in zip(sentences, phrases):
    encoded_dict = tokenizer.encode_plus(
        phrase,
        sentence,
        add_special_tokens=True,
        max_length=256,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
        truncation=True,
    )
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])

# Convert the lists to PyTorch tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(numerical_labels)

# Split the data into training, validation, and test sets
train_inputs, temp_inputs, train_masks, temp_masks, train_labels, temp_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.4, random_state=42
)
val_inputs, test_inputs, val_masks, test_masks, val_labels, test_labels = train_test_split(
    temp_inputs, temp_masks, temp_labels, test_size=0.5, random_state=42
)

# Create data loaders
batch_size = 64
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Set up optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Initialize early stopping variables
best_val_loss = float('inf')
patience = 3
no_improvement_count = 0

# Training loop
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Use tqdm for batch progress
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

    # Validation
    model.eval()
    total_val_loss = 0

    for batch in val_dataloader:
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_masks,
                labels=batch_labels
            )
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Validation Loss: {avg_val_loss}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improvement_count = 0
    else:
        no_improvement_count += 1

    if no_improvement_count >= patience:
        print("Early stopping triggered")
        break


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Value: DEF, Number: 0
Value: RISK, Number: 1
Value: MIT, Number: 2
Value: ACT, Number: 3
Value: ENT, Number: 4
Value: TECH, Number: 5
Value: FS, Number: 6
Value: PERM, Number: 7
Value: PROD, Number: 8


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 1/50, Average Training Loss: 1.3723881984579152
Validation Loss: 0.973814969432765


Epoch 2/50: 100%|██████████| 174/174 [44:53<00:00, 15.48s/it]


Epoch 2/50, Average Training Loss: 0.8686003595933147
Validation Loss: 0.7734538070086775


Epoch 3/50: 100%|██████████| 174/174 [45:41<00:00, 15.76s/it]


Epoch 3/50, Average Training Loss: 0.7098522116055433
Validation Loss: 0.7363370831670433


Epoch 4/50: 100%|██████████| 174/174 [45:12<00:00, 15.59s/it]


Epoch 4/50, Average Training Loss: 0.6306243179173305
Validation Loss: 0.7284580027234966


Epoch 5/50: 100%|██████████| 174/174 [45:31<00:00, 15.70s/it]


Epoch 5/50, Average Training Loss: 0.5698804065756414
Validation Loss: 0.7620654599419956


Epoch 6/50: 100%|██████████| 174/174 [45:02<00:00, 15.53s/it]


Epoch 6/50, Average Training Loss: 0.5311671903078583
Validation Loss: 0.7933684361392054


Epoch 7/50: 100%|██████████| 174/174 [45:33<00:00, 15.71s/it]


Epoch 7/50, Average Training Loss: 0.501144253316967
Validation Loss: 0.8186538907988318
Early stopping triggered


In [3]:
from sklearn.metrics import accuracy_score,classification_report

# Testing
model.eval()
total_test_loss = 0
predicted_labels = []
true_labels = []

for batch in test_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels
        )
        loss = outputs.loss
        total_test_loss += loss.item()

        logits = outputs.logits
        predicted_labels.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())

avg_test_loss = total_test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

# Calculate test accuracy
test_accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {test_accuracy}")

# Generate classification report
class_names = [value for value, number in unique_values_dict.items()]
print(classification_report(true_labels, predicted_labels, target_names=class_names))

# Save the trained model
model.save_pretrained("/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/bert_phrase_classification_model")

Test Loss: 0.809679130028034
Test Accuracy: 0.6204754186925986
              precision    recall  f1-score   support

         DEF       0.56      0.72      0.63      1174
        RISK       0.65      0.61      0.63       371
         MIT       0.83      0.76      0.79       673
         ACT       0.65      0.67      0.66       256
         ENT       0.52      0.27      0.35       567
        TECH       0.49      0.54      0.51       312
          FS       0.71      0.67      0.69       283
        PERM       0.25      0.26      0.26        19
        PROD       0.66      0.45      0.53        47

    accuracy                           0.62      3702
   macro avg       0.59      0.55      0.56      3702
weighted avg       0.62      0.62      0.61      3702



In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json
import os

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved model and tokenizer
model_dir = "/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/bert_phrase_classification_model"  # Path to the saved model directory

# Check if the directory exists
if not os.path.exists(model_dir):
    print(f"Directory '{model_dir}' does not exist.")
    exit()

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(model_dir).to(device)

# Load your JSON data
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/COBS_ParagraphsPhrasesTagsSample100.json', 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

# Prepare data for inference
sentences = [item["text"] for item in data]
phrases = [item["phrase"] for item in data]
original_labels = [item["t.ttype"] for item in data]

# Tokenize and convert data to model input format
input_ids = []
attention_masks = []

for sentence, phrase in zip(sentences, phrases):
    encoded_dict = tokenizer.encode_plus(
        phrase,
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
        truncation=True,
    )
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])

# Convert the lists to PyTorch tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Inference
model.eval()
predicted_labels = []

for i in range(len(input_ids)):
    input_id = input_ids[i].to(device).unsqueeze(0)
    attention_mask = attention_masks[i].to(device).unsqueeze(0)

    labelValues=['DEF', 'RISK', 'MIT', 'ACT', 'ENT', 'TECH', 'FS', 'PERM', 'PROD']
    with torch.no_grad():
        outputs = model(input_ids=input_id, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()
        predicted_labels.append(labelValues[predicted_label])

# Print the original labels and predicted labels
for i, (original, predicted) in enumerate(zip(original_labels, predicted_labels)):
    print(f"Example {i + 1}:")
    print(f"Sentence: {sentences[i]}")
    print(f"Phrase: {phrases[i]}")
    print(f"Original Label: {original}")
    print(f"Predicted Label: {predicted}\n")




Example 1:
Sentence: $5,000 for each Cell Company; and 
Phrase: Cell
Original Label: DEF
Predicted Label: ENT

Example 2:
Sentence: $5,000 for each Cell Company; and 
Phrase: $5,000
Original Label: MIT
Predicted Label: MIT

Example 3:
Sentence: $5,000 for each Cell Company; and 
Phrase: Cell Company
Original Label: ENT
Predicted Label: ENT

Example 4:
Sentence: Applicants referred to in Rule 3.2.1 must also pay an annual supervision fee of $30,000 and an additional annual supervision fee of $5,000 for each additional Regulated Activity for which it has obtained a Financial Services Permission. 
Phrase: Regulated Activity
Original Label: DEF
Predicted Label: ACT

Example 5:
Sentence: Applicants referred to in Rule 3.2.1 must also pay an annual supervision fee of $30,000 and an additional annual supervision fee of $5,000 for each additional Regulated Activity for which it has obtained a Financial Services Permission. 
Phrase: fee
Original Label: DEF
Predicted Label: DEF

Example 6:
Sente