<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/TextClassificationWithoutDistinctionWithLemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install tqdm



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load your JSON data
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/COBS_All_Labels_PhraseswithLemma.json', 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

# Extract labels and phrases
labels = [item["label_"] for item in data]
phrases = [item["phrase"] for item in data]

# Split the data into training, validation, and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    phrases, labels, test_size=0.2, random_state=42
)

# Further split the training data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Load BERT tokenizer and encode the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

unique_labels = set(labels)
num_classes = len(unique_labels)
print(f"Number of unique classes: {num_classes}")

# Create BERT-based text classification model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

# Move the model and data to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_encodings = {key: value.to(device) for key, value in train_encodings.items()}
val_encodings = {key: value.to(device) for key, value in val_encodings.items()}
test_encodings = {key: value.to(device) for key, value in test_encodings.items()}
train_labels_encoded = torch.tensor(train_labels_encoded, device=device)
val_labels_encoded = torch.tensor(val_labels_encoded, device=device)
test_labels_encoded = torch.tensor(test_labels_encoded, device=device)

# Fine-tuning hyperparameters
batch_size = 64
max_seq_length = 128
learning_rate = 2e-5
num_epochs = 50
gradient_accumulation_steps = 4

# Create data loaders
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels_encoded)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Optimization and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Early stopping parameters
patience = 3
best_val_accuracy = 0
no_improvement_count = 0

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    tqdm_data = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, batch in tqdm_data:
        input_ids, attention_mask, labels = batch
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
        outputs = model(**inputs)
        loss = outputs.loss

        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)

        tqdm_data.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch + 1} - Average Loss: {total_loss / len(train_loader)}")

    # Evaluation on validation data after each epoch
    model.eval()
    with torch.no_grad():
        val_outputs = model(**val_encodings)
        val_predicted_labels = torch.argmax(val_outputs.logits, dim=1)
        val_accuracy = accuracy_score(val_labels_encoded.cpu().numpy(), val_predicted_labels.cpu().numpy())
        print(f"Validation Accuracy (Epoch {epoch+1}): {val_accuracy:.4f}")
        f1 = f1_score(val_labels_encoded.cpu().numpy(), val_predicted_labels.cpu().numpy(), average='micro')
        print(f"Epoch {epoch + 1} - F1 Score: {f1}")

        # Implement early stopping
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            no_improvement_count = 0

            # Save the model checkpoint with the best validation accuracy
            model_checkpoint_path = '/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/best_model.pt'
            torch.save(model.state_dict(), model_checkpoint_path)
            print("Saved model checkpoint with validation accuracy: {:.4f}".format(val_accuracy))
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print("Early stopping triggered. No improvement for {} epochs.".format(patience))
                break  # Stop training


Number of unique classes: 9


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/50, Loss: 1.6251: 100%|██████████| 209/209 [00:39<00:00,  5.23it/s]


Epoch 1 - Average Loss: 1.6250700625506314
Validation Accuracy (Epoch 1): 0.5321
Epoch 1 - F1 Score: 0.5320729237002025
Saved model checkpoint with validation accuracy: 0.5321


Epoch 2/50, Loss: 1.0302: 100%|██████████| 209/209 [00:39<00:00,  5.34it/s]


Epoch 2 - Average Loss: 1.0301906857193943
Validation Accuracy (Epoch 2): 0.5901
Epoch 2 - F1 Score: 0.5901417960837272
Saved model checkpoint with validation accuracy: 0.5901


Epoch 3/50, Loss: 0.8330: 100%|██████████| 209/209 [00:39<00:00,  5.34it/s]


Epoch 3 - Average Loss: 0.8330413522332479
Validation Accuracy (Epoch 3): 0.5949
Epoch 3 - F1 Score: 0.5948683322079676
Saved model checkpoint with validation accuracy: 0.5949


Epoch 4/50, Loss: 0.7219: 100%|██████████| 209/209 [00:39<00:00,  5.33it/s]


Epoch 4 - Average Loss: 0.7218837404365175
Validation Accuracy (Epoch 4): 0.6144
Epoch 4 - F1 Score: 0.6144496961512491
Saved model checkpoint with validation accuracy: 0.6144


Epoch 5/50, Loss: 0.6475: 100%|██████████| 209/209 [00:39<00:00,  5.33it/s]


Epoch 5 - Average Loss: 0.6474618602312353
Validation Accuracy (Epoch 5): 0.6205
Epoch 5 - F1 Score: 0.6205266711681297
Saved model checkpoint with validation accuracy: 0.6205


Epoch 6/50, Loss: 0.6056: 100%|██████████| 209/209 [00:39<00:00,  5.33it/s]


Epoch 6 - Average Loss: 0.605626612473903
Validation Accuracy (Epoch 6): 0.6158
Epoch 6 - F1 Score: 0.6158001350438893


Epoch 7/50, Loss: 0.5814: 100%|██████████| 209/209 [00:39<00:00,  5.33it/s]


Epoch 7 - Average Loss: 0.5814020284340142
Validation Accuracy (Epoch 7): 0.6111
Epoch 7 - F1 Score: 0.6110735989196489


Epoch 8/50, Loss: 0.5715: 100%|██████████| 209/209 [00:39<00:00,  5.33it/s]


Epoch 8 - Average Loss: 0.5715060583427192
Validation Accuracy (Epoch 8): 0.6172
Epoch 8 - F1 Score: 0.6171505739365294
Early stopping triggered. No improvement for 3 epochs.


In [4]:
# Load the saved model for evaluation on the GPU
loaded_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
loaded_model.load_state_dict(torch.load(model_checkpoint_path))
loaded_model.to(device)  # Move the loaded model to the GPU
loaded_model.eval()  # Set the loaded model in evaluation mode for inference

# Evaluation on test data
from sklearn.metrics import classification_report, f1_score

with torch.no_grad():
    test_outputs = loaded_model(**test_encodings)  # Use loaded_model for inference
    test_predicted_labels = torch.argmax(test_outputs.logits, dim=1)
    test_accuracy = accuracy_score(test_labels_encoded.cpu().numpy(), test_predicted_labels.cpu().numpy())
    print(f"Test Accuracy: {test_accuracy:.4f}")
    f1 = f1_score(test_labels_encoded.cpu().numpy(), test_predicted_labels.cpu().numpy(), average='micro')
    print("Micro F1-Score:", f1)

    test_report = classification_report(test_labels_encoded.cpu().numpy(), test_predicted_labels.cpu().numpy())
    print("Test Classification Report:\n", test_report)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.6189
Micro F1-Score: 0.6188546731496488
Test Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.71      0.66       270
           1       0.55      0.82      0.66      1135
           2       0.77      0.12      0.20       598
           3       0.75      0.67      0.70       269
           4       0.79      0.66      0.72       662
           5       0.00      0.00      0.00        18
           6       0.65      0.65      0.65        48
           7       0.58      0.68      0.62       373
           8       0.58      0.60      0.59       329

    accuracy                           0.62      3702
   macro avg       0.59      0.54      0.53      3702
weighted avg       0.65      0.62      0.59      3702



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
unique_labels


{'ACT', 'DEF', 'ENT', 'FS', 'MIT', 'PERM', 'PROD', 'RISK', 'TECH'}

In [6]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

# Load the new JSON file containing phrases
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/samplePhrasesandTags100.json', 'r', encoding='utf-8-sig') as json_file:
    data = json.load(json_file)

# Extract phrases and original labels from the new data
phrases = [item["phrase"] for item in data]
original_labels = [item["label_"] for item in data]

# Preprocess the new data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_phrases = tokenizer(phrases, padding=True, truncation=True, return_tensors="pt")

# Move the new data to the GPU
encoded_phrases = {key: value.to(device) for key, value in encoded_phrases.items()}

# Use the pre-trained model to predict labels
with torch.no_grad():
    model.eval()
    outputs = model(**encoded_phrases)
    predicted_labels = torch.argmax(outputs.logits, dim=1)

# Define your custom label encoder for the missing label 'PERM'
class CustomLabelEncoder:
    def transform(self, labels):
        return [label if label != 'PERM' else '8' for label in labels]

# Create an instance of your custom label encoder
label_encoder = CustomLabelEncoder()

# Decode the predicted labels
predicted_labels_decoded = label_encoder.transform(predicted_labels)

# Define a reverse label mapping
reverse_label_mapping = {
    '8': 'PERM',
    '0': 'ACT',  # Replace '0' with the corresponding label
    '1': 'DEF',
    '2': 'ENT',
    '3': 'FS',
    '4': 'MIT',
    '5': 'PROD',
    '6': 'RISK',
    '7': 'TECH',# Replace '1' with the corresponding label
    # Add mappings for other labels as needed
}

# Convert numeric labels (tensors) to text labels
predicted_labels_text = [reverse_label_mapping[str(label.item())] for label in predicted_labels]

# Print the original phrases and their predicted labels
for phrase, original_label, predicted_label, predicted_label_value in zip(phrases, original_labels, predicted_labels_text, predicted_labels.tolist()):
    print(f"Phrase: {phrase}")
    print(f"KG Label: {original_label}")
    print(f"Predicted Label: {predicted_label}")
    print()

Phrase: indefinite
KG Label: FS
Predicted Label: TECH

Phrase: Executive/CE
KG Label: MIT
Predicted Label: DEF

Phrase: Illustrative Regulatory Framework
KG Label: TECH
Predicted Label: TECH

Phrase: might take in the medium term (i.e. five to ten years
KG Label: MIT
Predicted Label: TECH

Phrase: We propose to include a section in the concept paper that describes a future
KG Label: MIT
Predicted Label: MIT

Phrase: an example of a DeFi insurance protocol
KG Label: RISK
Predicted Label: MIT

Phrase: except that this tends to focus on risks specific to DeFi such as the risk of a hack or of a failure in a smart contract
KG Label: MIT
Predicted Label: TECH

Phrase: insurance
KG Label: FS
Predicted Label: FS

Phrase: hack
KG Label: ACT
Predicted Label: PERM

Phrase: market
KG Label: FS
Predicted Label: FS

Phrase: regulators
KG Label: DEF
Predicted Label: ENT

Phrase: regime from the industry and other regulators to test
KG Label: MIT
Predicted Label: MIT

Phrase: We would invite comment
K