<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/TextClassificationWithoutDistinction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install tqdm

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
Co

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load your JSON data
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/COBS_All_Labels_Phrases.json', 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

# Extract labels and phrases
labels = [item["label_"] for item in data]
phrases = [item["phrase"] for item in data]

# Split the data into training, validation, and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    phrases, labels, test_size=0.2, random_state=42
)

# Further split the training data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Load BERT tokenizer and encode the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

unique_labels = set(labels)
num_classes = len(unique_labels)
print(f"Number of unique classes: {num_classes}")

# Create BERT-based text classification model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

# Move the model and data to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_encodings = {key: value.to(device) for key, value in train_encodings.items()}
val_encodings = {key: value.to(device) for key, value in val_encodings.items()}
test_encodings = {key: value.to(device) for key, value in test_encodings.items()}
train_labels_encoded = torch.tensor(train_labels_encoded, device=device)
val_labels_encoded = torch.tensor(val_labels_encoded, device=device)
test_labels_encoded = torch.tensor(test_labels_encoded, device=device)

# Fine-tuning hyperparameters
batch_size = 64
max_seq_length = 128
learning_rate = 2e-5
num_epochs = 50
gradient_accumulation_steps = 4

# Create data loaders
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels_encoded)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Optimization and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Early stopping parameters
patience = 3
best_val_accuracy = 0
no_improvement_count = 0

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    tqdm_data = tqdm(enumerate(train_loader), total=len(train_loader))
    for step, batch in tqdm_data:
        input_ids, attention_mask, labels = batch
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
        outputs = model(**inputs)
        loss = outputs.loss

        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)

        tqdm_data.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch + 1} - Average Loss: {total_loss / len(train_loader)}")

    # Evaluation on validation data after each epoch
    model.eval()
    with torch.no_grad():
        val_outputs = model(**val_encodings)
        val_predicted_labels = torch.argmax(val_outputs.logits, dim=1)
        val_accuracy = accuracy_score(val_labels_encoded.cpu().numpy(), val_predicted_labels.cpu().numpy())
        print(f"Validation Accuracy (Epoch {epoch+1}): {val_accuracy:.4f}")
        f1 = f1_score(val_labels_encoded.cpu().numpy(), val_predicted_labels.cpu().numpy(), average='micro')
        print(f"Epoch {epoch + 1} - F1 Score: {f1}")

        # Implement early stopping
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            no_improvement_count = 0

            # Save the model checkpoint with the best validation accuracy
            model_checkpoint_path = '/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/best_model.pt'
            torch.save(model.state_dict(), model_checkpoint_path)
            print("Saved model checkpoint with validation accuracy: {:.4f}".format(val_accuracy))
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print("Early stopping triggered. No improvement for {} epochs.".format(patience))
                break  # Stop training


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Number of unique classes: 9


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/50, Loss: 1.6014: 100%|██████████| 209/209 [00:58<00:00,  3.60it/s]


Epoch 1 - Average Loss: 1.6014152229117435
Validation Accuracy (Epoch 1): 0.5577
Epoch 1 - F1 Score: 0.5577312626603647
Saved model checkpoint with validation accuracy: 0.5577


Epoch 2/50, Loss: 0.9572: 100%|██████████| 209/209 [00:54<00:00,  3.86it/s]


Epoch 2 - Average Loss: 0.9572040649692407
Validation Accuracy (Epoch 2): 0.6320
Epoch 2 - F1 Score: 0.6320054017555705
Saved model checkpoint with validation accuracy: 0.6320


Epoch 3/50, Loss: 0.7477: 100%|██████████| 209/209 [00:54<00:00,  3.86it/s]


Epoch 3 - Average Loss: 0.7476993385684547
Validation Accuracy (Epoch 3): 0.6280
Epoch 3 - F1 Score: 0.6279540850776503


Epoch 4/50, Loss: 0.6428: 100%|██████████| 209/209 [00:54<00:00,  3.86it/s]


Epoch 4 - Average Loss: 0.6428411675697309
Validation Accuracy (Epoch 4): 0.6286
Epoch 4 - F1 Score: 0.6286293045239703


Epoch 5/50, Loss: 0.5708: 100%|██████████| 209/209 [00:54<00:00,  3.86it/s]


Epoch 5 - Average Loss: 0.5708056080854681
Validation Accuracy (Epoch 5): 0.6529
Epoch 5 - F1 Score: 0.6529372045914922
Saved model checkpoint with validation accuracy: 0.6529


Epoch 6/50, Loss: 0.5308: 100%|██████████| 209/209 [00:54<00:00,  3.85it/s]


Epoch 6 - Average Loss: 0.5308143363900162
Validation Accuracy (Epoch 6): 0.6489
Epoch 6 - F1 Score: 0.6488858879135719


Epoch 7/50, Loss: 0.5081: 100%|██████████| 209/209 [00:54<00:00,  3.85it/s]


Epoch 7 - Average Loss: 0.5081106056436968
Validation Accuracy (Epoch 7): 0.6583
Epoch 7 - F1 Score: 0.6583389601620526
Saved model checkpoint with validation accuracy: 0.6583


Epoch 8/50, Loss: 0.4975: 100%|██████████| 209/209 [00:54<00:00,  3.85it/s]


Epoch 8 - Average Loss: 0.4974905479182467
Validation Accuracy (Epoch 8): 0.6448
Epoch 8 - F1 Score: 0.6448345712356516


Epoch 9/50, Loss: 0.4888: 100%|██████████| 209/209 [00:54<00:00,  3.85it/s]


Epoch 9 - Average Loss: 0.4888076696669656
Validation Accuracy (Epoch 9): 0.6307
Epoch 9 - F1 Score: 0.6306549628629304


Epoch 10/50, Loss: 0.4801: 100%|██████████| 209/209 [00:54<00:00,  3.85it/s]


Epoch 10 - Average Loss: 0.48012341993847535
Validation Accuracy (Epoch 10): 0.6415
Epoch 10 - F1 Score: 0.6414584740040513
Early stopping triggered. No improvement for 3 epochs.


In [4]:
# Load the saved model for evaluation on the GPU
model_checkpoint_path = '/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/best_model.pt'
loaded_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
loaded_model.load_state_dict(torch.load(model_checkpoint_path))
loaded_model.to(device)  # Move the loaded model to the GPU
loaded_model.eval()  # Set the loaded model in evaluation mode for inference

# Evaluation on test data
from sklearn.metrics import classification_report, f1_score

with torch.no_grad():
    test_outputs = loaded_model(**test_encodings)  # Use loaded_model for inference
    test_predicted_labels = torch.argmax(test_outputs.logits, dim=1)
    test_accuracy = accuracy_score(test_labels_encoded.cpu().numpy(), test_predicted_labels.cpu().numpy())
    print(f"Test Accuracy: {test_accuracy:.4f}")
    f1 = f1_score(test_labels_encoded.cpu().numpy(), test_predicted_labels.cpu().numpy(), average='micro')
    print("Micro F1-Score:", f1)

    # Generate classification report
#class_names = [value for value, number in unique_values_dict.items()]
#print(classification_report(true_labels, predicted_labels, target_names=class_names))


    test_report = classification_report(test_labels_encoded.cpu().numpy(), test_predicted_labels.cpu().numpy())


    print("Test Classification Report:\n", test_report)

    # Value: ACT, Number: 0
    # Value: DEF, Number: 1
    # Value: ENT, Number: 2
    # Value: FS, Number: 3
    # Value: MIT, Number: 4
    # Value: PERM, Number: 5
    # Value: PROD, Number: 6
    # Value: RISK, Number: 7
    # Value: TECH, Number: 8



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.6645
Micro F1-Score: 0.6645056726094003
Test Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.71      0.71       270
           1       0.60      0.74      0.66      1135
           2       0.68      0.24      0.36       598
           3       0.75      0.80      0.78       269
           4       0.81      0.86      0.83       662
           5       0.00      0.00      0.00        18
           6       0.74      0.58      0.65        48
           7       0.67      0.64      0.66       373
           8       0.55      0.69      0.61       329

    accuracy                           0.66      3702
   macro avg       0.61      0.59      0.58      3702
weighted avg       0.67      0.66      0.65      3702



In [None]:
unique_labels


{'ACT', 'DEF', 'ENT', 'FS', 'MIT', 'PERM', 'PROD', 'RISK', 'TECH'}

In [5]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

# Load the new JSON file containing phrases
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/samplePhrasesandTags100.json', 'r', encoding='utf-8-sig') as json_file:
    data = json.load(json_file)

# Extract phrases and original labels from the new data
phrases = [item["phrase"] for item in data]
original_labels = [item["label_"] for item in data]

# Preprocess the new data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_phrases = tokenizer(phrases, padding=True, truncation=True, return_tensors="pt")

# Move the new data to the GPU
encoded_phrases = {key: value.to(device) for key, value in encoded_phrases.items()}

# Use the pre-trained model to predict labels
with torch.no_grad():
    model.eval()
    outputs = model(**encoded_phrases)
    predicted_labels = torch.argmax(outputs.logits, dim=1)

# Define your custom label encoder for the missing label 'PERM'
class CustomLabelEncoder:
    def transform(self, labels):
        return [label if label != 'PERM' else '8' for label in labels]

# Create an instance of your custom label encoder
label_encoder = CustomLabelEncoder()

# Decode the predicted labels
predicted_labels_decoded = label_encoder.transform(predicted_labels)

# Define a reverse label mapping
reverse_label_mapping = {
    '8': 'PERM',
    '0': 'ACT',  # Replace '0' with the corresponding label
    '1': 'DEF',
    '2': 'ENT',
    '3': 'FS',
    '4': 'MIT',
    '5': 'PROD',
    '6': 'RISK',
    '7': 'TECH',# Replace '1' with the corresponding label
    # Add mappings for other labels as needed
}

# Convert numeric labels (tensors) to text labels
predicted_labels_text = [reverse_label_mapping[str(label.item())] for label in predicted_labels]

# Print the original phrases and their predicted labels
for phrase, original_label, predicted_label, predicted_label_value in zip(phrases, original_labels, predicted_labels_text, predicted_labels.tolist()):
    print(f"Phrase: {phrase}")
    print(f"KG Label: {original_label}")
    print(f"Predicted Label: {predicted_label}")
    print()

Phrase: indefinite
KG Label: FS
Predicted Label: TECH

Phrase: Executive/CE
KG Label: MIT
Predicted Label: ENT

Phrase: Illustrative Regulatory Framework
KG Label: TECH
Predicted Label: PERM

Phrase: might take in the medium term (i.e. five to ten years
KG Label: MIT
Predicted Label: MIT

Phrase: We propose to include a section in the concept paper that describes a future
KG Label: MIT
Predicted Label: MIT

Phrase: an example of a DeFi insurance protocol
KG Label: RISK
Predicted Label: MIT

Phrase: except that this tends to focus on risks specific to DeFi such as the risk of a hack or of a failure in a smart contract
KG Label: MIT
Predicted Label: TECH

Phrase: insurance
KG Label: FS
Predicted Label: FS

Phrase: hack
KG Label: ACT
Predicted Label: DEF

Phrase: market
KG Label: FS
Predicted Label: FS

Phrase: regulators
KG Label: DEF
Predicted Label: ENT

Phrase: regime from the industry and other regulators to test
KG Label: MIT
Predicted Label: MIT

Phrase: We would invite comment
KG 