<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/MultilabelClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install tqdm

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define the label types
label_types = ["PERM", "DEF", "RISK", "MIT", "ENT", "ACT", "FS", "PROD", "TECH"]

# Load your JSON data
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/records.json', 'r' , encoding='utf-8-sig') as json_file:
    data = json.load(json_file)

# Prepare your dataset
texts = []  # Store text data
labels = []  # Store binary-encoded labels

# Convert label data into binary format
for item in data:
    text = item["Paragraph"]
    text = text.lower()  # Convert text to lowercase
    tags = item["Tags"]
    label = [0] * len(label_types)

    for tag in tags:
        tag_type = tag["Type"]
        if tag_type in label_types:
            label[label_types.index(tag_type)] = 1

    texts.append(text)
    labels.append(label)

# Split the dataset into training and testing
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize the text data
max_seq_length = 128  # Adjust as needed
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_seq_length, return_tensors='pt', return_attention_mask=True)
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_seq_length, return_tensors='pt', return_attention_mask=True)

# Create PyTorch Dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, torch.tensor(train_labels, dtype=torch.float32))
test_dataset = CustomDataset(test_encodings, torch.tensor(test_labels, dtype=torch.float32))

# Now you can use train_dataset and test_dataset to train and evaluate your BERT-based multi-label classification model.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm  # Import tqdm for the progress bar

# Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_types))

# Define hyperparameters
batch_size = 64  # Change the batch size
learning_rate = 2e-5
num_epochs = 50  # Increase the number of epochs
early_stop_patience = 3  # Number of epochs to wait for improvement before early stopping

# Initialize optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

best_f1_score = 0.0
early_stop_counter = 0

# Training loop with early stopping and progress bar
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch + 1}")
    for i, batch in progress_bar:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix({'Loss': total_loss / (i + 1)})  # Update the progress bar

    print(f"Epoch {epoch + 1} - Average Loss: {total_loss / len(train_loader)}")

    # Evaluate the model and calculate F1 score on the validation set
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.sigmoid(logits).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    f1 = f1_score(np.array(true_labels), np.array(predictions) > 0.5, average='micro')

    print(f"Epoch {epoch + 1} - F1 Score: {f1}")

    # Check for early stopping
    if f1 > best_f1_score:
        best_f1_score = f1
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= early_stop_patience:
            print(f"Early stopping at epoch {epoch + 1} as F1 score did not improve.")
            break


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 20/20 [04:19<00:00, 12.96s/it, Loss=0.584]


Epoch 1 - Average Loss: 0.5840061217546463
Epoch 1 - F1 Score: 0.7408440878967563


Epoch 2: 100%|██████████| 20/20 [04:02<00:00, 12.11s/it, Loss=0.516]


Epoch 2 - Average Loss: 0.5160444915294647
Epoch 2 - F1 Score: 0.7834843907351461


Epoch 3: 100%|██████████| 20/20 [04:02<00:00, 12.12s/it, Loss=0.481]


Epoch 3 - Average Loss: 0.48060659170150755
Epoch 3 - F1 Score: 0.8186226964112512


Epoch 4: 100%|██████████| 20/20 [03:59<00:00, 11.96s/it, Loss=0.448]


Epoch 4 - Average Loss: 0.44779098927974703
Epoch 4 - F1 Score: 0.8305400372439479


Epoch 5: 100%|██████████| 20/20 [04:03<00:00, 12.16s/it, Loss=0.417]


Epoch 5 - Average Loss: 0.4168806418776512
Epoch 5 - F1 Score: 0.840616966580977


Epoch 6: 100%|██████████| 20/20 [03:57<00:00, 11.90s/it, Loss=0.394]


Epoch 6 - Average Loss: 0.39377537220716474
Epoch 6 - F1 Score: 0.8449059052563271


Epoch 7: 100%|██████████| 20/20 [03:56<00:00, 11.84s/it, Loss=0.369]


Epoch 7 - Average Loss: 0.3689889833331108
Epoch 7 - F1 Score: 0.8536745406824147


Epoch 8: 100%|██████████| 20/20 [03:56<00:00, 11.80s/it, Loss=0.348]


Epoch 8 - Average Loss: 0.34754258543252947
Epoch 8 - F1 Score: 0.8581150051037768


Epoch 9: 100%|██████████| 20/20 [04:01<00:00, 12.09s/it, Loss=0.324]


Epoch 9 - Average Loss: 0.32368195950984957
Epoch 9 - F1 Score: 0.8689839572192513


Epoch 10: 100%|██████████| 20/20 [04:01<00:00, 12.08s/it, Loss=0.301]


Epoch 10 - Average Loss: 0.30130247473716737
Epoch 10 - F1 Score: 0.8749172733289212


Epoch 11: 100%|██████████| 20/20 [04:04<00:00, 12.23s/it, Loss=0.28]


Epoch 11 - Average Loss: 0.27951791882514954
Epoch 11 - F1 Score: 0.8784604996623903


Epoch 12: 100%|██████████| 20/20 [04:01<00:00, 12.08s/it, Loss=0.26]


Epoch 12 - Average Loss: 0.26001131236553193
Epoch 12 - F1 Score: 0.8850383205598134


Epoch 13: 100%|██████████| 20/20 [04:04<00:00, 12.20s/it, Loss=0.234]


Epoch 13 - Average Loss: 0.23449786901473998
Epoch 13 - F1 Score: 0.8871981475355607


Epoch 14: 100%|██████████| 20/20 [04:06<00:00, 12.30s/it, Loss=0.219]


Epoch 14 - Average Loss: 0.21894494742155074
Epoch 14 - F1 Score: 0.8947885939036381


Epoch 15: 100%|██████████| 20/20 [03:59<00:00, 11.99s/it, Loss=0.198]


Epoch 15 - Average Loss: 0.19802170842885972
Epoch 15 - F1 Score: 0.890909090909091


Epoch 16: 100%|██████████| 20/20 [04:01<00:00, 12.08s/it, Loss=0.181]


Epoch 16 - Average Loss: 0.18056950271129607
Epoch 16 - F1 Score: 0.893687707641196


Epoch 17: 100%|██████████| 20/20 [04:00<00:00, 12.03s/it, Loss=0.166]


Epoch 17 - Average Loss: 0.16635391786694526
Epoch 17 - F1 Score: 0.9004300363876943


Epoch 18: 100%|██████████| 20/20 [04:00<00:00, 12.05s/it, Loss=0.152]


Epoch 18 - Average Loss: 0.15166711807250977
Epoch 18 - F1 Score: 0.8947191389169189


Epoch 19: 100%|██████████| 20/20 [03:59<00:00, 11.96s/it, Loss=0.138]


Epoch 19 - Average Loss: 0.13756681084632874
Epoch 19 - F1 Score: 0.8990514905149051


Epoch 20: 100%|██████████| 20/20 [03:56<00:00, 11.84s/it, Loss=0.125]


Epoch 20 - Average Loss: 0.12485768459737301
Epoch 20 - F1 Score: 0.8972900635664102
Early stopping at epoch 20 as F1 score did not improve.


In [5]:
# Evaluation
model.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels.extend(torch.sigmoid(logits).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

predicted_labels = np.array(predicted_labels)
true_labels = np.array(true_labels)

# Compute F1-score for each label
f1_scores = f1_score(true_labels, (predicted_labels > 0.5), average='micro')
print("Micro F1-Score:", f1_scores)


Micro F1-Score: 0.8972900635664102


In [7]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Input sentence
sentence = "the aggregate of each of the following in, or relating to, the Clients portfolio at the close of business on the valuation date."
# Tokenize the sentence
tokens = nltk.word_tokenize(sentence)

# Perform part-of-speech tagging
pos_tags = nltk.pos_tag(tokens)

# Create a grammar for phrase structure parsing
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}
    PP: {<IN><NP>}
    VP: {<VB.*><NP|PP|CLAUSE>+$}
    CLAUSE: {<NP><VP>}
"""
chunk_parser = nltk.RegexpParser(grammar)

# Parse the sentence
tree = chunk_parser.parse(pos_tags)

# Initialize a list to store the phrases
phrases = []

# Define a function to extract phrases from the tree
def extract_phrases(t):
    if isinstance(t, nltk.Tree):
        phrase = " ".join([word for word, tag in t.leaves()])
        phrases.append(phrase)
        for subtree in t:
            extract_phrases(subtree)

# Extract phrases from the tree
extract_phrases(tree)

# Print the extracted phrases
#for phrase in phrases:
#    print(phrase)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
import torch

for phrase in phrases:
  # Tokenize the input sentence
  #input_sentence = "the aggregate of each of the following in, or relating to, the Clients portfolio at the close of business on the valuation date."
  tokenized_input = tokenizer(phrase, truncation=True, padding='max_length', max_length=max_seq_length, return_tensors='pt', return_attention_mask=True)

  # Pass the tokenized input through the model
  model.eval()
  with torch.no_grad():
      output = model(**tokenized_input)

  # Interpret the model's output
  logits = output.logits
  predicted_labels = torch.sigmoid(logits).cpu().numpy()[0]  # Assuming you have a single input sentence

  # Identify the predicted phrases and labels
  predicted_phrases_and_labels = []
  for i, label_type in enumerate(label_types):
      if predicted_labels[i] > 0.5:
          predicted_phrases_and_labels.append({
              "Type": label_type,
              "Phrase": phrase  # In this example, we assume the entire sentence is the phrase
          })

  # Display the predicted phrases and labels
  print("Predicted Phrases and Labels:")
  for item in predicted_phrases_and_labels:
      print(f"Type: {item['Type']}, Phrase: {item['Phrase']}")
  print(" ")


Predicted Phrases and Labels:
Type: DEF, Phrase: the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
Type: RISK, Phrase: the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
Type: MIT, Phrase: the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
Type: ENT, Phrase: the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
Type: ACT, Phrase: the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
Type: FS, Phrase: the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
 
Predicted Phrases and Labels:
Type: MIT, Phrase: the aggregate

In [9]:
# Print the extracted phrases
for phrase in phrases:
    print(phrase)

the aggregate of each of the following in , or relating to , the Clients portfolio at the close of business on the valuation date .
the aggregate
of the following
the following
portfolio
at the close
the close
of business
business
on the valuation
the valuation
date


In [42]:
import torch

# Read the JSON file
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/samplePhrasesandTags100.json', 'r', encoding='utf-8-sig') as json_file:
    sample_data = json.load(json_file)

# Extract values from the JSON data and populate the lists
SamplePhrases = [data['phrase'] for data in sample_data]
SampleTags = [data['label_'] for data in sample_data]


for t in range(len(SamplePhrases)):
  print(("KG: " +SampleTags[t]))
  # Tokenize the input sentence
  #input_sentence = "the aggregate of each of the following in, or relating to, the Clients portfolio at the close of business on the valuation date."
  tokenized_input = tokenizer(SamplePhrases[t], truncation=True, padding='max_length', max_length=max_seq_length, return_tensors='pt', return_attention_mask=True)

  # Pass the tokenized input through the model
  model.eval()
  with torch.no_grad():
      output = model(**tokenized_input)

  # Interpret the model's output
  logits = output.logits
  predicted_labels = torch.sigmoid(logits).cpu().numpy()[0]


    # Find the index of the label type with the highest prediction
  highest_label_index = np.argmax(predicted_labels)

  # Save the related data for the highest label
  predicted_phrases_and_labels = [{
      "Type": label_types[highest_label_index],
      "Phrase": SamplePhrases[t],
      "Score":predicted_labels[highest_label_index]
  }]

  # Display the predicted phrases and labels
  prediction = predicted_phrases_and_labels[0]
  print(f"Type: {prediction['Type']}, Phrase: {prediction['Phrase']}")
  print(f"Score: {prediction['Score']}")
  print(" ")


KG: FS
Type: DEF, Phrase: indefinite
Score: 0.2704387307167053
 
KG: MIT
Type: DEF, Phrase: Executive/CE
Score: 0.7113099694252014
 
KG: TECH
Type: RISK, Phrase: Illustrative Regulatory Framework
Score: 0.7245135307312012
 
KG: MIT
Type: MIT, Phrase: might take in the medium term (i.e. five to ten years
Score: 0.7386154532432556
 
KG: MIT
Type: MIT, Phrase: We propose to include a section in the concept paper that describes a future
Score: 0.9279728531837463
 
KG: RISK
Type: DEF, Phrase: an example of a DeFi insurance protocol
Score: 0.6123743653297424
 
KG: MIT
Type: RISK, Phrase: except that this tends to focus on risks specific to DeFi such as the risk of a hack or of a failure in a smart contract
Score: 0.9413577318191528
 
KG: FS
Type: PERM, Phrase: insurance
Score: 0.17339462041854858
 
KG: ACT
Type: MIT, Phrase: hack
Score: 0.3011421859264374
 
KG: FS
Type: DEF, Phrase: market
Score: 0.2925827205181122
 
KG: DEF
Type: DEF, Phrase: regulators
Score: 0.3736037313938141
 
KG: MIT
T

In [43]:
import torch

# Read the JSON file
with open('/content/drive/Othercomputers/MBZUAI/MBZUAI/Codes/samplePhrasesandTags100.json', 'r', encoding='utf-8-sig') as json_file:
    sample_data = json.load(json_file)

# Extract values from the JSON data and populate the lists
SamplePhrases = [data['phrase'] for data in sample_data]
SampleTags = [data['label_'] for data in sample_data]


for t in range(len(SamplePhrases)):
  print(("KG: " +SampleTags[t]))
  # Tokenize the input sentence
  #input_sentence = "the aggregate of each of the following in, or relating to, the Clients portfolio at the close of business on the valuation date."
  tokenized_input = tokenizer(SamplePhrases[t], truncation=True, padding='max_length', max_length=max_seq_length, return_tensors='pt', return_attention_mask=True)

  # Pass the tokenized input through the model
  model.eval()
  with torch.no_grad():
      output = model(**tokenized_input)

  # Interpret the model's output
  logits = output.logits
  predicted_labels = torch.sigmoid(logits).cpu().numpy()[0]

  # Identify the predicted phrases and labels
  predicted_phrases_and_labels = []
  for i, label_type in enumerate(label_types):
      if predicted_labels[i] > 0.5:
          predicted_phrases_and_labels.append({
              "Type": label_type,
              "Phrase": SamplePhrases[t]  # In this example, we assume the entire sentence is the phrase
          })

  # Display the predicted phrases and labels
  print("Predicted Phrases and Labels:")
  for item in predicted_phrases_and_labels:
      print(f"Type: {item['Type']}, Phrase: {item['Phrase']}")
  print(" ")



KG: FS
Predicted Phrases and Labels:
 
KG: MIT
Predicted Phrases and Labels:
Type: DEF, Phrase: Executive/CE
 
KG: TECH
Predicted Phrases and Labels:
Type: DEF, Phrase: Illustrative Regulatory Framework
Type: RISK, Phrase: Illustrative Regulatory Framework
 
KG: MIT
Predicted Phrases and Labels:
Type: MIT, Phrase: might take in the medium term (i.e. five to ten years
 
KG: MIT
Predicted Phrases and Labels:
Type: MIT, Phrase: We propose to include a section in the concept paper that describes a future
Type: TECH, Phrase: We propose to include a section in the concept paper that describes a future
 
KG: RISK
Predicted Phrases and Labels:
Type: DEF, Phrase: an example of a DeFi insurance protocol
 
KG: MIT
Predicted Phrases and Labels:
Type: DEF, Phrase: except that this tends to focus on risks specific to DeFi such as the risk of a hack or of a failure in a smart contract
Type: RISK, Phrase: except that this tends to focus on risks specific to DeFi such as the risk of a hack or of a fail