In [1]:
# Record the library versions this notebook was run with; the tuple is shown
# as the cell output.
import accelerate
import transformers

transformers.__version__, accelerate.__version__

  from .autonotebook import tqdm as notebook_tqdm


('4.31.0', '0.24.1')

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, AdamW
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F  # Import the functional module to apply softmax

In [3]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

In [4]:
# Report whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
torch.cuda.is_available()

True

# Model Architecture

In [5]:
class ManageDataset(Dataset):
    """Sentence dataset yielding BERT inputs plus a mask over "manag" word pieces.

    Every item is a dict with:
        input_ids / attention_mask: tokenizer output for the sentence, padded
            or truncated to 512 positions.
        manag_mask: long tensor shaped like ``input_ids`` with 1 on positions
            attributed to whitespace-separated words containing "manag".
        labels: the sentence's label as a long tensor.
    """

    def __init__(self, tokenizer, sentences, labels):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]

        # Encode the whole sentence to a fixed length of 512 positions.
        encoded = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding='max_length',
            max_length=512,
        )
        input_ids = encoded["input_ids"][0]

        return {
            "input_ids": input_ids,
            "attention_mask": encoded["attention_mask"][0],
            "manag_mask": self._get_manag_mask(sentence, input_ids),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

    def _get_manag_mask(self, sentence, token_ids):
        """Build a 0/1 mask over ``token_ids`` for words that contain "manag".

        The sentence is split on whitespace and each lowercased word is
        tokenized on its own; a running cursor accumulates the number of
        pieces seen so far and is used as the starting point of a forward
        scan for the word inside ``token_ids``.

        NOTE(review): the cursor counts word pieces only, so it does not
        include any special token the tokenizer may prepend, and the scan
        stops at the first token equal to ANY piece of the word. A repeated
        word could therefore re-match an earlier occurrence - TODO confirm
        whether that matters for the data used here.
        """
        mask = torch.zeros_like(token_ids, dtype=torch.long)
        n_tokens = len(token_ids)
        cursor = 0

        for raw_word in sentence.split():
            lowered = raw_word.lower()
            pieces = self.tokenizer.tokenize(lowered)

            if "manag" in lowered:
                # Scan forward from the cursor until a token matches one of
                # this word's pieces (or the sequence is exhausted).
                pos = cursor
                while pos < n_tokens:
                    token = self.tokenizer.convert_ids_to_tokens(token_ids[pos].item())
                    if token in pieces:
                        break
                    pos += 1

                # Flag as many positions as the word has pieces, clipped to
                # the end of the sequence.
                stop = min(pos + len(pieces), n_tokens)
                mask[pos:stop] = 1

            cursor += len(pieces)

        return mask

In [6]:
from transformers import BertModel, BertTokenizer, BertConfig
import torch.nn as nn
import os

class BERTForManageClassification(nn.Module):
    """BERT encoder with a 2-way linear head over the "manag"-masked token states.

    The hidden states of the positions flagged by ``manag_mask`` are averaged
    per example and fed to a linear classifier with two outputs.
    """

    # File name used by save_pretrained() / from_pretrained() inside the directory.
    WEIGHTS_NAME = "pytorch_model.bin"

    def __init__(self, load_bert_from_pretrained=True):
        """Build the encoder and classification head.

        Args:
            load_bert_from_pretrained: when True, initialise the encoder from
                the 'bert-base-uncased' checkpoint; when False, build it from
                a default ``BertConfig`` (from_pretrained() uses this path and
                then overwrites the weights from a saved state dict).
        """
        super().__init__()
        if load_bert_from_pretrained:
            self.bert = BertModel.from_pretrained('bert-base-uncased')
        else:
            self.bert = BertModel(BertConfig())
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)  # 2 classes: A and B
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, manag_mask, labels=None):
        """Classify each example from the mean of its masked token states.

        Args:
            input_ids, attention_mask: passed straight to the BERT encoder.
            manag_mask: 0/1 tensor with the same leading two dimensions as the
                encoder output; selects the positions to average.
            labels: optional class indices. When omitted (pure inference) no
                loss is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is the cross-entropy loss, or
            ``None`` if ``labels`` was not given.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Zero out every position not flagged by the mask, then average over
        # the flagged ones. The trailing singleton dimension broadcasts over
        # the hidden size.
        weights = manag_mask.unsqueeze(-1).float()
        sum_embeddings = (last_hidden_state * weights).sum(dim=1)
        # clamp(min=1) avoids a division by zero when no position is flagged;
        # such examples end up with an all-zero mean vector.
        num_manag_tokens = manag_mask.sum(dim=1).clamp(min=1)
        mean_embeddings = sum_embeddings / num_manag_tokens.unsqueeze(-1)

        logits = self.classifier(mean_embeddings)
        loss = self.loss_fn(logits, labels) if labels is not None else None

        return loss, logits

    # for saving and loading model
    def save_pretrained(self, save_directory):
        """Write the model's state dict to ``save_directory`` (created if missing)."""
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, self.WEIGHTS_NAME))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path):
        """Rebuild the model from a directory written by save_pretrained().

        The state dict is always deserialised onto the CPU, so a checkpoint
        saved from any GPU index loads regardless of the current machine; the
        model is then moved to CUDA when it is available.

        Returns:
            The loaded model, on 'cuda' if available and on the CPU otherwise.
        """
        model = cls(load_bert_from_pretrained=False)

        weights_path = os.path.join(pretrained_model_name_or_path, cls.WEIGHTS_NAME)
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        state_dict = torch.load(weights_path, map_location="cpu")
        model.load_state_dict(state_dict)

        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return model.to(target)

In [7]:
def infer(sentences, model, tokenizer, batch_size=300, device=None):
    """Predict a class and its softmax confidence for every sentence.

    Args:
        sentences: sequence of sentence strings.
        model: module called as ``model(input_ids, attention_mask, manag_mask,
            labels)`` and returning ``(loss, logits)``.
        tokenizer: tokenizer handed to ``ManageDataset``.
        batch_size: number of sentences per forward pass.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            notebook-level ``device`` variable having been defined.

    Returns:
        ``(pred_labels, confidences)``: two lists with one entry per sentence,
        the argmax class index and the softmax probability of that class.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Dummy labels: the dataset requires labels, but they do not influence the logits.
    dataset = ManageDataset(tokenizer, sentences, [0] * len(sentences))
    loader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    pred_labels = []
    confidences = []  # To store the prediction confidences

    with torch.no_grad():
        for batch in tqdm(loader, desc="Inferencing", unit="batch"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            manag_mask = batch["manag_mask"].to(device)
            labels = batch["labels"].to(device)

            _, logits = model(input_ids, attention_mask, manag_mask, labels)

            # Convert logits to probabilities using softmax
            probs = F.softmax(logits, dim=1)

            # Predicted class per example and the probability assigned to it
            preds = torch.argmax(logits, dim=1)
            conf = probs.gather(1, preds.unsqueeze(1)).squeeze(1)

            pred_labels.extend(preds.tolist())
            confidences.extend(conf.tolist())

    return pred_labels, confidences

# Inference

In [8]:
# load model from saved file
# Pick the compute device first: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the saved tokenizer and classifier from the checkpoint directory.
save_directory = "/home/ec2-user/SageMaker/data/BERT/finetune_bert_WSD_v2/"
loaded_tokenizer = BertTokenizer.from_pretrained(save_directory)
loaded_model = BERTForManageClassification.from_pretrained(save_directory)

# Last expression of the cell: moves the model to `device` and displays its module tree.
loaded_model.to(device)

BERTForManageClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [9]:
# Release unused cached GPU memory held by PyTorch's allocator before running inference.
torch.cuda.empty_cache()

In [11]:
def processing_piepline(i, output_dir=None, skip_existing=False):
    """Run inference on one pickled sentence chunk and pickle the results.

    Reads ``verb_sent_v{i}.pkl`` from the data directory, runs ``infer`` with
    the notebook's ``loaded_model`` / ``loaded_tokenizer``, and writes
    ``predictions_v{i}.pkl`` and ``confidence_scores_v{i}.pkl`` next to it.

    Args:
        i: chunk index used in the file names.
        output_dir: directory holding the input and output files. Defaults to
            the notebook-level ``output_path`` variable.
        skip_existing: when True, do nothing if both output files for this
            chunk already exist, so an interrupted run can be resumed.
    """
    base_dir = output_path if output_dir is None else output_dir
    sentences_file = os.path.join(base_dir, f"verb_sent_v{i}.pkl")
    predictions_file = os.path.join(base_dir, f"predictions_v{i}.pkl")
    confidences_file = os.path.join(base_dir, f"confidence_scores_v{i}.pkl")

    if skip_existing and os.path.exists(predictions_file) and os.path.exists(confidences_file):
        print(f"Skipping chunk {i}: outputs already exist")
        return

    # NOTE: pickle.load can execute arbitrary code - only read files you produced yourself.
    with open(sentences_file, "rb") as f:
        total_sentences = pickle.load(f)
    print("Total Number of Sentences", len(total_sentences))

    predictions, confidence_scores = infer(total_sentences, loaded_model, loaded_tokenizer)

    with open(predictions_file, "wb") as f:
        pickle.dump(predictions, f)
    with open(confidences_file, "wb") as f:
        pickle.dump(confidence_scores, f)

In [None]:
# Point the pipeline at the NYT-91-20 directory (processing_piepline reads the
# notebook-level `output_path`) and process sentence chunks v0 through v3.
# NOTE(review): the saved output of this cell stops partway through the third
# chunk, so the recorded run did not complete.
output_path = '/home/ec2-user/SageMaker/data/NYT-91-20_processed/'
for i in range(4):
    processing_piepline(i)

Total Number of Sentences 168515


Inferencing: 100%|██████████| 562/562 [48:52<00:00,  5.22s/batch]


Total Number of Sentences 168309


Inferencing: 100%|██████████| 562/562 [48:46<00:00,  5.21s/batch]


Total Number of Sentences 167920


Inferencing:  44%|████▍     | 245/560 [21:19<27:23,  5.22s/batch]

In [None]:
# Re-point `output_path` at the NYT-50-90 directory and process sentence
# chunks v0 through v2 with the same pipeline.
output_path = '/home/ec2-user/SageMaker/data/NYT-50-90_processed/'
for i in range(3):
    processing_piepline(i)