In [2]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [3]:
import torch
import torch.nn as nn
from transformers import RobertaModel
from torchcrf import CRF

class NERModel(nn.Module):
    """RoBERTa encoder followed by a linear emission layer and a CRF.

    Args:
        num_labels: Number of distinct tag ids; sizes the classifier output
            and the CRF.
        hidden_size: Width of the encoder's hidden states, i.e. the input
            width of the classifier.
    """

    def __init__(self, num_labels, hidden_size):
        super(NERModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, else the emission scores.

        Args:
            input_ids: Token ids fed to the encoder.
            attention_mask: Non-zero where ``input_ids`` holds a real token.
            labels: Optional tag ids with the same shape as ``input_ids``.
                Negative ids are treated as padding and excluded from the loss.

        Returns:
            With ``labels``: the negated, batch-averaged CRF log-likelihood.
            Without ``labels``: the per-token emission scores from the classifier.

        Note:
            The CRF requires the first position of every sequence to be
            unmasked, so ``labels[:, 0]`` must be a real (non-negative) tag.
            It also derives sequence lengths from the mask, so valid labels
            are assumed to form a contiguous prefix of each row.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)
        if labels is None:
            return logits

        # Padding label ids must never reach the CRF as tag indices: -1 would
        # silently wrap around to the last tag and -100 is out of range.
        # Drop those positions from the mask and clamp the ids so every lookup
        # is valid; masked positions do not contribute to the likelihood.
        valid_positions = attention_mask.bool() & labels.ge(0)
        safe_labels = labels.clamp(min=0)
        return -self.crf(logits, safe_labels, mask=valid_positions, reduction='mean')

    def decode(self, input_ids, attention_mask):
        """Return the most likely tag sequence per input using the CRF (Viterbi).

        Returns:
            One list of tag ids per sequence, covering only the positions
            where ``attention_mask`` is non-zero.
        """
        logits = self(input_ids, attention_mask)
        return self.crf.decode(logits, mask=attention_mask.bool())


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Tag vocabulary: every entity type contributes a "B-" (begin) and an "I-"
# (inside) tag with consecutive ids starting at 1, in the order listed here;
# the outside tag "O" takes the final id.
label_map = {
    f"{prefix}-{entity}": 2 * position + offset
    for position, entity in enumerate((
        "Facility",
        "OtherLOC",
        "HumanSettlement",
        "Station",
        "VisualWork",
        "MusicalWork",
        "WrittenWork",
        "ArtWork",
        "Software",
        "MusicalGRP",
        "PublicCorp",
        "PrivateCorp",
        "AerospaceManufacturer",
        "SportsGRP",
        "CarManufacturer",
        "ORG",
        "Scientist",
        "Artist",
        "Athlete",
        "Politician",
        "Cleric",
        "SportsManager",
        "OtherPER",
        "Clothing",
        "Vehicle",
        "Food",
        "Drink",
        "OtherPROD",
        "Medication/Vaccine",
        "MedicalProcedure",
        "AnatomicalStructure",
        "Symptom",
        "Disease",
    ))
    for offset, prefix in ((1, "B"), (2, "I"))
}
label_map["O"] = 67


In [5]:
# Shift the ids so the smallest one is 0. Subtracting the current minimum
# (rather than a fixed 1) keeps this cell safe to re-run after `label_map`
# has already been rebound to the zero-based mapping.
_label_id_offset = min(label_map.values())
new_label_map = {key: label - _label_id_offset for key, label in label_map.items()}

In [6]:
# Rebind `label_map` to the shifted mapping built in the previous cell. The
# dataset's __getitem__ and the `len(label_map)` calls further down read this
# module-level name, so they see the shifted ids from here on.
label_map=new_label_map

In [7]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Stack ``(input_ids, attention_mask, label_ids)`` samples into batch tensors.

    Every field is right-padded to the longest sequence in the batch.

    Args:
        batch: Sequence of ``(input_ids, attention_mask, label_ids)`` tuples
            of 1-D tensors, as produced by ``MultiCoNERDataset``.

    Returns:
        Tuple ``(input_ids, attention_mask, label_ids)`` of batch-first tensors.

    Note:
        Reads the module-level ``tokenizer`` for the input padding id.
    """
    input_ids_batch, attention_mask_batch, label_ids_batch = zip(*batch)

    input_ids_batch = pad_sequence(
        list(input_ids_batch), batch_first=True, padding_value=tokenizer.pad_token_id
    )
    # 0 marks padded positions in the attention mask.
    attention_mask_batch = pad_sequence(
        list(attention_mask_batch), batch_first=True, padding_value=0
    )
    # Pad labels with -1, the same filler MultiCoNERDataset uses. -100 (the
    # cross-entropy ignore-index convention) is not a usable tag index for
    # the CRF, which looks tags up even at masked positions.
    label_ids_batch = pad_sequence(
        list(label_ids_batch), batch_first=True, padding_value=-1
    )

    return input_ids_batch, attention_mask_batch, label_ids_batch


In [8]:
def tokenize_and_preserve_labels(tokens, la):
    """Split words into sub-word pieces, repeating each word's label per piece.

    Args:
        tokens: Sequence of words.
        la: Sequence of labels, one per word in ``tokens``.

    Returns:
        Tuple ``(tokenized_sentence, labels)`` of equal-length lists: the
        sub-word pieces in order, and the label of the word each piece came from.

    Raises:
        ValueError: If ``tokens`` and ``la`` differ in length.

    Note:
        Uses the module-level ``tokenizer``.
    """
    if len(tokens) != len(la):
        raise ValueError(
            f"got {len(tokens)} tokens but {len(la)} labels; they must align one-to-one"
        )

    tokenized_sentence = []
    labels = []

    # Iterate over the function's own arguments (the previous body referenced
    # undefined names `sentence` / `text_labels`).
    for word, label in zip(tokens, la):
        pieces = tokenizer.tokenize(word)
        tokenized_sentence.extend(pieces)
        labels.extend([label] * len(pieces))

    return tokenized_sentence, labels

In [9]:
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer

class MultiCoNERDataset(Dataset):
    """Sentences read from a CoNLL-style file, encoded for token classification.

    Args:
        file_path: Path to the data file.
        tokenizer: Tokenizer used to encode each sentence.
        regime: ``"train"``, ``"validation"`` or ``"test"``; the latter two
            cap how many samples the dataset exposes.
    """

    # Maximum number of samples exposed per regime; other regimes are uncapped.
    _REGIME_CAPS = {"validation": 200, "test": 500}
    # Filler label id written at padded positions.
    _PAD_LABEL_ID = -1

    def __init__(self, file_path, tokenizer, regime="train"):
        self.samples = self._read_data(file_path)
        self.tokenizer = tokenizer
        self.regime = regime

    def _read_data(self, file_path):
        """Parse ``file_path`` into a list of ``{"tokens": [...], "labels": [...]}``.

        A line starting with ``# id`` begins a new sentence. On every other
        non-blank line the first whitespace-separated field is the token and
        the last one is its label.
        """
        samples = []
        sample = {"tokens": [], "labels": []}
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if line.startswith("# id"):
                    if sample["tokens"]:
                        samples.append(sample)
                        sample = {"tokens": [], "labels": []}
                elif line:
                    parts = line.split()
                    sample["tokens"].append(parts[0])
                    sample["labels"].append(parts[-1])
        # Flush the final sentence (no "# id" line follows it).
        if sample["tokens"]:
            samples.append(sample)
        return samples

    def __len__(self):
        """Number of exposed samples: the regime cap, but never more than exist."""
        total = len(self.samples)
        cap = self._REGIME_CAPS.get(self.regime)
        # Clamp to the real size so indexing never runs past `self.samples`.
        return total if cap is None else min(cap, total)

    def __getitem__(self, idx):
        """Encode sample ``idx``.

        Returns:
            Tuple ``(input_ids, attention_mask, label_ids)`` of equal-length
            1-D tensors. ``label_ids`` is aligned with ``input_ids``: each
            sub-word piece carries its word's label id, the leading and
            trailing special tokens carry the ``"O"`` id, and padded
            positions carry -1.

        Note:
            Reads the module-level ``label_map``.
        """
        sample = self.samples[idx]
        tokens = sample["tokens"]
        labels = sample["labels"]

        encoding = self.tokenizer(
            tokens,
            return_tensors='pt',
            padding="max_length",
            truncation=True,
            is_split_into_words=True,
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        sequence_length = input_ids.size(0)
        real_token_count = int(attention_mask.sum().item())

        # Expand word-level labels to sub-word level so they line up with
        # input_ids. NOTE(review): the per-word piece count assumes the
        # tokenizer encodes a pre-split word the same way as the word with a
        # leading space (true for RoBERTa's BPE) -- confirm before swapping
        # tokenizers.
        piece_label_ids = []
        for word, label in zip(tokens, labels):
            piece_count = len(self.tokenizer.tokenize(" " + word))
            piece_label_ids.extend([label_map[label]] * piece_count)

        # Assumes one special token on each side of the sentence (<s> ... </s>).
        outside_id = label_map["O"]
        body_length = max(real_token_count - 2, 0)
        piece_label_ids = piece_label_ids[:body_length]  # mirror truncation
        # If the piece count came up short, fill with "O" rather than shifting.
        piece_label_ids += [outside_id] * (body_length - len(piece_label_ids))

        label_ids = [outside_id] + piece_label_ids + [outside_id]
        label_ids = label_ids[:sequence_length]
        label_ids += [self._PAD_LABEL_ID] * (sequence_length - len(label_ids))

        return input_ids, attention_mask, torch.tensor(label_ids)

In [10]:
import torch
from transformers import RobertaTokenizer, RobertaConfig
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, precision_score, recall_score
PATH="/content/drive/MyDrive/Colab Notebooks/NLP/multiconer2023"

# Define function for training and checkpointing
def train_and_checkpoint(model, language, train_loader, val_loader,
                         num_epochs=5, learning_rate=1e-2,
                         checkpoint_dir="/kaggle/working"):
    """Train ``model`` and save the weights of the epoch with the lowest validation loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels)``
            to get a scalar loss and as ``model(input_ids, attention_mask)``
            to get per-token scores. If it has a ``crf`` attribute, that CRF
            decodes the validation predictions.
        language: Tag used in log lines and in the checkpoint file name.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_loader: Iterable of batches with the same layout, for validation.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.
            NOTE(review): the 1e-2 default is kept for compatibility but is
            far above typical fine-tuning rates -- confirm it is intended.
        checkpoint_dir: Directory receiving ``{language}_best_model.pth``.

    Validation precision/recall/F1 are token-level, micro-averaged scores
    computed only over positions that hold a real (non-negative) label.
    """
    # Local import: the notebook's `import os` lives in a later cell than this
    # definition.
    import os

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    crf = getattr(model, "crf", None)

    def snapshot_weights():
        # state_dict() returns references to the live tensors, which keep
        # changing while training continues; clone them (onto the CPU) so the
        # snapshot really is the state at this epoch.
        return {name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()}

    best_val_loss = float('inf')
    best_model_state_dict = None
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            loss = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        model.eval()
        total_val_loss = 0.0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                total_val_loss += model(input_ids, attention_mask, labels).item()
                logits = model(input_ids, attention_mask)

                if crf is not None:
                    # Viterbi decoding; one path per sequence covering the
                    # unmasked positions.
                    paths = crf.decode(logits, mask=attention_mask.bool())
                else:
                    # Assumes right-padded batches, so the real tokens are the
                    # leading positions of each row.
                    lengths = attention_mask.sum(dim=1).tolist()
                    greedy = torch.argmax(logits, dim=2).cpu().tolist()
                    paths = [row[:int(n)] for row, n in zip(greedy, lengths)]

                for path, gold_row in zip(paths, labels.cpu().tolist()):
                    for predicted, expected in zip(path, gold_row):
                        # Negative ids are label padding, not real tags.
                        if expected >= 0:
                            all_preds.append(predicted)
                            all_labels.append(expected)
        avg_val_loss = total_val_loss / len(val_loader)

        val_precision = precision_score(all_labels, all_preds, average='micro')
        val_recall = recall_score(all_labels, all_preds, average='micro')
        val_f1 = f1_score(all_labels, all_preds, average='micro')
        print(f"Epoch {epoch+1}/{num_epochs}, Language: {language}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        print(f"Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}")

        # Keep a copy of the best-performing weights seen so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state_dict = snapshot_weights()

    # No epoch improved on +inf (e.g. the loss was NaN throughout, or
    # num_epochs == 0): fall back to the final weights instead of saving None.
    if best_model_state_dict is None:
        best_model_state_dict = snapshot_weights()

    torch.save(best_model_state_dict, os.path.join(checkpoint_dir, f"{language}_best_model.pth"))

# Define function for testing
def test(language, test_loader):
    """Evaluate the saved checkpoint for ``language`` and report its F1 score.

    Args:
        language: Tag identifying the checkpoint
            ``/kaggle/working/{language}_best_model.pth``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        The token-level, micro-averaged F1 score, computed only over positions
        that hold a real (non-negative) label.

    Note:
        Reads the module-level ``label_map`` for the number of tags.
    """
    # RobertaConfig is the config class this notebook imports and the one that
    # matches the roberta-base checkpoint.
    config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
    num_labels = len(label_map)
    print(num_labels, config.hidden_size)

    # Rebuild the model and load the checkpoint. No parameter freezing is
    # needed here: evaluation runs under torch.no_grad().
    model = NERModel(num_labels, config.hidden_size)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # map_location lets a checkpoint written on a GPU load on a CPU-only host.
    state_dict = torch.load(f"/kaggle/working/{language}_best_model.pth", map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    all_true_labels = []
    all_predicted_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask)
            # Decode with the CRF the model was trained with; each path covers
            # the unmasked positions of its sequence.
            paths = model.crf.decode(logits, mask=attention_mask.bool())

            for path, gold_row in zip(paths, labels.cpu().tolist()):
                for predicted, expected in zip(path, gold_row):
                    # Negative ids are label padding, not real tags.
                    if expected >= 0:
                        all_predicted_labels.append(predicted)
                        all_true_labels.append(expected)

    test_f1 = f1_score(all_true_labels, all_predicted_labels, average='micro')
    print(f"Test F1 Score for {language}: {test_f1:.4f}")
    return test_f1

In [11]:
# os.path.join(PATH,language+"_best_model.pth")

In [12]:

# Names of the parameters that stay trainable: every weight and bias of
# encoder layers 9-11, followed by the pooler, the classifier head and the
# CRF tensors. Parameters not named here are frozen by `freeze_weignts`.
model_layers = [
    f"roberta.encoder.layer.{layer_index}.{submodule}.{tensor_kind}"
    for layer_index in (9, 10, 11)
    for submodule in (
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
        "attention.output.LayerNorm",
        "intermediate.dense",
        "output.dense",
        "output.LayerNorm",
    )
    for tensor_kind in ("weight", "bias")
]
model_layers += [
    "roberta.pooler.dense.weight",
    "roberta.pooler.dense.bias",
    "classifier.weight",
    "classifier.bias",
    "crf.start_transitions",
    "crf.end_transitions",
    "crf.transitions",
]


def freeze_weignts(model):
    """Disable gradients for every parameter of ``model`` not named in ``model_layers``.

    Parameters whose names are in the module-level ``model_layers`` list are
    left untouched.
    """
    print("freezing weignts")
    to_freeze = (
        parameter
        for parameter_name, parameter in model.named_parameters()
        if parameter_name not in model_layers
    )
    for parameter in to_freeze:
        parameter.requires_grad = False

In [None]:
import os

PATH="/kaggle/input/multiconer/multiconer2023"
# Set to True to fine-tune and checkpoint each language before evaluating it.
# With False, only the evaluation runs, which requires the checkpoints written
# by an earlier training run to exist already.
RUN_TRAINING = False

for language in os.listdir(PATH):
    print("+"*20)
    # The locale code is taken from the part of the folder name before the first "-".
    locale = language.split("-")[0].lower()
    if locale not in ("de", "hi", "fr"):
        continue

    language_dir = os.path.join(PATH, language)
    test_path = os.path.join(language_dir, locale + "_test.conll")
    print(test_path, os.path.isfile(test_path))

    # Kept as a module-level name on purpose: collate_fn reads `tokenizer`
    # as a global.
    tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

    if RUN_TRAINING:
        train_path = os.path.join(language_dir, locale + "_train.conll")
        dev_path = os.path.join(language_dir, locale + "_dev.conll")
        print(train_path, os.path.isfile(train_path))
        print(dev_path, os.path.isfile(dev_path))

        config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
        num_labels = len(label_map)
        model = NERModel(num_labels, config.hidden_size)
        freeze_weignts(model)
        print(num_labels, config.hidden_size)

        train_dataset = MultiCoNERDataset(train_path, tokenizer)
        val_dataset = MultiCoNERDataset(dev_path, tokenizer, regime="validation")
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)
        train_and_checkpoint(model, locale, train_loader, val_loader)

    test_dataset = MultiCoNERDataset(test_path, tokenizer, regime="test")
    test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)
    print(f'Test for {locale} language')
    test(locale, test_loader)


In [None]:
# Example usage
# from transformers import XLMRobertaTokenizer, XLMRobertaConfig
# file_path = "/kaggle/input/multiconer/multiconer2023/BN-Bangla/bn_train.conll"  # Path to your dataset file
# tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")  # Use XLM-RoBERTa tokenizer
# dataset = MultiCoNERDataset(file_path, tokenizer)

# # Example of accessing a sample
# sample_idx = 0
# input_ids, attention_mask, labels = dataset[sample_idx]
# print("Input IDs:", input_ids.size())
# print("Attention Mask:", attention_mask.size())
# print("Labels:", labels.size())


Tokens before padding: ['স্টেশনটি', 'প্ল্যাটফর্ম', 'স্ক্রিন', 'ডোর', 'দিয়ে', 'সজ্জিত।']
Labels before padding: ['O', 'B-OtherPROD', 'I-OtherPROD', 'I-OtherPROD', 'O', 'O']
Input IDs: torch.Size([512])
Attention Mask: torch.Size([512])
Labels: torch.Size([6])


In [None]:
# def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=True):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
#     labels = []
#     for i, label in enumerate(examples["ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(label[word_idx] if label_all_tokens else -100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)
#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [None]:
# d={}
# d["tokens"] = ['karla ', 'cossío ', 'as ', 'pilar ', 'gandía ', '( ', 'recurring ', 'season', '1', ';', 'guest', '2', ')']
# d["ner_tags"] = ['B-Artist', 'I-Artist', 'O', 'B-VisualWork', 'I-VisualWork', 'O', 'O', 'O', 'O', 'O','O','O','O']

In [None]:
# from transformers import AutoTokenizer

# xlmr_model_name = "xlm-roberta-base"
# xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# text = "Jack Sparrow loves New York!"
# text2=text.split(" ")
# #xlmr_tokens = xlmr_tokenizer(text).tokens()
# a=tokenize_and_align_labels(d, xlmr_tokenizer)
# print(a)

ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).

In [16]:
# !pip install tner

Collecting tner

  Using cached tner-0.2.4.tar.gz (2.2 MB)

  Preparing metadata (setup.py) ... [?25l[?25hdone


Collecting allennlp>=2.0.0 (from tner)

  Using cached allennlp-2.10.1-py3-none-any.whl (730 kB)



Collecting seqeval (from tner)

  Using cached seqeval-1.2.2.tar.gz (43 kB)

  Preparing metadata (setup.py) ... [?25l[?25hdone

Collecting datasets (from tner)

  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m

[?25hCollecting torch (from tner)

  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m


[0mCollecting torchvision<0.14.0,>=0.8.1 (from allennlp>=2.0.0->tner)

  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m