In [1]:

!pip install transformers[torch] accelerate --quiet

In [2]:
import numpy as np
import pandas as pd

In [3]:
def load_data(gt_path, data_path):
	with open(data_path) as f:
		data = f.read().splitlines()

		for i, w in enumerate(data):
			if w == ";;;":
				data[i] = "###"
			else:
				data[i] = data[i].strip()

		data = "".join(data).split("###")
		for i, t in enumerate(data):
			data[i] = [x for x in t.split(";;;") if x != ""]

		data = [x for x in data if x != [] and x != [""]]

	with open(gt_path, "r") as f:
		labels = f.read().splitlines()

		for i, l in enumerate(labels):
			if l == "":
				labels[i] = "###"
			else:
				labels[i] = l.strip() + ";;;"

		labels = "".join(labels).split("###")
		for i, l in enumerate(labels):
			labels[i] = [x for x in l.split(";;;") if x != ""]
		labels = [x for x in labels if x != [] and x != [""]]


	text = []
	gt = []

	for d, l in zip(data, labels):
		sentence = []
		sentence_ner = []
		for i, (w, ner) in enumerate(zip(d, l)):
			if ";" not in ner:
				sentence.append(w)
				sentence_ner.append(ner)

		text.append(sentence)
		gt.append(sentence_ner)

	return text, gt

In [4]:
train_data, train_labels = load_data("train_gt.csv", "train_data.csv")
val_data, val_labels = load_data("valid_gt.csv", "valid_data.csv")

In [5]:
train = pd.DataFrame({"text": train_data, "ner": train_labels})
val = pd.DataFrame({"text": val_data, "ner": val_labels})

In [6]:
for i, row in train.iterrows():
	assert len(row["text"]) == len(row["ner"])

In [7]:
# unnest train labels

unique_labels = set([x for y in train_labels for x in y])
print(unique_labels)

{'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O', 'I-MISC', 'B-LOC', 'B-MISC'}


In [10]:
from transformers import DistilBertTokenizerFast, AutoTokenizer
import torch
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
def prepare(text, labels):


  tokenized_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}

  for sentence, label in zip(text, labels):
      tokenized_output = tokenizer(sentence, is_split_into_words=True, truncation=True, padding='max_length', max_length=128, return_tensors="pt")
      input_ids = tokenized_output['input_ids'][0]
      attention_mask = tokenized_output['attention_mask'][0]

      # Align the labels here
      # Initialize with -100 to ignore special tokens
      aligned_labels = [-100] * len(input_ids)

      # Update `aligned_labels` with actual labels, taking care to skip special tokens

      tokenized_inputs['input_ids'].append(input_ids)
      tokenized_inputs['attention_mask'].append(attention_mask)
      tokenized_inputs['labels'].append(torch.tensor(aligned_labels, dtype=torch.long))

  # Convert lists to tensors
  tokenized_inputs['input_ids'] = torch.stack(tokenized_inputs['input_ids'])
  tokenized_inputs['attention_mask'] = torch.stack(tokenized_inputs['attention_mask'])
  tokenized_inputs['labels'] = torch.stack(tokenized_inputs['labels'])


  return tokenized_inputs

In [11]:

train_inputs = prepare(train_data, train_labels)
val_inputs = prepare(val_data, val_labels)

In [12]:

from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(unique_labels))

# Define the mapping of labels to indices
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

model.config.label2id = label2id
model.config.id2label = id2label

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)


In [15]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Return a single tokenized example and its corresponding labels
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = NERDataset(train_inputs)
val_dataset = NERDataset(val_inputs)

In [17]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=100,
    weight_decay=0.1,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    # data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
predictions, label_ids, metrics = trainer.predict(val_dataset)