In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

# Read the tab-separated file into a DataFrame
df = pd.read_csv('/content/train.txt', sep=' ')
test_df=pd.read_csv('/content/test.txt',sep=' ')
valid_df=pd.read_csv('/content/valid.txt',sep=' ')
# Assign column names
df.columns = ['label', 'POS', 'syntactic_chunk', 'named_entity']
test_df.columns = ['label', 'POS', 'syntactic_chunk', 'named_entity']
valid_df.columns = ['label', 'POS', 'syntactic_chunk', 'named_entity']
# Display the DataFrame
# print(df)
df=df.dropna()
df

Unnamed: 0,label,POS,syntactic_chunk,named_entity
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
...,...,...,...,...
204561,three,CD,I-NP,O
204562,Swansea,NN,B-NP,B-ORG
204563,1,CD,I-NP,O
204564,Lincoln,NNP,I-NP,B-ORG


In [5]:

# Initialize empty lists to store merged sentences and tags
sentences = []
sentence = []
tags = []
tag = []

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    # Append the word to the current sentence
    sentence.append(row['label'])
    # Append the tag to the current tag list
    tag.append(row['syntactic_chunk'])

    # Check if the word is a period
    if row['label'] == '.':
        # Merge the words in the sentence and append to the list of sentences
        if '-DOCSTART-' not in sentence:
            sentences.append(' '.join(sentence))
            # Append the tags to the list of tags
            tags.append(tag)

        # Reset the sentence and tag list for the next sentence
        sentence = []
        tag = []

# If the last sentence didn't end with a period, append it as well
if sentence:
    sentences.append(' '.join(sentence))
    tags.append(tag)

# Convert the lists of sentences and tags to DataFrames
sentences_df = pd.DataFrame(sentences, columns=['sentence'],index=None)
tags_df = pd.DataFrame({'tag': [sublist for sublist in tags]},index=None)


# Merge sentences and tags into a single DataFrame
merged_df = pd.concat([sentences_df, tags_df], axis=1)

# Display the DataFrame
print(merged_df)


                                               sentence  \
0      EU rejects German call to boycott British lamb .   
1     Peter Blackburn BRUSSELS 1996-08-22 The Europe...   
2     Germany 's representative to the European Unio...   
3     We do n't support any such recommendation beca...   
4     He said further scientific study was required ...   
...                                                 ...   
6425  I 'm a very good friend of David and spoke to ...   
6426  He said I would really enjoy life there and th...   
6427  That , and the fact he is only a few hours dri...   
6428  NORTHAMPTON , England 1996-08-30 Leading score...   
6429  LONDON 1996-08-30 Results of English league ma...   

                                                    tag  
0     [B-NP, B-VP, B-NP, I-NP, B-VP, I-VP, B-NP, I-N...  
1     [B-NP, I-NP, B-NP, I-NP, B-NP, I-NP, I-NP, B-V...  
2     [B-NP, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-N...  
3     [B-NP, B-VP, I-VP, I-VP, B-NP, I-NP, I-NP, B-S...  
4

In [6]:
# Convert lists to tuples in the 'tag' column
tag_set = set()

# Add unique tags to the set
for sublist in tags:
    tag_set.update(sublist)
# Create dictionaries for mapping labels to IDs and vice versa
labels_to_ids = {k: v for v, k in enumerate(tag_set)}
ids_to_labels = {v: k for k, v in enumerate(tag_set)}
labels_to_ids

{'I-NP': 0,
 'O': 1,
 'B-ADJP': 2,
 'I-CONJP': 3,
 'I-PP': 4,
 'B-PP': 5,
 'B-PRT': 6,
 'B-CONJP': 7,
 'B-SBAR': 8,
 'I-ADJP': 9,
 'I-ADVP': 10,
 'I-SBAR': 11,
 'I-VP': 12,
 'I-INTJ': 13,
 'B-VP': 14,
 'B-LST': 15,
 'B-INTJ': 16,
 'B-ADVP': 17,
 'B-NP': 18}

In [7]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [8]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        tag = self.data.tag[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, tag, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [labels_to_ids[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [9]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
training_set = dataset(merged_df, tokenizer, MAX_LEN)

In [11]:
training_set[0]

{'ids': tensor([  101,  7327, 19164,  2446,  2655,  2000, 17757,  2329, 12559,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [12]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print((token, label))

('[CLS]', tensor(1))
('eu', tensor(18))
('rejects', tensor(14))
('german', tensor(18))
('call', tensor(0))
('to', tensor(14))
('boycott', tensor(12))
('british', tensor(18))
('lamb', tensor(0))
('.', tensor(1))
('[SEP]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', tensor(1))
('[PAD]', te

In [13]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [14]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.9496, grad_fn=<NllLossBackward0>)

In [15]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 19])

In [16]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [17]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# test_params = {'batch_size': VALID_BATCH_SIZE,
#                 'shuffle': True,
#                 'num_workers': 0
#                 }

training_loader = DataLoader(training_set, **train_params)
# testing_loader = DataLoader(testing_set, **test_params)

In [19]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        a = model(input_ids=ids, attention_mask=mask, labels=targets)
        tr_loss += a.loss

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = a.logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        #backward pass
        optimizer.zero_grad()
        a.loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.9724178314208984
