In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [6]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd

# Read the tab-separated file into a DataFrame
df = pd.read_csv('/content/train.txt', sep=' ')
test_df=pd.read_csv('/content/test.txt',sep=' ')
valid_df=pd.read_csv('/content/valid.txt',sep=' ')
# Assign column names
df.columns = ['label', 'POS', 'syntactic_chunk', 'named_entity']
test_df.columns = ['label', 'POS', 'syntactic_chunk', 'named_entity']
valid_df.columns = ['label', 'POS', 'syntactic_chunk', 'named_entity']
# Display the DataFrame
# print(df)
df=df.dropna()
df

Unnamed: 0,label,POS,syntactic_chunk,named_entity
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
...,...,...,...,...
204561,three,CD,I-NP,O
204562,Swansea,NN,B-NP,B-ORG
204563,1,CD,I-NP,O
204564,Lincoln,NNP,I-NP,B-ORG


In [9]:

# Initialize empty lists to store merged sentences and tags
sentences = []
sentence = []
tags = []
tag = []

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    # Append the word to the current sentence
    sentence.append(row['label'])
    # Append the tag to the current tag list
    tag.append(row['syntactic_chunk'])

    # Check if the word is a period
    if row['label'] == '.':
        # Merge the words in the sentence and append to the list of sentences
        if '-DOCSTART-' not in sentence:
            sentences.append(' '.join(sentence))
            # Append the tags to the list of tags
            tags.append(tag)

        # Reset the sentence and tag list for the next sentence
        sentence = []
        tag = []

# If the last sentence didn't end with a period, append it as well
if sentence:
    sentences.append(' '.join(sentence))
    tags.append(tag)

# Convert the lists of sentences and tags to DataFrames
sentences_df = pd.DataFrame(sentences, columns=['sentence'],index=None)
tags_df = pd.DataFrame({'tag': [sublist for sublist in tags]},index=None)


# Merge sentences and tags into a single DataFrame
merged_df = pd.concat([sentences_df, tags_df], axis=1)

# Display the DataFrame
print(merged_df)


                                               sentence  \
0      EU rejects German call to boycott British lamb .   
1     Peter Blackburn BRUSSELS 1996-08-22 The Europe...   
2     Germany 's representative to the European Unio...   
3     We do n't support any such recommendation beca...   
4     He said further scientific study was required ...   
...                                                 ...   
6425  I 'm a very good friend of David and spoke to ...   
6426  He said I would really enjoy life there and th...   
6427  That , and the fact he is only a few hours dri...   
6428  NORTHAMPTON , England 1996-08-30 Leading score...   
6429  LONDON 1996-08-30 Results of English league ma...   

                                                    tag  
0     [B-NP, B-VP, B-NP, I-NP, B-VP, I-VP, B-NP, I-N...  
1     [B-NP, I-NP, B-NP, I-NP, B-NP, I-NP, I-NP, B-V...  
2     [B-NP, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-N...  
3     [B-NP, B-VP, I-VP, I-VP, B-NP, I-NP, I-NP, B-S...  
4

In [10]:
# Convert lists to tuples in the 'tag' column
tag_set = set()

# Add unique tags to the set
for sublist in tags:
    tag_set.update(sublist)
# Create dictionaries for mapping labels to IDs and vice versa
labels_to_ids = {k: v for v, k in enumerate(tag_set)}
ids_to_labels = {v: k for k, v in enumerate(tag_set)}
labels_to_ids

{'B-NP': 0,
 'B-INTJ': 1,
 'B-PRT': 2,
 'B-LST': 3,
 'I-CONJP': 4,
 'I-NP': 5,
 'B-VP': 6,
 'I-VP': 7,
 'I-INTJ': 8,
 'B-SBAR': 9,
 'I-ADVP': 10,
 'B-PP': 11,
 'O': 12,
 'I-ADJP': 13,
 'I-SBAR': 14,
 'B-ADVP': 15,
 'B-ADJP': 16,
 'B-CONJP': 17,
 'I-PP': 18}

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [12]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        tag = self.data.tag[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, tag, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [labels_to_ids[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [13]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
training_set = dataset(merged_df, tokenizer, MAX_LEN)

In [15]:
training_set[0]

{'ids': tensor([  101,  7327, 19164,  2446,  2655,  2000, 17757,  2329, 12559,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [16]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print((token, label))

('[CLS]', tensor(12))
('eu', tensor(0))
('rejects', tensor(6))
('german', tensor(0))
('call', tensor(5))
('to', tensor(6))
('boycott', tensor(7))
('british', tensor(0))
('lamb', tensor(5))
('.', tensor(12))
('[SEP]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
('[PAD]', tensor(12))
(

In [17]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [18]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.9833, device='cuda:0', grad_fn=<NllLossBackward0>)

In [19]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 19])

In [20]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [21]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# test_params = {'batch_size': VALID_BATCH_SIZE,
#                 'shuffle': True,
#                 'num_workers': 0
#                 }

training_loader = DataLoader(training_set, **train_params)
# testing_loader = DataLoader(testing_set, **test_params)

In [22]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        a = model(input_ids=ids, attention_mask=mask, labels=targets)
        tr_loss += a.loss

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = a.logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        #backward pass
        optimizer.zero_grad()
        a.loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [23]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 3.0647923946380615
Training loss per 100 training steps: 1.0017155408859253
Training loss per 100 training steps: 0.6602632999420166
Training loss per 100 training steps: 0.4979492127895355
Training loss per 100 training steps: 0.4039662182331085
Training loss per 100 training steps: 0.3435783088207245
Training loss per 100 training steps: 0.3037550747394562
Training loss per 100 training steps: 0.27197641134262085
Training loss per 100 training steps: 0.24717657268047333
Training loss per 100 training steps: 0.2296021282672882
Training loss per 100 training steps: 0.21406590938568115
Training loss per 100 training steps: 0.20018303394317627
Training loss per 100 training steps: 0.18954378366470337
Training loss per 100 training steps: 0.18003933131694794
Training loss per 100 training steps: 0.17191770672798157
Training loss per 100 training steps: 0.1647678166627884
Training loss per 100 training steps: 0.15818671882152557
Train

In [24]:
testing_set = dataset(merged_df, tokenizer, MAX_LEN)
testing_set[0]
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
testing_loader = DataLoader(testing_set, **test_params)

In [38]:

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            a = model(input_ids=ids, attention_mask=mask, labels=targets)

            eval_loss += a.loss

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = a.logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    print(eval_labels)
    print(eval_preds)
    label_ids = [labels_to_ids[id.item()] for id in eval_labels if id.item() in labels_to_ids]

    predictions = [ids_to_labels[id.item()] for id in eval_preds if id.item() in ids_to_labels]


    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return eval_labels, predictions

In [39]:
valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.022593634203076363
Validation loss per 100 evaluation steps: 0.04680808261036873
Validation loss per 100 evaluation steps: 0.0475323349237442
Validation loss per 100 evaluation steps: 0.051171429455280304
Validation loss per 100 evaluation steps: 0.05316785350441933
Validation loss per 100 evaluation steps: 0.05333812162280083
Validation loss per 100 evaluation steps: 0.05108607932925224
Validation loss per 100 evaluation steps: 0.05065092444419861
Validation loss per 100 evaluation steps: 0.05105582997202873
Validation loss per 100 evaluation steps: 0.051424406468868256
Validation loss per 100 evaluation steps: 0.05163519084453583
Validation loss per 100 evaluation steps: 0.05164315924048424
Validation loss per 100 evaluation steps: 0.051340531557798386
Validation loss per 100 evaluation steps: 0.052637550979852676
Validation loss per 100 evaluation steps: 0.052289169281721115
Validation loss per 100 evaluation steps: 0.0522535964846611
Vali

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Validation Loss: 0.05273348093032837
Validation Accuracy: 0.94902626383265


([tensor(12, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(12, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(5, device='cuda:0'),
  tensor(11, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(6, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(6, device='cuda:0'),
  tensor(7, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(11, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(12, device='cuda:0'),
  tensor(12, device='cuda:0'),
  tensor(12, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(6, device='cuda:0'),
  tensor(16, device='cuda:0'),
  tensor(13, device='cuda:0'),
  tensor(6, device='cuda:0'),
  tensor(7, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(11, device='cuda:0'),
  tensor(0, device='cuda:0'),
  tensor(5, device='cuda:0'),
  tensor(5, device='cuda:0'),
  tensor(11, device='cuda:0'),

In [35]:
sentence = "India has a capital called Mumbai. On wednesday, the president will give a presentation"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
print(flattened_predictions)
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
print(ids_to_labels)
ids_to_labels = {v: k for k, v in labels_to_ids.items()}
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

tensor([12,  0,  6,  0,  5,  6,  0, 12, 11,  0, 12,  0,  5,  6,  7,  0,  5, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12], device='cuda:0')
{'B-NP': 0, 'B-INTJ': 1, 'B-PRT': 2, 'B-LST': 3, 'I-CONJP': 4, 'I-NP': 5, 'B-VP': 6, 'I-VP': 7, 'I-INTJ': 8, 'B-SBAR': 9, 'I-ADVP': 10, 'B-PP': 11, 'O': 12, 'I-ADJP': 13, 'I-SBAR': 14, 'B-ADVP': 15, 'B-ADJP': 16, 'B-CONJP': 17, 'I-PP': 18}
india has a capital called mumbai . on wednesday , the president will give a presentation
['B-NP', 'B-VP', 'B-NP', 'I-NP', 'B-VP', 'B-NP', 'O', 'B-PP', 'B-NP', 'O', '