In [0]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np


In [3]:
!pip install seqeval
!pip install transformers

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=13bdac5d129e7816197161d0d01761643ac230f88710a625e471421e651d9f46
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 2.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca

In [0]:
from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup

In [0]:
from seqeval.metrics import f1_score, accuracy_score

In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [7]:
data = pd.read_csv("/content/drive/My Drive/ner_dataset.csv", encoding="latin1", nrows =1000).fillna(method="ffill")
data.tail(5)

Unnamed: 0,Sentence #,Word,POS,Tag
995,Sentence: 43,investigation,NN,O
996,Sentence: 43,of,IN,O
997,Sentence: 43,Khayam,NNP,B-per
998,Sentence: 43,'s,POS,O
999,Sentence: 43,behavior,NN,O


# New Section

# New Section

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
aggregation_function = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]

In [0]:
dataGrouped = data.groupby("Sentence #").apply(aggregation_function)                  

In [0]:
sentences = [[entr[0] for entr in sent] for sent in dataGrouped]
labels = [[entr[1] for entr in sent] for sent in dataGrouped]

In [12]:
vals_tag = list(set(data["Tag"].values))
vals_tag.append("PAD")
tag2id = {t: i for i, t in enumerate(vals_tag)}

tag2id

{'B-art': 2,
 'B-geo': 9,
 'B-gpe': 3,
 'B-org': 8,
 'B-per': 6,
 'B-tim': 10,
 'I-art': 5,
 'I-geo': 4,
 'I-org': 0,
 'I-per': 7,
 'O': 1,
 'PAD': 11}

In [0]:
"""
Use cuda or cpu
"""
processingUnit = "cuda" 

"""
Use only if GPU available
"""
n_gpu = torch.cuda.device_count()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()


In [14]:
torch.cuda.is_available()

True

In [15]:
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [17]:
len(sentences)

43

In [18]:
len(labels)

43

In [0]:
def tokenizeWords_prepareLabels(sent, sent_labels):

    labels = []
    tokenized_sent = []

    for word, lab in zip(sent, sent_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        num_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sent.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([lab] * num_subwords)
        
        
    #print(tokenized_word)
    #print(num_subwords)
    #print(tokenized_sent)
    #print(labels)
    
    return tokenized_sent, labels
        

In [0]:
tokenizeWords_prepareLabels(sentences[0], labels[0])

tokenizedWordsWithPreparedLabels = []

for x in range (0, len(sentences)):
    sent, lab = tokenizeWords_prepareLabels(sentences[x], labels[x])
    tokenizedWordsWithPreparedLabels.append((sent,lab))


In [21]:
print(tokenizedWordsWithPreparedLabels[0])
print(len(tokenizedWordsWithPreparedLabels))

(['Thousands', 'of', 'demons', '##tra', '##tors', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'])
43


In [0]:
tokenized_texts = [token_lab_pair[0] for token_lab_pair in tokenizedWordsWithPreparedLabels]
labels = [token_lab_pair[1] for token_lab_pair in tokenizedWordsWithPreparedLabels]


In [0]:
"""
Best according to bert paper
"""
MAX_LENGTH = 75
batch_size = 32

In [0]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LENGTH, dtype="long", truncating="post", padding="post")


In [0]:
tags = pad_sequences([[tag2id.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LENGTH, value=tag2id["PAD"], padding="post",
                     dtype="long", truncating="post")

In [26]:
input_ids[0]

array([26159,  1104,  8568,  4487,  5067,  1138,  9639,  1194,  1498,
        1106,  5641,  1103,  1594,  1107,  5008,  1105,  4555,  1103,
       10602,  1104,  1418,  2830,  1121,  1115,  1583,   119,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])

In [27]:
vals_tag

['I-org',
 'O',
 'B-art',
 'B-gpe',
 'I-geo',
 'I-art',
 'B-per',
 'I-per',
 'B-org',
 'B-geo',
 'B-tim',
 'PAD']

In [28]:
tags[0]

array([ 1,  1,  1,  1,  1,  1,  1,  1,  9,  1,  1,  1,  1,  1,  9,  1,  1,
        1,  1,  1,  3,  1,  1,  1,  1,  1, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11])

In [0]:
attention_masks = [[float(i != 0.0) for i in sent] for sent in input_ids]


In [30]:
len(attention_masks[0])

75

In [0]:
training_inputs, test_inputs, training_tags, test_tags = train_test_split(input_ids, tags,
                                                            random_state=42, test_size=0.1)
training_masks, test_masks, ignore1, ignore2 = train_test_split(attention_masks, input_ids,
                                             random_state=42, test_size=0.1)


In [0]:
training_inputs = torch.tensor(training_inputs)
training_tags = torch.tensor(training_tags)
training_masks = torch.tensor(training_masks)

test_inputs = torch.tensor(test_inputs)
test_tags = torch.tensor(test_tags)
test_masks = torch.tensor(test_masks)

In [0]:
training_data = TensorDataset(training_inputs, training_masks, training_tags)
training_sampler = RandomSampler(training_data)
training_dataloader = DataLoader(training_data, sampler=training_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [34]:
bert = BertForTokenClassification.from_pretrained("bert-base-cased",
    num_labels=len(tag2id),output_attentions = False, output_hidden_states = False
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [0]:
parameter_optimizer = list(bert.classifier.named_parameters())
optimizer_grouped_parameters = [{"params": [parameter for n, parameter in parameter_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [0]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(bert.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(bert.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [0]:
epochs = 2
max_grad_norm = 1.0
number_steps = len(training_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=number_steps
)

In [38]:
bert.cuda()
print()




In [39]:
for _ in trange(epochs, desc="Epoch"):
    bert.train()
    total_loss = 0

    #TRAINING
    
    for step, batch in enumerate(training_dataloader):
        
        #print(device)
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        #b_input_ids = torch.tensor(b_input_ids).to(torch.long)
        #b_input_mask = torch.tensor(b_input_mask).to(torch.long)
        #b_labels = torch.tensor(b_input_mask).to(torch.long)
        
        bert.zero_grad()
        
        outputs = bert(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        
        torch.nn.utils.clip_grad_norm_(parameters=bert.parameters(), max_norm=max_grad_norm)

        optimizer.step()
        scheduler.step()

    
    avg_training_loss = total_loss / len(training_dataloader)
    print("Average loss in training phase: ", avg_training_loss)

    
    #TESTING
    bert.eval()
    
    evaluation_loss, evaluation_accuracy = 0, 0
    nb_evaluation_steps, nb_evaluation_examples = 0, 0
    
    predictions = []
    actual_labels = []
    
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        #b_input_ids = torch.tensor(b_input_ids).to(torch.long)
        #b_input_mask = torch.tensor(b_input_mask).to(torch.long)
        #b_labels = torch.tensor(b_input_mask).to(torch.long)

        with torch.no_grad():
            outputs = bert(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        evaluation_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        actual_labels.extend(label_ids)

    evaluation_loss = evaluation_loss / len(test_dataloader)
    
    print("Validation loss: ", evaluation_loss)
    
    pred_tags = [vals_tag[p_i] for p, l in zip(predictions, actual_labels)
                                 for p_i, l_i in zip(p, l) if vals_tag[l_i] != "PAD"]
    
    test_tags = [vals_tag[l_i] for l in actual_labels
                                  for l_i in l if vals_tag[l_i] != "PAD"]
    
    print("Accuracy of Testing: ", accuracy_score(pred_tags, test_tags))
    print("F1-Score of Testing: ", (f1_score(pred_tags, test_tags)))
    print()

Epoch:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s]

Average loss in training phase:  2.3719520568847656
Validation loss:  1.7680237293243408
Accuracy of Testing:  0.7863247863247863
F1-Score of Testing:  0



Epoch: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s]

Average loss in training phase:  1.5955572128295898
Validation loss:  1.427056074142456
Accuracy of Testing:  0.8205128205128205
F1-Score of Testing:  0




