In [None]:
#  !pip install transformers seqeval[gpu]

In [None]:
!pip install transformers==3.1.0

In [None]:
from pprint import pprint
from google.colab import drive

import regex as re
import pandas as pd
import numpy as np

from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForTokenClassification

import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score

import glob
import os.path
import sys
import codecs

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

In [None]:
drive.mount('/content/drive', force_remount=True)

In [None]:
# Folders on the mounted Drive that hold the raw article text files.
train_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/train-articles"
dev_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-articles"
# Despite the "_file" suffix these are folders: they are handed to
# load_annotation_list_from_folder, which globs "*.labels" inside them.
train_labels_file = "/content/drive/MyDrive/NLP/project_5_data/datasets/train-labels-task-si/"
dev_labels_file = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-labels-task-si"

In [None]:
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read every file in <folder_name> whose name matches <file_pattern>.

    Returns a dictionary mapping article id -> the complete text of the file,
    exactly as read (newlines are kept). The article id is the file's base
    name up to the first ".", with its first seven characters removed
    (e.g. "article123.txt" -> "123"). Files are visited in sorted path order,
    which also fixes the insertion order of the returned dictionary.
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        # Drop the 7-character prefix ("article") to keep only the id part.
        article_id = os.path.basename(filename).split(".")[0][7:]
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles

In [None]:
# Check that the path to the datasets folder is correct; if not, adjust these variables accordingly.
# NOTE(review): dev_folder is assigned here a second time, with the same value as in the paths cell earlier.
dev_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-articles"
# propaganda_techniques_file is in the tools.tgz file (download it from the team page).
propaganda_techniques_file = "/content/drive/MyDrive/NLP/project_5_data/propaganda-techniques-scorer/data/propaganda-techniques-names-semeval2020task11.txt"

In [None]:
# !python3 task-SI_scorer.py -s project_5_data/baseline-output-SI.txt -r project_5_data/datasets/dev-labels-task-si/

In [None]:
# Build two parallel lists from the training folder: each article flattened to
# one line (its stripped lines joined by a single space) and the matching id
# (base file name up to the first ".", minus its first seven characters).
# NOTE(review): no later cell visible in this notebook reads
# train_articles_content / train_articles_id — confirm whether this cell is still needed.
file_list = glob.glob(os.path.join(train_folder, "*.txt"))
train_articles_content, train_articles_id = ([], [])
for filename in file_list:
    with open(filename, "r", encoding="utf-8") as f:
        train_articles_content.append(' '.join([line.strip() for line in f]))
        train_articles_id.append(os.path.basename(filename).split(".")[0][7:])


In [None]:
#Load the training and dev articles
train_articles = read_articles_from_file_list(train_folder)
dev_articles = read_articles_from_file_list(dev_folder)

len(train_articles), len(dev_articles)

(371, 75)

In [None]:
# Column positions inside one tab-separated row of a ".labels" file.
TASK_3_ARTICLE_ID_COL = 0
#TASK_3_TECHNIQUE_NAME_COL = 1
TASK_3_FRAGMENT_START_COL = 1
TASK_3_FRAGMENT_END_COL = 2

def extract_article_id_from_file_name(fullpathfilename):
    """Return the digits that follow "article" at the start of the base file name.

    Raises:
        ValueError: if the base name does not start with "article<digits>".
    """
    base_name = os.path.basename(fullpathfilename)
    match = re.match("article([0-9]+).*", base_name)
    if match is None:
        raise ValueError("Cannot extract an article id from file name " + base_name)
    return match.group(1)


def load_annotation_list_from_folder(folder_name, techniques_names):
    """Load the span annotations of every "*.labels" file in <folder_name>.

    Args:
        folder_name: folder that is searched for "*.labels" files.
        techniques_names: accepted for interface compatibility; not read here.

    Returns:
        dict mapping article id -> list of (start, end) tuples. The offsets are
        kept as the strings found in the file. Every file contributes an entry
        keyed by the id in its name, even when it holds no rows.

    Prints a message and calls sys.exit() when the folder has no ".labels" file.
    """
    file_list = glob.glob(os.path.join(folder_name, "*.labels"))
    if len(file_list)==0:
        print("Cannot load file list in folder " + folder_name)
        sys.exit()
    annotations = {}
    for filename in file_list:
        # Register the article even if its label file turns out to be empty.
        annotations.setdefault(extract_article_id_from_file_name(filename), [])
        with open(filename, "r") as f:
            for line in f:
                # Blank lines carry no annotation; indexing them would fail.
                if not line.strip():
                    continue
                row = line.rstrip().split("\t")
                # Key by the id stored in the row; setdefault avoids a KeyError
                # if it ever differs from the id derived from the file name.
                annotations.setdefault(row[TASK_3_ARTICLE_ID_COL], []).append(
                    (row[TASK_3_FRAGMENT_START_COL], row[TASK_3_FRAGMENT_END_COL])
                )

    return annotations

In [None]:
techniques_names = [ "propaganda" ]
train_annotation = load_annotation_list_from_folder(train_labels_file, techniques_names)

In [None]:
# Convert training data and labels into required format:
# one 0/1 flag per character of each article, 1 inside an annotated span.
train_labels = {}
for article in train_articles.keys():
    labels = [0] * len(train_articles[article])
    for annot in train_annotation[article]:
        # Spans are treated as [start, end), matching the dev-set conversion
        # and the span writer later in this notebook. Offsets are clamped to
        # the text so a stray annotation can never lengthen the flag list.
        span_start = max(int(annot[0]), 0)
        span_end = min(int(annot[1]), len(labels))
        for position in range(span_start, span_end):
            labels[position] = 1
    train_labels[article] = labels

In [None]:
# Collapse the per-character flags in train_labels into one label per
# space-separated word of the whitespace-normalised article text.
# NOTE: this cell reads the per-character train_labels and then overwrites it
# with per-word labels, so it must run exactly once after the previous cell.
train_labels_str = {}
for article_id in train_articles.keys():
    index = 0       # character position looked up in the per-character flags
    word_index = 0  # position of the current word in labels / labels_str
    # labels = [0] * len(train_articles[article_id].replace('\n\n',' ').split(' '))
    labels = [0] * len(train_articles[article_id].replace('\n\n',' ').replace('\n', ' ').strip().split(' '))
    labels_str = ['O'] * len(labels)

    # After every newline becomes a space, chunks are separated by two
    # consecutive spaces (e.g. where the raw text had "\n\n").
    for sentence in train_articles[article_id].replace('\n', ' ').strip().split('  '):
        for word in sentence.split(' '):
            # Only the flag at the word's first character decides its label.
            if train_labels[article_id][index] == 1:
                labels[word_index] = 1
                labels_str[word_index] = 'I-Prop'
            word_index += 1
            index += len(word) + 1  # advance past the word and one separator
        index += 1                  # one more character for the second separator between chunks

    train_labels[article_id] = labels
    train_labels_str[article_id] = labels_str

In [None]:
len(train_labels), len(train_labels_str)

(371, 371)

In [None]:
# One row per article: id, whitespace-normalised text and the comma-joined
# word labels. Rows are collected first and the frame is built once, instead of
# enlarging an empty DataFrame cell by cell.
train_rows = []
for article_id in train_articles.keys():
    train_rows.append({
        'ID': article_id,
        'text': train_articles[article_id].replace('\n\n',' ').replace('\n', ' ').strip(),
        'labels': ','.join(str(i) for i in train_labels_str[article_id]),
    })
train_df = pd.DataFrame(train_rows, columns = ['ID','text','labels'])
# del train_df['ID']

In [None]:
# Cut every article into overlapping windows of at most `no_words` words.
# Consecutive windows start `no_words - window_overlap` words apart, so each
# window repeats the last `window_overlap` words of the previous one. Text and
# labels are split once per article and sliced with the same bounds so they
# stay aligned word for word.
no_words = 256        # maximum number of words per window
window_overlap = 56   # words shared by two consecutive windows
window_rows = []
for i in range(len(train_df)):
    words = train_df.loc[i,'text'].split(' ')
    word_labels = train_df.loc[i,'labels'].split(',')
    for start in range(0, len(words), no_words - window_overlap):
        window_rows.append({
            'ID': train_df.loc[i,'ID'],
            'text': ' '.join(words[start:start+no_words]),
            'labels': ','.join(word_labels[start:start+no_words]),
        })
new_train_df = pd.DataFrame(window_rows, columns = train_df.columns)

In [None]:
new_train_df

Unnamed: 0,ID,text,labels
0,111111111,Next plague outbreak in Madagascar could be 's...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,111111111,"cases, and Tedros said Wednesday the death tol...","O,O,O,O,O,O,O,O,O,O,O,I-Prop,I-Prop,I-Prop,I-P..."
2,111111112,US bloggers banned from entering UK Two promin...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,111111112,"was murdered last month. Keith Vaz, chairman o...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,111111112,stir up more trouble. Britain doesn't need mor...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...,...
1894,999001621,target for anyone wrapped up in Russia’s intel...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1895,999001621,the public about their scheme. --- This is a M...,"I-Prop,I-Prop,I-Prop,I-Prop,I-Prop,O,O,O,O,O,O..."
1896,999001970,"SNL Indian Comedian Silenced for ""Offensive Jo...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1897,999001970,"would have made of Richard Pryor, or at this p...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
len(new_train_df.loc[2,'text'].split(' ')), len(new_train_df.loc[2,'labels'].split(','))

(256, 256)

In [None]:
# Keep only the model inputs. drop() returns a new frame, so new_train_df keeps
# its ID column and this cell can be re-run without a KeyError.
train_df = new_train_df.drop(columns=['ID'])

In [None]:
# labels_to_ids = {k: v for v, k in enumerate(['B-Prop','I-Prop', 'O'])}
# ids_to_labels = {v: k for v, k in enumerate(['B-Prop','I-Prop', 'O'])}
# labels_to_ids, ids_to_labels

# Two-way mapping between tag names and the integer class ids fed to the model.
ids_to_labels = dict(enumerate(['O', 'I-Prop']))
labels_to_ids = {tag: tag_id for tag_id, tag in ids_to_labels.items()}
labels_to_ids, ids_to_labels

({'I-Prop': 1, 'O': 0}, {0: 'O', 1: 'I-Prop'})

In [None]:
# Hyper-parameters shared by the cells below.
MAX_LEN = 512           # max_len given to the dataset class (tokenizer padding/truncation length)
TRAIN_BATCH_SIZE = 2    # batch size of the training DataLoader
VALID_BATCH_SIZE = 1    # batch size stored in test_params
DEV_BATCH_SIZE = 1      # presumably the batch size meant for the dev set — TODO confirm where it is used
EPOCHS = 7              # number of training epochs
LEARNING_RATE = 1e-05   # learning rate of the Adam optimizer
MAX_GRAD_NORM = 10      # max_norm for gradient clipping in train()

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
class dataset(Dataset):
    """Token-classification dataset built from a frame with `text` and `labels` columns.

    Each row holds space-separated words in `text` and one comma-separated tag
    per word in `labels`. An item is the tokenizer encoding of the text plus a
    `labels` tensor that carries the word's class id on the first token of each
    word and -100 everywhere else.

    Args:
        dataframe: frame indexable as dataframe.text[i] / dataframe.labels[i].
        tokenizer: callable tokenizer that supports return_offsets_mapping and
            convert_ids_to_tokens.
        max_len: length every encoding is padded / truncated to.
        label_map: optional tag -> class id mapping; when omitted the
            module-level `labels_to_ids` is used.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = label_map

    def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.text[index].strip()
        word_labels = self.data.labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        encoding = self.tokenizer(sentence,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        label_map = labels_to_ids if self.label_map is None else self.label_map
        labels = [label_map[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        # Use the tokenizer this dataset was built with, not a global one.
        tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"])
        offsets = encoding['offset_mapping']
        i = 0
        first_word_pending = True
        for idx in range(len(encoding['input_ids'])):
            # The very first real token starts at character 0 and is non-empty.
            starts_text = offsets[idx][0] == 0 and offsets[idx][1] != 0 and first_word_pending
            # Later words are recognised by the 'Ġ' (leading space) marker.
            starts_word = tokens[idx][0] == 'Ġ' and len(tokens[idx]) > 1
            if not (starts_text or starts_word):
                continue
            if i >= len(labels):
                # More word starts than labels: leave the remainder at -100
                # instead of failing with an IndexError.
                break
            encoded_labels[idx] = labels[i]
            i += 1
            if starts_text:
                first_word_pending = False

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

In [None]:
# Shuffle the windows and split them. With train_size = 1 every row is sampled
# into the training part, so the held-out part is empty.
train_size = 1
sampled_rows = train_df.sample(frac=train_size, random_state=200)
test_dataset = train_df.drop(sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both parts in the token-classification dataset.
training_set, testing_set = (
    dataset(frame, tokenizer, MAX_LEN) for frame in (train_dataset, test_dataset)
)

FULL Dataset: (1899, 2)
TRAIN Dataset: (1899, 2)
TEST Dataset: (0, 2)


In [None]:
training_set[1209]

In [None]:
print(tokenizer.decode(training_set[1209]["input_ids"].tolist()))

<s>Las Vegas Shooting: A THIRD Timeline Emerges Editor's Note: This is probably a reason why reporters Laura Loomer and Mike Tokes were barred from entering the presser. They would have posed real genuine questions of Sheriff Lombardo and forced him to stumble all over himself in answering them. The words “conspiracy theorists” are being bandied about. Are they for real? A well-coordinated, meticulously planned attack on concertgoers leads to the murder of 58 Americans at a country music festival in Las Vegas, with over 500 injured, and they have no explanation or motive. Do the FBI and law enforcement think people won’t talk about it or speculate as to what happened? Are they for real? Further, the FBI insists there is no jihad motive, while saying they don’t know his motive. How can they possibly hold those two contradictory ideas at the same time? The sheriff, alluding to allegations of a conspiracy between his department, the F.B.I., and MGM — supposedly in an effort to establish a

In [None]:
# Show token, assigned label and character offsets side by side for one sample.
# The sample is encoded once; indexing training_set re-runs the tokenizer.
sample = training_set[1209]
sample_tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])
for token, mapping, label in zip(sample_tokens, sample['offset_mapping'], sample["labels"]):
    print('{0:10} {1:4}  {2:15}'.format(token, label.item(), str(mapping.tolist())))
    

In [None]:
# DataLoader settings for the training split and the (currently unused) test split.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
# testing_loader = DataLoader(testing_set, **test_params)

In [None]:
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(labels_to_ids))
model.to(device)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [None]:
# Sanity check: push one training example through the model before any
# training step and look at the loss and raw outputs.
inputs = training_set[2]
# unsqueeze(0) adds the batch dimension the model expects.
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

# Passing labels makes the model return the loss as the first output element.
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss, outputs

(tensor(0.9811, device='cuda:0', grad_fn=<NllLossBackward0>),
 (tensor(0.9811, device='cuda:0', grad_fn=<NllLossBackward0>),
  tensor([[[-0.0923,  0.1653],
           [-0.0453,  0.0106],
           [-0.2187,  0.1765],
           ...,
           [-0.2563,  0.2377],
           [-0.2563,  0.2377],
           [-0.2563,  0.2377]]], device='cuda:0', grad_fn=<AddBackward0>)))

In [None]:
input_ids.shape

torch.Size([1, 512])

In [None]:
labels.shape

torch.Size([1, 512])

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 512, 2])

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Defining the training function on dataset
def train(epoch):
    """Run one optimisation pass over `training_loader`.

    Works on the notebook-level `model`, `optimizer`, `training_loader` and
    `device`. Prints the running mean loss every 100 steps and, at the end,
    the mean loss and mean per-batch accuracy of the epoch. Accuracy is
    computed only at positions whose label is not -100.

    Args:
        epoch: index of the current epoch; not read inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        # Index the output instead of tuple-unpacking it: this works for the
        # plain tuple as well as for output objects that support indexing.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_positions = flattened_targets != -100
        active_targets = torch.masked_select(flattened_targets, active_positions)
        active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must come after backward() so it acts on the
        # gradients of this step, and before step() so the update uses them
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
torch.cuda.empty_cache()

In [None]:
# Train for the number of epochs set in the hyper-parameter cell.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

In [None]:
# Persist the fine-tuned model to Drive. torch.save on the module object stores
# the whole pickled model, not just its weights.
# NOTE(review): loading such a file presumably needs the same transformers
# version to be importable — confirm before reusing the checkpoint elsewhere.
torch.save(model, "/content/drive/MyDrive/NLP/RoBERTa_Task_SI.pt")
# To reuse a saved model instead of retraining:
# model = torch.load("/content/drive/MyDrive/NLP/RoBERTa_Task_SI.pt", map_location=torch.device('cpu'))
# model.eval()

In [None]:
def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` without updating it.

    Prints the running mean loss every 100 batches and, at the end, the mean
    loss and the mean per-batch accuracy. Only positions whose label is not
    -100 are scored.

    Args:
        model: token-classification model; called with input_ids,
            attention_mask and labels.
        testing_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.

    Returns:
        (labels, predictions): two equally long lists of tag names (looked up
        in `ids_to_labels`), gold and predicted, for every scored position in
        loader order. Both lists are empty when the loader yields no batch.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            # Index the output instead of tuple-unpacking it: this works for
            # the plain tuple as well as for indexable output objects.
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = outputs[0], outputs[1]
            eval_loss += loss.item()
            nb_eval_steps += 1

            if (idx+1) % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_positions = flattened_targets != -100
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

            # Store plain Python ints so no device tensors are kept alive.
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

    if nb_eval_steps == 0:
        # An empty loader has no mean loss or accuracy; avoid dividing by zero.
        print("Validation skipped: the loader produced no batches")
        return [], []

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
#Convert the dev data into the required format:
# one 0/1 flag per character of each article, 1 inside an annotated span.
dev_annotation = load_annotation_list_from_folder(dev_labels_file, techniques_names)

dev_labels = {}
for article in dev_articles.keys():
    labels = [0] * len(dev_articles[article])
    for annot in dev_annotation[article]:
        # Spans are [start, end). Offsets are clamped to the text so a stray
        # annotation can never lengthen the flag list via slice assignment.
        span_start = max(int(annot[0]), 0)
        span_end = min(int(annot[1]), len(labels))
        for position in range(span_start, span_end):
            labels[position] = 1
    dev_labels[article] = labels


In [None]:
# Collapse the per-character dev flags into one label per space-separated word
# of the whitespace-normalised article text.
# NOTE: this cell reads the per-character dev_labels and then overwrites it
# with per-word labels, so it must run exactly once after the previous cell.
dev_labels_str = {}
for article_id in dev_articles.keys():
    index = 0       # character position looked up in dev_labels[article_id]
    word_index = 0  # position of the current word in labels / labels_str

    labels = [0] * len(dev_articles[article_id].replace('\n\n','\n').replace('\n', ' ').strip().split(' '))
    labels_str = ['O'] * len(labels)
    
    first_sentence = True
    # Unlike the training conversion, the text is walked line by line after
    # each "\n\n" has been collapsed to a single "\n".
    for sentence in dev_articles[article_id].replace('\n\n', '\n').strip().split('\n'):
        for word in sentence.split(' '):
            # Only the flag at the word's first character decides its label.
            if dev_labels[article_id][index] == 1:
                labels[word_index] = 1
                labels_str[word_index] = 'I-Prop'
            word_index += 1
            index += len(word) + 1  # advance past the word and one separator
        # Only the first line adds one extra character of offset.
        # NOTE(review): presumably this compensates for a blank line after the
        # title; later "\n\n" breaks are not compensated — confirm.
        if first_sentence:
            first_sentence = False
            index += 1
        # index += 1

    dev_labels[article_id] = labels
    dev_labels_str[article_id] = labels_str


In [None]:
# One row per dev article: id, whitespace-normalised text and the comma-joined
# word labels. Rows are collected first and the frame is built once, instead of
# enlarging an empty DataFrame cell by cell.
dev_rows = []
for article_id in dev_articles.keys():
    dev_rows.append({
        'ID': article_id,
        'text': dev_articles[article_id].replace('\n\n','\n').replace('\n', ' ').strip(),
        'labels': ','.join(dev_labels_str[article_id]),
    })
dev_df = pd.DataFrame(dev_rows, columns = ['ID','text','labels'])


In [None]:
# Cut every dev article into overlapping windows of at most `no_words` words.
# Consecutive windows start `no_words - window_overlap` words apart. Text and
# labels are split once per article and sliced with the same bounds so they
# stay aligned word for word.
no_words = 256        # maximum number of words per window
window_overlap = 56   # words shared by two consecutive windows
window_rows = []
for i in range(len(dev_df)):
    words = dev_df.loc[i,'text'].split(' ')
    word_labels = dev_df.loc[i,'labels'].split(',')
    for start in range(0, len(words), no_words - window_overlap):
        window_rows.append({
            'ID': dev_df.loc[i,'ID'],
            'text': ' '.join(words[start:start+no_words]),
            'labels': ','.join(word_labels[start:start+no_words]),
        })
new_dev_df = pd.DataFrame(window_rows, columns = dev_df.columns)

# Keep the article id of every window separately and hand only the model
# inputs on. drop() returns a new frame, so new_dev_df keeps its ID column.
# NOTE: dev_df is rebound from per-article rows to per-window rows here, so
# re-running this cell requires re-running the cell that builds dev_df first.
dev_id = new_dev_df['ID']
dev_df = new_dev_df.drop(columns=['ID'])

In [None]:
dev_df

In [None]:
#All data in dev set
dev_dataset = dev_df

# Deterministic, one-window-at-a-time loader over the whole dev set.
dev_params = dict(batch_size=1, shuffle=False, num_workers=0)
dev_set = dataset(dev_dataset, tokenizer, MAX_LEN)
dev_loader = DataLoader(dev_set, **dev_params)

In [None]:
# Run predictions 1 by 1
# Each window is evaluated on its own so its predictions can be filed under
# the article it came from (dev_id gives the article id of window i).
prediction_map = {}
actual_labels_map = {}

for i in range(len(dev_df)):
    print(i)
    window = dev_df.loc[i]
    single_dev_row = pd.DataFrame({'text': [window.text], 'labels': [window.labels]})

    # NOTE: dev_set / dev_loader are rebound on every pass, so after the loop
    # they refer to the last window only.
    dev_set = dataset(single_dev_row, tokenizer, 512)
    dev_loader = DataLoader(dev_set, **dev_params)
    labels, predictions = valid(model, dev_loader)
    prediction_map.setdefault(dev_id[i], []).append(predictions)
    actual_labels_map.setdefault(dev_id[i], []).append(labels)
    print()

In [None]:
# NOTE(review): dev_loader was last rebound inside the per-window prediction
# loop, so on a top-to-bottom run this evaluates only the final window rather
# than the whole dev set — confirm that this is intended.
labels_actual, predictions = valid(model, dev_loader)

In [None]:
# Mean number of space-separated words per dev window, and the window count.
total_word_len = [len(window_text.split(' ')) for window_text in dev_df.text]
np.mean(total_word_len), len(dev_df.text)

(215.66969696969696, 330)

In [None]:
# Count the positions where a non-'O' gold tag was predicted correctly.
# labels_actual and predictions come from the same valid() call, so they are
# aligned position by position. Tags are compared with != rather than
# `is not`: identity comparison of strings is not a reliable equality test.
correct_word_prediction = 0
for index, (gold_tag, predicted_tag) in enumerate(zip(labels_actual, predictions)):
    if gold_tag == predicted_tag and gold_tag != 'O':
        correct_word_prediction += 1
        print(index, gold_tag)

print(correct_word_prediction)

14 I-Prop
15 I-Prop
2


In [None]:
task_SI_output_file = '/content/drive/MyDrive/NLP/project_5_data/roberta_base_SI_output_256tokens_e7.txt'

In [None]:
# Stitch the per-window predictions of every article back into one 0/1 flag
# per word. Windows hold up to 256 words and start 200 words apart (they
# overlap by 56), so window k is written at word offset k * 200 and a later
# window overwrites the overlapping tail of the previous one. Placing each
# window at its true offset keeps the alignment intact even if one window
# yields fewer predictions than it has words.
window_stride = 200
prediction_labels_map = {}
for article_id in dev_articles.keys():
    prediction_list = prediction_map[article_id]
    total_words = max(
        (window_number * window_stride + len(window_predictions)
         for window_number, window_predictions in enumerate(prediction_list)),
        default=0,
    )
    pred_labels_for_id = [0] * total_words
    for window_number, window_predictions in enumerate(prediction_list):
        index = window_number * window_stride
        for label in window_predictions:
            pred_labels_for_id[index] = 1 if label == 'I-Prop' else 0
            index += 1
    prediction_labels_map[article_id] = pred_labels_for_id
    print("Article {} - Predicted 'I-Prop' tokens {} ".format(article_id, pred_labels_for_id.count(1)))

In [None]:
# Turn the per-word 0/1 predictions back into character spans of the raw
# article text and write them as "<article id>\t<start>\t<end>" rows, with
# <end> exclusive.
with open(task_SI_output_file, "w") as fout:

    for dev_article_id in dev_articles.keys():
        print("Processing Article:", dev_article_id)
        article_text = dev_articles[dev_article_id]
        pred_labels = prediction_labels_map[dev_article_id]

        print("Word count in article:", len(article_text.replace('\n\n','\n').replace('\n', ' ').strip().split(' ')))

        span_start = None  # offset of the first character of the open span, if any
        span_end = 0       # one past the last predicted character seen in the open span
        label_index = 0    # index of the word that the character at idx belongs to
        idx = 0            # current character offset in the raw text

        while idx < len(article_text) and label_index < len(pred_labels):
            # A blank line ("\n\n") counts as a single word separator, because
            # the word list was built after collapsing it to one character.
            if article_text[idx:idx+2] == '\n\n':
                idx += 2
                label_index += 1
                continue
            if article_text[idx] == ' ' or article_text[idx] == '\n':
                idx += 1
                label_index += 1
                continue

            if pred_labels[label_index] == 1:
                if span_start is None:
                    span_start = idx
                # The end follows the last predicted character, so separators
                # after the final word of a span are not part of it.
                span_end = idx + 1
            elif span_start is not None:
                fout.write("%s\t%s\t%s\n" % (dev_article_id, span_start, span_end))
                span_start = None
            idx += 1

        # A span that runs to the end of the article (or of the predictions)
        # is still open here and must be written as well.
        if span_start is not None:
            fout.write("%s\t%s\t%s\n" % (dev_article_id, span_start, span_end))

        print("Prediction length for article {0:9}:{1:5}".format(dev_article_id, label_index))
        print("Done Processing\n")
    print("Predictions writted to file: ", task_SI_output_file)

Word count in article: 289
Prediction length for article 730081389:  289
Done Processing

Word count in article: 474
Prediction length for article 730093263:  474
Done Processing

Word count in article: 901
Prediction length for article 730246508:  901
Done Processing

Word count in article: 141
Prediction length for article 730269378:  141
Done Processing

Word count in article: 404
Prediction length for article 738028498:  404
Done Processing

Word count in article: 650
Prediction length for article 738361208:  650
Done Processing

Word count in article: 372
Prediction length for article 738442776:  372
Done Processing

Word count in article: 1055
Prediction length for article 738447109: 1055
Done Processing

Word count in article: 321
Prediction length for article 738542398:  321
Done Processing

Word count in article: 390
Prediction length for article 738781754:  390
Done Processing

Word count in article: 582
Prediction length for article 740235127:  582
Done Processing

Word coun

In [None]:
#Compute the Scores
%cd /content/drive/MyDrive/NLP/
!python3 task-SI_scorer.py -s project_5_data/roberta_base_SI_output_256tokens_e7.txt -r project_5_data/datasets/dev-labels-task-si/