In [1]:
!pip install transformers==3.1.0

Collecting transformers==3.1.0
  Downloading transformers-3.1.0-py3-none-any.whl (884 kB)
[K     |████████████████████████████████| 884 kB 5.1 MB/s 
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.3 MB/s 
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 9.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 9.3 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=b0a5dd12578c83a6d33d28408359361a5115684e21f9f03918f5140cd0b9116a
  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4

In [2]:
import glob
import os.path
import codecs
import torch
import numpy as np

from google.colab import drive
from torch import cuda, tensor
from sklearn import preprocessing
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Mount Google Drive at /content/drive; every dataset and output path configured
# below lives under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
# Pick the device the model and its input tensors are moved to:
# the GPU when torch can see one, otherwise the CPU.
gpu_visible = cuda.is_available()
device = 'cuda' if gpu_visible else 'cpu'
print(device)

cpu


In [6]:
# Every input and output of this notebook lives under one Google Drive folder.
# Change these two roots to relocate the project; all derived paths follow.
NLP_ROOT = "/content/drive/MyDrive/NLP"
DATASETS_DIR = NLP_ROOT + "/project_5_data/datasets"

# Folders holding the raw article text files (read with a "*.txt" pattern)
train_folder = DATASETS_DIR + "/train-articles"
dev_folder = DATASETS_DIR + "/dev-articles"

# Tab-separated span annotation files for the train and dev articles
train_labels_file = DATASETS_DIR + "/train-task-flc-tc.labels"
dev_template_labels_file = DATASETS_DIR + "/dev-task-flc-tc.labels"

# Destination of the prediction file written by the classification cell
task_TC_output_file = NLP_ROOT + "/bert_no_wordnet_TC_e3_multilabel.txt"

In [7]:
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """Load every file in ``folder_name`` matching ``file_pattern`` into a dict.

    Args:
        folder_name: directory to search (not recursive).
        file_pattern: glob pattern applied inside ``folder_name``.

    Returns:
        dict mapping article id -> full file content (str). The id is the
        file's base name up to its first ".", with the first 7 characters
        dropped (presumably an "article" prefix -- confirm against the data).
        Entries are inserted in sorted path order.
    """
    articles = {}
    for path in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        stem = os.path.basename(path).split(".")[0]
        # codecs.open performs no newline translation, so character offsets
        # into the returned text match the bytes decoded from disk.
        with codecs.open(path, "r", encoding="utf8") as handle:
            articles[stem[7:]] = handle.read()
    return articles

In [8]:
def read_predictions_from_file(filename):
    """Parse a tab-separated span annotation file.

    Each non-blank line must hold exactly four tab-separated fields:
    ``article_id``, ``label``, ``span_start``, ``span_end``. Blank or
    whitespace-only lines (e.g. a trailing newline at the end of the file)
    are skipped instead of crashing the unpack.

    Args:
        filename: path of the annotation file.

    Returns:
        Four parallel lists ``(article_ids, span_starts, span_ends, labels)``.
        Note the order differs from the column order in the file. All values
        are kept as strings; span offsets are NOT converted to int here.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields.
    """
    articles_id, span_starts, span_ends, gold_labels = [], [], [], []
    # Explicit encoding keeps this reader consistent with the article reader
    # and independent of the platform default.
    with open(filename, "r", encoding="utf8") as f:
        for row in f:  # stream the file instead of loading it with readlines()
            row = row.rstrip()
            if not row:
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels

In [9]:
def get_texts(articles_map, article_ids, span_starts, span_ends):
    """Cut each annotated span out of its article.

    Args:
        articles_map: dict mapping article id -> article text.
        article_ids, span_starts, span_ends: parallel sequences; the offsets
            are converted with ``int()`` and used as a ``[start:end]`` slice.

    Returns:
        list of text snippets, one per (id, start, end) triple, in input order.
    """
    return [
        articles_map.get(article_id)[int(sp_start):int(sp_end)]
        for article_id, sp_start, sp_end in zip(article_ids, span_starts, span_ends)
    ]

In [10]:
# Label encoder for the technique names; it is only created here and is fitted
# on the training labels in a later cell.
le = preprocessing.LabelEncoder()

# article id -> full article text, one map per dataset folder
articles_map = read_articles_from_file_list(train_folder)
dev_articles_map = read_articles_from_file_list(dev_folder) 

In [11]:
# Sanity check: number of train / dev articles loaded. The set(...) sizes repeat
# the dict sizes, since dict keys are unique by construction.
len(articles_map), len(dev_articles_map), len(set(articles_map.keys())), len(set(dev_articles_map.keys()))

(371, 75, 371, 75)

In [12]:
# Read the training annotations (article id, span offsets, technique label) and
# slice the annotated snippets out of the training articles by character offset.
train_articles_ids, train_span_starts, train_span_ends, train_labels = read_predictions_from_file(train_labels_file)
print("Loaded training %d annotations from %d train articles" % (len(train_span_starts), len(set(train_articles_ids))))
train_texts = get_texts(articles_map, train_articles_ids, train_span_starts, train_span_ends)

# Same for the development set; its labels file is parsed with the same reader.
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)
print("Loaded dev %d annotations from %d dev articles" % (len(dev_span_starts), len(set(dev_article_ids))))
dev_texts = get_texts(dev_articles_map, dev_article_ids, dev_span_starts, dev_span_ends)

Loaded training 6128 annotations from 357 train articles
Loaded dev 1063 annotations from 74 dev articles


In [13]:
len(train_texts), len(dev_texts)

(6128, 1063)

In [14]:
# Fit the encoder on the training technique names, then derive both lookup
# tables (encoded id -> name and name -> encoded id) from the fitted classes.
le.fit(train_labels)
encoded_classes = le.transform(le.classes_)
label_to_class_map = dict(zip(encoded_classes, le.classes_))
class_to_label_map = dict(zip(le.classes_, encoded_classes))

# Integer targets for both datasets, encoded with the train-fitted encoder
train_labels_encoded = le.transform(train_labels)
dev_labels_encoded = le.transform(dev_labels)

In [None]:
# Display both lookup tables: encoded id -> technique name, and the inverse.
label_to_class_map, class_to_label_map

({0: 'Appeal_to_Authority',
  1: 'Appeal_to_fear-prejudice',
  2: 'Bandwagon,Reductio_ad_hitlerum',
  3: 'Black-and-White_Fallacy',
  4: 'Causal_Oversimplification',
  5: 'Doubt',
  6: 'Exaggeration,Minimisation',
  7: 'Flag-Waving',
  8: 'Loaded_Language',
  9: 'Name_Calling,Labeling',
  10: 'Repetition',
  11: 'Slogans',
  12: 'Thought-terminating_Cliches',
  13: 'Whataboutism,Straw_Men,Red_Herring'},
 {'Appeal_to_Authority': 0,
  'Appeal_to_fear-prejudice': 1,
  'Bandwagon,Reductio_ad_hitlerum': 2,
  'Black-and-White_Fallacy': 3,
  'Causal_Oversimplification': 4,
  'Doubt': 5,
  'Exaggeration,Minimisation': 6,
  'Flag-Waving': 7,
  'Loaded_Language': 8,
  'Name_Calling,Labeling': 9,
  'Repetition': 10,
  'Slogans': 11,
  'Thought-terminating_Cliches': 12,
  'Whataboutism,Straw_Men,Red_Herring': 13})

In [15]:
class TC_Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their class labels.

    ``encodings`` is any mapping whose ``items()`` yields
    (field name, indexable sequence) pairs; ``labels`` is an indexable
    sequence of the same length. Indexing returns a dict of tensors holding
    every encoding field plus a ``'labels'`` entry. ``idx`` is applied directly
    to the underlying sequences, so whatever they accept (an int, a slice)
    works here too.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset length is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [16]:
# Pretrained tokenizer and BERT encoder with a fresh classification head.
# The head size is taken from the fitted label encoder instead of a hard-coded
# 14, so it always matches the label ids used for training and decoding.
# NOTE(review): the saved output under this cell names the bert-base-cased
# checkpoint, so it was produced by a different revision of this cell --
# re-run the cell to refresh it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))
# Move the weights to the selected device; as the last expression of the cell
# this also displays the model architecture.
model.to(device)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [17]:
# Tokenize both text lists with identical settings (truncate at 512 tokens,
# pad within each call), then wrap encodings and integer labels as datasets.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
dev_encodings = tokenizer(dev_texts, **tokenizer_kwargs)

train_dataset = TC_Dataset(encodings=train_encodings, labels=train_labels_encoded)
dev_dataset = TC_Dataset(encodings=dev_encodings, labels=dev_labels_encoded)

In [None]:
# Fine-tuning configuration. Note that checkpoints (output_dir) and logs
# (logging_dir) are both written to the same Drive folder.
training_args = TrainingArguments(
    output_dir= '/content/drive/MyDrive/NLP',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=16,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir= '/content/drive/MyDrive/NLP',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
)

# The dev set is passed as eval_dataset; no metric function is supplied here.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=dev_dataset)

# Fine-tune `model` on the training dataset.
trainer.train()

# Evaluate on the dev dataset; as the last expression, the result is the cell output.
trainer.evaluate()

In [None]:
# Alternative (disabled): persist only the weights via state_dict and reload them.
# NOTE(review): the bare constructor call below likely needs a config (or
# from_pretrained) before load_state_dict -- confirm before enabling.
# torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/BERT_Task_TC_sd.pt")
# model = BertForSequenceClassification()
# model.load_state_dict(torch.load("/content/drive/MyDrive/NLP/BERT_Task_TC_sd.pt"))
# model.eval()

# Serialize the whole fine-tuned model object to Drive.
torch.save(model, "/content/drive/MyDrive/NLP/BERT_Task_TC.pt")
# To reload it, the model class must be importable in the loading session:
# model = torch.load("/content/drive/MyDrive/NLP/BERT_Task_TC.pt", map_location=torch.device('cpu'))
# model.eval()

In [None]:
def stats(dev_article_ids, dev_span_starts, dev_span_ends):
    """Count how often each (article id, span start, span end) triple occurs.

    Args:
        dev_article_ids, dev_span_starts, dev_span_ends: parallel sequences
            describing one annotation per position.

    Returns:
        Nested dict ``counts[article_id][span_start][span_end] -> int``.

    Side effects:
        Prints one ``(id, start, end, count)`` tuple for every triple that
        occurs more than once, in first-seen order.
    """
    count_map = {}
    for article_id, span_start, span_end in zip(dev_article_ids, dev_span_starts, dev_span_ends):
        by_end = count_map.setdefault(article_id, {}).setdefault(span_start, {})
        by_end[span_end] = by_end.get(span_end, 0) + 1

    # Report the spans that carry more than one annotation.
    for article_id, by_start in count_map.items():
        for span_start, by_end in by_start.items():
            for span_end, occurrences in by_end.items():
                if occurrences > 1:
                    print((article_id, span_start, span_end, occurrences))
    return count_map

In [None]:
# Multi-label classification - ONLY the dev dataset
# The TC scorer requires multi-label predictions in a specific format; each
# output line is "<article_id>\t<technique>\t<span_start>\t<span_end>".
# The scorer also requires a fixed number of predictions, and writing more
# labels will throw an error, so each span gets exactly as many labels as it
# has annotations in the dev labels file.

EVAL_BATCH_SIZE = 16  # same value as per_device_eval_batch_size in the training cell

model.eval()
sigmoid = torch.nn.Sigmoid()

# counts[article_id][span_start][span_end] -> number of annotations for that span;
# a count above 1 means the span carries several techniques.
multi_label_id_span_map = stats(dev_article_ids, dev_span_starts, dev_span_ends)
# Spans whose labels were already written. A (id, start, end) tuple is used as
# the key because concatenating the three strings is ambiguous
# (e.g. "1"+"23"+"4" and "12"+"3"+"4" collide); a set gives O(1) membership tests.
processed_article_ids = set()

print("\n\n")
print("Running Model for Classification")
predictions = []
# Run the dev set through the model in mini-batches rather than as one giant
# batch, to bound memory use. shuffle=False keeps the logits aligned with
# dev_article_ids / dev_span_starts / dev_span_ends.
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False)
with torch.no_grad():
    for batch in dev_loader:
        result = model(batch['input_ids'].to(device),
                       attention_mask=batch['attention_mask'].to(device),
                       labels=batch['labels'].to(device))
        # With labels supplied, the logits are the second element of the output.
        logits = result[1]
        predictions.append(logits.detach().cpu().numpy())

final_predictions = []
predictions = np.concatenate(predictions, axis=0)

print("Computing Predictions and Writing output to file")
with open(task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(dev_article_ids, predictions, dev_span_starts, dev_span_ends):
        span_key = (article_id, span_start, span_end)
        if span_key in processed_article_ids:
            # All labels of this repeated span were written at its first occurrence.
            continue

        pred_probas = sigmoid(tensor(prediction))
        count = multi_label_id_span_map[article_id][span_start][span_end]

        if count > 1:
            # Span annotated `count` times: emit its `count` highest-scoring labels.
            processed_article_ids.add(span_key)
            pred_labels = np.argsort(pred_probas.tolist())[-1:-(1+count):-1]
            predicted_classes = [label_to_class_map[label.item()] for label in pred_labels]
        else:
            # Span annotated once: emit only the highest-scoring label.
            pred_label = np.argmax(pred_probas)
            predicted_classes = [label_to_class_map[pred_label.item()]]

        for pred_class in predicted_classes:
            final_predictions.append(pred_class)
            fout.write("%s\t%s\t%s\t%s\n" % (article_id, pred_class, span_start, span_end))

print("Sample final_predictions:\n", final_predictions[0:5])
print(len(final_predictions))
print("Predictions written to file " + task_TC_output_file)

In [None]:
# Uncomment the cells below to perform single-label classification. Refer to the cell further below for true multi-label classification.

In [None]:
# #Single Label Classification - All Cases

# model.eval()
# result = 0
# predictions, true_labels = [], []

# print("Running the model for classification")
# with torch.no_grad():
#     result = model(dev_dataset[0:len(dev_dataset)]['input_ids'].to(device),
#                     attention_mask=dev_dataset[0:len(dev_dataset)]['attention_mask'].to(device),
#                     labels=dev_dataset[0:len(dev_dataset)]['labels'].to(device))
    
#     logits = result[1]

#     logits = logits.detach().cpu().numpy()
#     true_label_ids = dev_dataset[0:len(dev_dataset)]['labels'].to('cpu').numpy()

#     # Store predictions and true labels
#     predictions.append(logits)
#     true_labels.append(true_label_ids)

# print("Computing Predictions")
# #Flatten the Prediction across batches and find the one with highest probability
# flat_predictions_sl = np.concatenate(predictions, axis=0)
# flat_predictions_sl = np.argmax(flat_predictions_sl, axis=1).flatten()

# flat_true_labels = np.concatenate(true_labels, axis=0)

# print("Writing output to file")
# #Single Label Classification write to file
# with open(task_TC_output_file, "w") as fout:
#     for article_id, prediction, span_start, span_end in zip(dev_article_ids, flat_predictions_sl, dev_span_starts, dev_span_ends):
#         fout.write("%s\t%s\t%s\t%s\n" % (article_id, label_to_class_map[prediction], span_start, span_end))
# print("Predictions written to file " + task_TC_output_file)

In [None]:
# #Test the Accuracy of predictions
# flat_predictions_sl.shape, flat_true_labels.shape

# count = 0
# for i in range(len(flat_predictions_sl)):
#   if flat_predictions_sl[i] == flat_true_labels[i]:
#     count += 1

# print(count/len(flat_true_labels))

In [None]:
#Uncomment below cells to perform Multi-label classification

In [None]:
# # Multi Label Classification - All cases
# # Get all predictions over a certain cut-off
# model.eval()
# result = 0
# predictions = []
# sigmoid = torch.nn.Sigmoid()

# print("Running the model for multi-label classification")
# with torch.no_grad():
#     result = model(dev_dataset[0:len(dev_dataset)]['input_ids'].to(device),
#                     attention_mask=dev_dataset[0:len(dev_dataset)]['attention_mask'].to(device),
#                     labels=dev_dataset[0:len(dev_dataset)]['labels'].to(device))
    
#     logits = result[1]
#     logits = logits.detach().cpu().numpy()
#     predictions.append(logits)

# final_predictions = []
# predictions = np.concatenate(predictions, axis=0)

# print("Computing Predictions ")
# for prediction in predictions:
#     pred_probas = sigmoid(tensor(prediction))

#     pred_probas_cutoff = np.zeros(pred_probas.shape)
#     pred_probas_cutoff[np.where(pred_probas >= 0.95)] = 1

#     if sum(pred_probas_cutoff) == 0:
#         pred_probas_cutoff[np.argmax(pred_probas)] = 1
    
#     predicted_classes = [label_to_class_map[idx] for idx, label in enumerate(pred_probas_cutoff) if label == 1.0]
#     final_predictions.append(predicted_classes)

# print("Sample final_predictions:\n", final_predictions[0:7])

# print("Writing Output to File")
# with open(task_TC_output_file, "w") as fout:
#     for article_id, prediction_list, span_start, span_end in zip(dev_article_ids, final_predictions, dev_span_starts, dev_span_ends):
#         for prediction in prediction_list:
#             fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))

# print("Predictions written to file " + task_TC_output_file)

In [None]:
# Scorer code. File paths may vary. Download the written prediction file and follow the instructions in the README file to get the scores.

In [None]:
# %cd /content/drive/MyDrive/NLP/

# !python3 project_5_data/propaganda-techniques-scorer/task-TC_scorer.py -s roberta_TC_ML_iter2.txt -r project_5_data/propaganda-techniques-scorer/dev-task-flc-tc.labels -p project_5_data/propaganda-techniques-scorer/propaganda-techniques-names-semeval2020task11.txt