In [None]:
# Install the transformers library, pinned to release 3.1.0, into the runtime.
!pip install transformers==3.1.0

In [17]:
import glob
import os.path
import codecs
import torch

import numpy as np
from google.colab import drive
from torch import cuda, tensor
from sklearn import preprocessing
from transformers import Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer


In [3]:
# Mount Google Drive under /content/drive; every data and output path below lives there.
# force_remount=True requests a fresh mount even if one is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Prefer the GPU when torch reports one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [5]:
# Google Drive locations of the inputs and the output file used by the cells below.
# Article folders (read with read_articles_from_file_list).
train_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/train-articles" 
dev_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-articles"     
# Tab-separated label files (read with read_predictions_from_file).
train_labels_file = "/content/drive/MyDrive/NLP/project_5_data/datasets/train-task-flc-tc.labels"
# Alternative template file, currently disabled:
# dev_template_labels_file = "../datasets/test-task-tc-template.out"
dev_template_labels_file = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-task-flc-tc.labels"
# File the technique-classification predictions are written to.
task_TC_output_file = "/content/drive/MyDrive/NLP/roberta_TC_final.txt"

In [6]:
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """Load every matching file of a folder into an ``{article_id: text}`` dict.

    Files are visited in sorted path order. The article id is the part of the
    base name before its first ".", with the first seven characters dropped
    (presumably an "article" prefix -- confirm against the dataset naming).

    Args:
        folder_name: Directory to search.
        file_pattern: Glob pattern applied inside ``folder_name``.

    Returns:
        dict mapping article id (str) to the file's full UTF-8 decoded text.
    """
    contents_by_id = {}
    matched_paths = glob.glob(os.path.join(folder_name, file_pattern))
    matched_paths.sort()
    for path in matched_paths:
        stem = os.path.basename(path).split(".")[0]
        # codecs.open does no newline translation, so character offsets into
        # the returned text line up with the raw file contents.
        with codecs.open(path, "r", encoding="utf8") as handle:
            contents_by_id[stem[7:]] = handle.read()
    return contents_by_id

In [7]:
def read_predictions_from_file(filename):
    """Parse a tab-separated label file into four parallel lists.

    Every non-blank row must hold exactly four tab-separated fields in the
    order ``article_id, label, span_start, span_end``. Blank or
    whitespace-only rows (such as a trailing empty line) are skipped. All
    values are returned as strings.

    Args:
        filename: Path of the label file.

    Returns:
        Tuple ``(articles_id, span_starts, span_ends, gold_labels)``; note
        that this order differs from the column order in the file.

    Raises:
        ValueError: If a non-blank row does not have exactly four fields.
    """
    articles_id, span_starts, span_ends, gold_labels = [], [], [], []
    # Explicit UTF-8 keeps this reader consistent with the article reader and
    # independent of the platform's default encoding.
    with open(filename, "r", encoding="utf8") as f:
        # Iterate lazily instead of materialising the whole file.
        for row in f:
            row = row.rstrip()
            if not row:
                # An empty row would otherwise fail the 4-way unpack below.
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels

In [8]:
def get_texts(articles_map, article_ids, span_starts, span_ends):
    """Cut each annotated span out of its article text.

    The three id/offset sequences are walked in parallel. Offsets are
    converted with ``int`` and used as a ``[start:end)`` slice of the article
    text. The original casing of the text is kept.

    Args:
        articles_map: Mapping of article id to article text.
        article_ids: Article id of every span.
        span_starts: Start offset of every span (int or numeric string).
        span_ends: End offset (exclusive) of every span (int or numeric string).

    Returns:
        List with one text fragment per span, in input order.
    """
    return [
        articles_map.get(doc_id)[int(begin):int(finish)]
        for doc_id, begin, finish in zip(article_ids, span_starts, span_ends)
    ]

In [9]:
# Label encoder; it is fitted on the training labels in a later cell.
le = preprocessing.LabelEncoder()

# Article id -> full article text, for the train and dev folders.
articles_map = read_articles_from_file_list(train_folder)
dev_articles_map = read_articles_from_file_list(dev_folder) 

In [None]:
# Sanity check: number of loaded train/dev articles and number of distinct ids in each map.
len(articles_map), len(dev_articles_map), len(set(articles_map.keys())), len(set(dev_articles_map.keys()))

In [None]:
# Read the train annotations (ids, span offsets, labels) and cut the
# annotated span texts out of the train articles.
train_articles_ids, train_span_starts, train_span_ends, train_labels = read_predictions_from_file(train_labels_file)
print("Loaded training %d annotations from %d train articles" % (len(train_span_starts), len(set(train_articles_ids))))
train_texts = get_texts(articles_map, train_articles_ids, train_span_starts, train_span_ends)

# Same for the development set; dev_labels holds the label column of the dev file.
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)
print("Loaded dev %d annotations from %d dev articles" % (len(dev_span_starts), len(set(dev_article_ids))))
dev_texts = get_texts(dev_articles_map, dev_article_ids, dev_span_starts, dev_span_ends)

In [12]:
# Number of extracted span texts in the train and dev sets.
len(train_texts), len(dev_texts)

(6128, 1063)

In [13]:
# Fit the label encoder on the training labels only.
le.fit(train_labels)

# Integer id the encoder assigns to each known class name, aligned with le.classes_.
_class_ids = le.transform(le.classes_)
# Lookup tables in both directions: id -> class name and class name -> id.
label_to_class_map = dict(zip(_class_ids, le.classes_))
class_to_label_map = dict(zip(le.classes_, _class_ids))

# Integer-encoded targets for both splits.
train_labels_encoded = le.transform(train_labels)
dev_labels_encoded = le.transform(dev_labels)

In [14]:
class TC_Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping of field name to an indexable collection with
    one entry per sample; ``labels`` is an indexable collection of the same
    length. Indexing returns a dict of tensors with every encoding field plus
    a ``'labels'`` entry. The index is forwarded as-is, so a slice yields
    batched tensors when the underlying collections support slicing.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Load the pretrained RoBERTa tokenizer and a sequence-classification head on
# top of the pretrained encoder.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Size the classification head from the fitted label encoder instead of a
# hard-coded constant, so it always matches the classes present in the
# training labels.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(le.classes_))
model.to(device)

# Optional: freeze the encoder and train only the classification head.
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [16]:
# Tokenize the span texts. truncation/max_length=512 cap the sequence length;
# padding=True pads the sequences of each call to a common length (per the
# tokenizer's padding option).
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=512)

# Pair the encodings with the integer-encoded labels.
train_dataset = TC_Dataset(train_encodings, train_labels_encoded)
dev_dataset = TC_Dataset(dev_encodings, dev_labels_encoded)

In [None]:
# Fine-tuning configuration. Note that checkpoints/outputs and logs are both
# directed to the same Drive folder.
training_args = TrainingArguments(
    output_dir= '/content/drive/MyDrive/NLP',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=16,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir= '/content/drive/MyDrive/NLP',            # directory for storing logs
    logging_steps=10,
)

# Train on the train spans; the dev spans serve as the evaluation set.
# NOTE(review): no compute_metrics callback is passed, so evaluate() presumably
# reports only loss-type metrics -- confirm against the Trainer documentation.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=dev_dataset)
trainer.train()

trainer.evaluate()

In [None]:
# Alternative (disabled): persist only the state dict and reload it into a fresh model.
# NOTE(review): the reload snippet below names BertForSequenceClassification, which the
# import cell of this notebook does not import (it imports RobertaForSequenceClassification),
# and calls it without arguments -- fix before enabling.
# torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/RoBERTa_Task_TC_sd.pt")
# model = BertForSequenceClassification()
# model.load_state_dict(torch.load("/content/drive/MyDrive/NLP/RoBERTa_Task_TC_sd.pt"))
# model.eval()

# Active path: serialize the whole model object to Drive.
torch.save(model, "/content/drive/MyDrive/NLP/RoBERTa_Task_TC.pt")
# To reload the saved model object on CPU (disabled):
# model = torch.load("/content/drive/MyDrive/NLP/RoBERTa_Task_TC.pt", map_location=torch.device('cpu'))
# model.eval()

In [18]:
def stats(dev_article_ids, dev_span_starts, dev_span_ends):
    """Count how often each (article id, span start, span end) triple occurs.

    The three sequences are walked in parallel. Every triple that occurs more
    than once is printed as ``(id, start, end, count)``.

    Args:
        dev_article_ids: Article id of every annotation row.
        dev_span_starts: Span start of every annotation row.
        dev_span_ends: Span end of every annotation row.

    Returns:
        Nested dict ``{article_id: {span_start: {span_end: count}}}``.
    """
    count_map = {}
    for article, begin, finish in zip(dev_article_ids, dev_span_starts, dev_span_ends):
        ends_for_start = count_map.setdefault(article, {}).setdefault(begin, {})
        ends_for_start[finish] = ends_for_start.get(finish, 0) + 1

    # Report the spans that carry more than one annotation.
    for article, by_start in count_map.items():
        for begin, by_end in by_start.items():
            for finish, occurrences in by_end.items():
                if occurrences > 1:
                    print((article, begin, finish, occurrences))
    return count_map

In [None]:
# Multi-Label Classification - ONLY DEV Dataset
# The TC scorer requires multi-label classification output in a certain format.
# The scorer also requires a fixed number of predictions, and writing more labels
# will throw an error. Each span therefore gets exactly as many labels as it has
# rows in the dev label file.

model.eval()
predictions = []
sigmoid = torch.nn.Sigmoid()

# {article_id: {span_start: {span_end: number of rows for that span}}}
multi_label_id_span_map = stats(dev_article_ids, dev_span_starts, dev_span_ends)
# Spans whose labels were already written, keyed by (article_id, start, end).
# A tuple key cannot collide the way a separator-less string concatenation can
# (e.g. "1"+"23"+"4" and "12"+"3"+"4" are the same string).
processed_article_ids = set()

print("\n\n")
print("Running Model for Classification")
# Run inference in mini-batches: one forward pass over the entire dev set needs
# activation memory proportional to the dataset size.
inference_batch_size = 16
with torch.no_grad():
    for batch_start in range(0, len(dev_dataset), inference_batch_size):
        # Slice once per batch; TC_Dataset builds fresh tensors on every access.
        batch = dev_dataset[batch_start:batch_start + inference_batch_size]
        result = model(batch['input_ids'].to(device),
                       attention_mask=batch['attention_mask'].to(device),
                       labels=batch['labels'].to(device))

        # Labels are passed, so index 0 is the loss and index 1 the logits.
        logits = result[1]
        predictions.append(logits.detach().cpu().numpy())

final_predictions = []
predictions = np.concatenate(predictions, axis=0)

print("Computing Predictions and Writing output to file")
with open(task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(dev_article_ids, predictions, dev_span_starts, dev_span_ends):
        span_key = (article_id, span_start, span_end)
        if span_key in processed_article_ids:
            # Repeated row of a multi-label span: all its labels were written
            # when the span was first seen.
            continue

        pred_probas = sigmoid(tensor(prediction))
        count = multi_label_id_span_map[article_id][span_start][span_end]

        if count > 1:
            # Span with several gold rows: emit its `count` best classes, best first.
            processed_article_ids.add(span_key)

            pred_labels = np.argsort(pred_probas.tolist())[-1:-(1+count):-1]
            predictions_for_article_id = [label_to_class_map[label.item()] for label in pred_labels]

            for pred_class in predictions_for_article_id:
                final_predictions.append(pred_class)
                fout.write("%s\t%s\t%s\t%s\n" % (article_id, pred_class, span_start, span_end))
        else:
            # Span with a single gold row: emit only the highest-scoring class.
            pred_label = np.argmax(pred_probas)
            pred_class = label_to_class_map[pred_label.item()]
            final_predictions.append(pred_class)
            fout.write("%s\t%s\t%s\t%s\n" % (article_id, pred_class, span_start, span_end))

print("Sample final_predictions:\n", final_predictions[0:5])
print(len(final_predictions))
print("Predictions written to file " + task_TC_output_file)

In [None]:
#Uncomment the cells below to perform single-label classification. Refer to the cell further below for true multi-label classification.

In [None]:
# #Single Label Classification
# Predict one class per dev span and report plain accuracy against the dev labels.

model.eval()
predictions, true_labels = [], []

# Run inference in mini-batches: one forward pass over the entire dev set needs
# activation memory proportional to the dataset size.
inference_batch_size = 16
with torch.no_grad():
    for batch_start in range(0, len(dev_dataset), inference_batch_size):
        # Slice once per batch; TC_Dataset builds fresh tensors on every access.
        batch = dev_dataset[batch_start:batch_start + inference_batch_size]
        result = model(batch['input_ids'].to(device),
                       attention_mask=batch['attention_mask'].to(device),
                       labels=batch['labels'].to(device))

        # Labels are passed, so index 0 is the loss and index 1 the logits.
        logits = result[1]

        # Store predictions and true labels
        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(batch['labels'].to('cpu').numpy())

# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the class index with the highest score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single array.
flat_true_labels = np.concatenate(true_labels, axis=0)

print(flat_predictions.shape, flat_true_labels.shape)

# Accuracy: number of exact matches divided by the number of samples.
count = int(np.sum(flat_predictions == flat_true_labels))

print(count/len(flat_true_labels))
label_to_class_map[flat_predictions[0]]

In [None]:
# #Single Label Classification
# # Write results to file
# with open("/content/drive/MyDrive/NLP/roberta_TC_final.txt", "w") as fout:
#     for article_id, prediction, span_start, span_end in zip(dev_article_ids, flat_predictions, dev_span_starts, dev_span_ends):
#         fout.write("%s\t%s\t%s\t%s\n" % (article_id, label_to_class_map[prediction], span_start, span_end))
# print("Predictions written to file " + "/content/drive/MyDrive/NLP/roberta_TC_final.txt")

In [None]:
#Uncomment the cells below to perform true multi-label classification. This may cause scoring issues if it is run on the dev dataset and the generated file is passed to the scorer.

In [22]:
# # Actual Multi-label Predictions
# # Multi Label Classification - not Dev data case
# # Get all predictions over a certain cut-off
# model.eval()
# result = 0
# predictions = []
# sigmoid = torch.nn.Sigmoid()

# print("Running the model for multi-label classification")
# with torch.no_grad():
#     result = model(dev_dataset[0:len(dev_dataset)]['input_ids'].to(device),
#                     attention_mask=dev_dataset[0:len(dev_dataset)]['attention_mask'].to(device),
#                     labels=dev_dataset[0:len(dev_dataset)]['labels'].to(device))
    
#     logits = result[1]
#     logits = logits.detach().cpu().numpy()
#     predictions.append(logits)

# final_predictions = []
# predictions = np.concatenate(predictions, axis=0)

# print("Computing Predictions ")
# for prediction in predictions:
#     pred_probas = sigmoid(tensor(prediction))

#     pred_probas_cutoff = np.zeros(pred_probas.shape)
#     pred_probas_cutoff[np.where(pred_probas >= 0.95)] = 1

#     if sum(pred_probas_cutoff) == 0:
#         pred_probas_cutoff[np.argmax(pred_probas)] = 1
    
#     predicted_classes = [label_to_class_map[idx] for idx, label in enumerate(pred_probas_cutoff) if label == 1.0]
#     final_predictions.append(predicted_classes)

# print("Sample final_predictions:\n", final_predictions[0:7])

# print("Writing Output to File")
# with open(task_TC_output_file, "w") as fout:
#     for article_id, prediction_list, span_start, span_end in zip(dev_article_ids, final_predictions, dev_span_starts, dev_span_ends):
#         for prediction in prediction_list:
#             fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))

# print("Predictions written to file " + task_TC_output_file)

Running the model for multi-label classification
Computing Predictions 
Sample final_predictions:
 [['Name_Calling,Labeling'], ['Name_Calling,Labeling']]
Writing Output to File
Predictions written to file /content/drive/MyDrive/NLP/results/roberta_TC_final.txt


In [None]:
#Scorer code; file paths may vary. Download the written prediction file and follow the instructions in the README file to get the scores.

In [4]:
# %cd /content/drive/MyDrive/NLP/

# !python3 project_5_data/propaganda-techniques-scorer/task-TC_scorer.py -s roberta_TC_ML_iter2.txt -r project_5_data/propaganda-techniques-scorer/dev-task-flc-tc.labels -p project_5_data/propaganda-techniques-scorer/propaganda-techniques-names-semeval2020task11.txt

/content/drive/MyDrive/NLP
2022-05-13 23:01:29,419 - INFO - Checking format: User Predictions -- Gold Annotations
2022-05-13 23:01:29,421 - INFO - OK: submission file format appears to be correct
2022-05-13 23:01:29,461 - INFO - Scoring submission
F1=0.614299
Precision=0.614299
Recall=0.614299
F1_Appeal_to_Authority=0.14285714285714285
F1_Appeal_to_fear-prejudice=0.3908045977011494
F1_Bandwagon,Reductio_ad_hitlerum=0.42857142857142855
F1_Black-and-White_Fallacy=0.23529411764705885
F1_Causal_Oversimplification=0.4878048780487805
F1_Doubt=0.5467625899280576
F1_Exaggeration,Minimisation=0.48366013071895425
F1_Flag-Waving=0.7471264367816093
F1_Loaded_Language=0.7785817655571636
F1_Name_Calling,Labeling=0.7002652519893899
F1_Repetition=0.411522633744856
F1_Slogans=0.5714285714285715
F1_Thought-terminating_Cliches=0.20000000000000004
F1_Whataboutism,Straw_Men,Red_Herring=0.13333333333333333

