In [0]:
# Install packages

!pip install torch
#install nvidia apex
!git clone https://github.com/NVIDIA/apex
!pip install --user --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" apex/
!pip install pytorch-pretrained-bert --user

In [0]:
# Download dataset

!wget https://propaganda.qcri.org/nlp4if-shared-task/data/datasets.tgz
!tar xfz datasets.tgz

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats
import gc
import re
import operator 
import sys
from sklearn import metrics
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score
%load_ext autoreload
%autoreload 2
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
from apex import amp
import shutil
import pandas as pd
import json,requests


from sklearn.metrics import f1_score

In [0]:
import glob
import os.path
import numpy as np
import random
import sys

from sklearn.linear_model import LogisticRegression

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam

# This is the Bert configuration file
from pytorch_pretrained_bert import BertConfig

In [0]:
# Device that batches and the validation/test models are moved to below.
# The notebook hard-codes CUDA, so a GPU runtime is required.
device=torch.device('cuda')

In [0]:
# Read Functions

def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Load every article matching <file_pattern> inside <folder_name>,
    visiting the files in sorted path order and splitting them into lines.

    The article id is the file's base name up to its first ".", with the
    first 7 characters removed.

    Returns five values:
      article_id_list  -- the article id of each line
      sentence_id_list -- the 1-based line number of each line, as a string
      sentence_list    -- each line with trailing whitespace stripped
      offsets          -- one [start, end] pair per line, measured on the
                          unstripped line; the counter is NOT reset between
                          files, so positions keep growing across articles
      article_lens     -- article id -> value of that running counter after
                          the article's last line

    The first two lists have the layout of the first two columns of the
    label/template files, so they can be matched against them.
    """
    paths = glob.glob(os.path.join(folder_name, file_pattern))
    paths.sort()

    article_id_list = []
    sentence_id_list = []
    sentence_list = []
    offsets = []
    article_lens = {}
    cursor = 0  # running position, shared by all articles
    for path in paths:
        stem = os.path.basename(path).split(".")[0]
        article_id = stem[7:]
        with open(path, "r", encoding="utf-8") as handle:
            rows = handle.readlines()
        for line_number, row in enumerate(rows, start=1):
            next_cursor = cursor + len(row)
            article_id_list.append(article_id)
            sentence_id_list.append(str(line_number))
            sentence_list.append(row.rstrip())
            offsets.append([cursor, next_cursor])
            cursor = next_cursor
        # For an empty file this simply repeats the previous running value.
        article_lens[article_id] = cursor
    return article_id_list, sentence_id_list, sentence_list, offsets, article_lens


def are_ids_aligned(article_id_list, sentence_id_list, 
                    reference_article_id_list, reference_sentence_id_list):
    """
    Tell whether the article/sentence ids match the reference ids position by position.

    The four lists are walked in parallel; the walk stops at the end of the
    shortest list, so differing lengths alone are not reported. The first
    mismatch is printed and False is returned; otherwise True.
    """
    rows = zip(article_id_list, reference_article_id_list,
               sentence_id_list, reference_sentence_id_list)
    for art, ref_art, sent, ref_sent in rows:
        problem = None
        if art != ref_art:
            problem = "ERROR: article ids do not match: article id = %s, reference article id = %s"%(art, ref_art)
        elif sent != ref_sent:
            problem = "ERROR: sentence ids do not match: article id:%s,%s sentence id:%s,%s" %(art, ref_art, sent, ref_sent)
        if problem is not None:
            print(problem)
            return False
    return True


def read_predictions_from_file(filename):
    """
    Parse a gold-label file or a template output file.

    Every row must hold exactly three tab-separated fields (article id,
    sentence id, label -- or ? in a template file); any other field count
    raises ValueError. Returns three parallel lists: article ids, sentence
    ids and labels, all as strings, in file order.
    """
    with open(filename, "r") as handle:
        records = [line.rstrip().split("\t") for line in handle]

    articles_id = []
    sentence_id_list = []
    gold_labels = []
    for article_id, sentence_id, gold_label in records:
        articles_id.append(article_id)
        sentence_id_list.append(sentence_id)
        gold_labels.append(gold_label)
    return articles_id, sentence_id_list, gold_labels
  
  
def read_flc_predictions_from_file(article_lens, flc_filename):
    """
    Read the fragment-level (FLC) label file of one article.

    Every row of <flc_filename> holds four tab-separated fields: article id,
    one or more comma-separated labels, begin offset and end offset. The
    labels are attached to every offset in [begin offset, end offset).

    <article_lens> maps an article id to the number of offsets to allocate
    for that article.

    Returns (article_id, article): <article> is a list of
    article_lens[article_id] sets, one per offset, each holding the labels
    covering that offset. <article_id> is the id found on the last row.
    For a file without rows the result is (None, []).
    """
    article_id = None
    article = []
    with open(flc_filename, "r") as f:
        for row in f:
            article_id, gold_labels, begin_offset, end_offset = row.rstrip().split("\t")
            if not article:
                # Allocate one label set per offset on the first row.
                article = [set() for _ in range(article_lens[article_id])]
            labels = gold_labels.split(',')
            # The offsets are read as text and must be converted before use in range().
            for offset in range(int(begin_offset), int(end_offset)):
                article[offset].update(labels)
    return article_id, article


def read_predictions_from_file_list(article_lens, folder_name, file_pattern, flc_file_pattern):
    """
    Read all sentence-level and fragment-level label files of a folder.

    <folder_name> is the folder hosting the files.
    <file_pattern> selects the sentence-level files
    (e.g. "*.task-SLC.labels" or "*.task-SLC-template.out").
    <flc_file_pattern> selects the fragment-level files.
    <article_lens> is forwarded to read_flc_predictions_from_file().

    Files are processed in sorted path order. Returns four values: the
    concatenated article ids, sentence ids and labels of the sentence-level
    files (see read_predictions_from_file()), plus a dict mapping each
    article id to the per-offset label sets returned by
    read_flc_predictions_from_file().
    """
    articles_id, sentence_id_list, gold_labels = [], [], []
    slc_paths = sorted(glob.glob(os.path.join(folder_name, file_pattern)))
    for slc_path in slc_paths:
        file_articles, file_sentences, file_labels = read_predictions_from_file(slc_path)
        articles_id.extend(file_articles)
        sentence_id_list.extend(file_sentences)
        gold_labels.extend(file_labels)

    flc_labeled_articles = {}
    flc_paths = sorted(glob.glob(os.path.join(folder_name, flc_file_pattern)))
    for flc_path in flc_paths:
        labeled_id, labeled_offsets = read_flc_predictions_from_file(article_lens, flc_path)
        flc_labeled_articles[labeled_id] = labeled_offsets
    return articles_id, sentence_id_list, gold_labels, flc_labeled_articles

In [0]:
# Split and evaluate

# Folder with the training articles (*.txt) and folder with their label files.
train_folder = "datasets/train-articles" 
train_labels_folder = "datasets/train-labels-SLC"
# Folder with the development articles.
dev_folder = "datasets/dev-articles"
# Folder read as the test articles in the next cell.
# NOTE(review): no visible cell creates a "test" folder -- confirm where it comes from.
test_folder = "test"

In [0]:
# Read the sentences of the three splits and the gold labels of the training split.
train_article_ids, train_sentence_ids, sentence_list, offsets, article_lens = read_articles_from_file_list(train_folder)
reference_articles_id, reference_sentence_id_list, gold_labels, flc_labeled_articles = read_predictions_from_file_list(
    article_lens, train_labels_folder, "*.task-SLC.labels", "*.task-FLC.labels")

dev_article_id_list, dev_sentence_id_list, dev_sentence_list, dev_offsets, dev_article_lens = read_articles_from_file_list(dev_folder)
# Binary target: 0 for 'non-propaganda', 1 for every other label.
gold_labels = [0 if label == 'non-propaganda' else 1 for label in gold_labels]
test_article_id_list, test_sentence_id_list, test_sentence_list, test_offsets, test_article_lens = read_articles_from_file_list(test_folder)

# One key per sentence, used to join texts and labels. A separator is required:
# plain concatenation maps different (article, sentence) pairs to the same key
# (e.g. "11" + "12" and "111" + "2"), which would mis-join or drop sentences.
ID_SEPARATOR = "_"
dev_ids = [article_id + ID_SEPARATOR + sentence_id
           for article_id, sentence_id in zip(dev_article_id_list, dev_sentence_id_list)]
test_ids = [article_id + ID_SEPARATOR + sentence_id
            for article_id, sentence_id in zip(test_article_id_list, test_sentence_id_list)]

ids = [article_id + ID_SEPARATOR + sentence_id
       for article_id, sentence_id in zip(train_article_ids, train_sentence_ids)]
refids = [article_id + ID_SEPARATOR + sentence_id
          for article_id, sentence_id in zip(reference_articles_id, reference_sentence_id_list)]

# Build each frame in one step; the columns exist even when a split is empty.
datadf = pd.DataFrame({'id': ids, 'text': sentence_list})
labeldf = pd.DataFrame({'id': refids, 'label': gold_labels})
dev_datadf = pd.DataFrame({'id': dev_ids, 'text': dev_sentence_list})
test_datadf = pd.DataFrame({'id': test_ids, 'text': test_sentence_list})


# Keep only the training sentences that have a gold label, one row per sentence.
datadf = pd.merge(datadf, labeldf, on='id', how='inner')
datadf = datadf.drop_duplicates('id')

# Prediction based on Perspective API scores



In [0]:
# The API key is taken from the PERSPECTIVE_API_KEY environment variable when it
# is set, so that a real key never has to be saved inside the notebook; the
# original placeholder remains the fallback value.
PERSPECTIVE_KEY = os.environ.get('PERSPECTIVE_API_KEY', 'YOUR_PERSPECTIVE_API_KEY_HERE')
# Attributes requested from the Perspective API for every sentence; their scores,
# in this order, form the feature vector of the classifier below.
ATTRIBUTES = ['TOXICITY', 'SEVERE_TOXICITY', 'IDENTITY_ATTACK', 'INSULT',
'PROFANITY', 'THREAT', 'SEXUALLY_EXPLICIT', 'FLIRTATION',
'INFLAMMATORY', 'OBSCENE', 'LIKELY_TO_REJECT', 'UNSUBSTANTIAL']

def call_perspective_api(text, attributes, PERSPECTIVE_KEY):
    """
    Score <text> with the Perspective API.

    <attributes> is the list of attribute names to request and
    <PERSPECTIVE_KEY> the API key. While the service answers HTTP 429 the
    request is retried after a growing pause (10s, 20s, ...).

    Returns a dict holding one entry per requested attribute: the summary
    score reported by the API, or -1 when the request failed (any status
    other than 200) or the answer did not include that attribute.
    """
    path = 'https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key=%s' % PERSPECTIVE_KEY
    request = {
        'comment' : {'text' : text},
        'requestedAttributes' : { c : {} for c in attributes},
        'doNotStore' : True,
    }

    backoff_counter = 1
    while True:
        response = requests.post(path, json=request)
        if response.status_code != 429:
            break
        time.sleep(10 * backoff_counter)
        backoff_counter += 1

    # Start from the failure marker so that callers can always index every
    # requested attribute, whatever the service returned.
    prob = {attr: -1 for attr in attributes}
    if response.status_code == 200:
        attribute_scores = json.loads(response.text)['attributeScores']
        for attr, attr_data in attribute_scores.items():
            prob[attr] = attr_data['summaryScore']['value']
    return prob

In [0]:
# One feature row per sentence: the Perspective scores in ATTRIBUTES order.
# Every sentence triggers one API request, so this cell is slow.
X_train = [
    [scores[name] for name in ATTRIBUTES]
    for scores in (call_perspective_api(text, ATTRIBUTES, PERSPECTIVE_KEY)
                   for text in sentence_list)
]

X_dev = [
    [scores[name] for name in ATTRIBUTES]
    for scores in (call_perspective_api(text, ATTRIBUTES, PERSPECTIVE_KEY)
                   for text in dev_sentence_list)
]

In [0]:
def train_main(train_folder, train_labels_folder, task_SLC_output_file,
               dev_labels_folder=None, dev_labels_file=None):
  """
  Sentence-length baseline for task SLC.

  Trains a logistic regression whose only feature is the length of the
  sentence on the articles of <train_folder> (gold labels in
  <train_labels_folder>), predicts the sentences of the global <dev_folder>
  and writes "article id<TAB>sentence id<TAB>label" rows to
  <task_SLC_output_file>.

  When <dev_labels_folder> is given, the dev gold labels are read from it and
  also written to <dev_labels_file> (which is then mandatory); otherwise the
  dev ids are read from <dev_labels_file> itself. The process exits when the
  ids of articles and labels are not aligned.
  """
  # loading articles' content from *.txt files in the train folder
  # (the reader returns five values; the offsets are not needed here)
  train_article_ids, train_sentence_ids, sentence_list, _, article_lens = read_articles_from_file_list(train_folder)

  # loading gold labels, articles ids and sentence ids from files *.task-SLC.labels in the train labels folder
  # (the reader takes four arguments and also returns the fragment-level labels, unused here)
  reference_articles_id, reference_sentence_id_list, gold_labels, _ = read_predictions_from_file_list(
      article_lens, train_labels_folder, "*.task-SLC.labels", "*.task-FLC.labels")

  # checking that the sentences of the raw training set and of the gold label files are aligned
  if not are_ids_aligned(train_article_ids, train_sentence_ids, reference_articles_id, reference_sentence_id_list):
    sys.exit("Exiting: training set article ids and gold labels are not aligned")
  print("Loaded %d sentences from %d articles" % (len(sentence_list), len(set(train_article_ids))))

  # compute one feature for each sentence: the length of the sentence and train the model
  train = np.array([ len(sentence) for sentence in sentence_list ]).reshape(-1, 1)
  model = LogisticRegression(penalty='l2', class_weight='balanced', solver="lbfgs")
  model.fit(train, gold_labels)

  # reading data from the development set (dev_folder is a notebook-level variable, not a parameter)
  dev_article_id_list, dev_sentence_id_list, dev_sentence_list, _, dev_article_lens = read_articles_from_file_list(dev_folder)
  if dev_labels_folder is not None:
    assert dev_labels_file is not None, "dev_labels_file is required when dev_labels_folder is given"
    reference_articles_id, reference_sentence_id_list, dev_labels, _ = read_predictions_from_file_list(
        dev_article_lens, dev_labels_folder, "*.task-SLC.labels", "*.task-FLC.labels")
    # writing dev gold labels to file
    with open(dev_labels_file, "w") as fout:
      for article_id, sentence_id, prediction in zip(reference_articles_id, reference_sentence_id_list, dev_labels):
        fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
  else: 
    reference_articles_id, reference_sentence_id_list, dev_labels = read_predictions_from_file(dev_labels_file)
  if not are_ids_aligned(dev_article_id_list, dev_sentence_id_list, reference_articles_id, reference_sentence_id_list):
    sys.exit("Exiting: development set article ids and gold labels are not aligned")
    
  # computing the predictions on the development set
  dev = np.array([ len(sentence) for sentence in dev_sentence_list ]).reshape(-1, 1)
  predictions = model.predict(dev)

  # writing predictions to file
  with open(task_SLC_output_file, "w") as fout:
    for article_id, sentence_id, prediction in zip(dev_article_id_list, dev_sentence_id_list, predictions):
       print("%s\t%s\t%s" % (article_id, sentence_id, prediction))
       fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
  print("Predictions written to file " + task_SLC_output_file)

In [0]:
# Class-balanced, L2-regularised logistic regression fitted on the Perspective
# attribute scores (X_train) against gold_labels.
# NOTE(review): assumes row i of X_train and gold_labels[i] describe the same
# sentence; nothing in this cell verifies that alignment.
model = LogisticRegression(penalty='l2', class_weight='balanced', solver="lbfgs")
model.fit(X_train, gold_labels)

In [0]:
predictions = model.predict(X_dev)

# Write one "article id<TAB>sentence id<TAB>label" row per development sentence.
dev_rows = zip(dev_article_id_list, dev_sentence_id_list, predictions)
with open('perspective_output', "w") as fout:
  fout.writelines(
      "%s\t%s\t%s\n" % (row_article, row_sentence, 'propaganda' if row_prediction > 0.5 else 'non-propaganda')
      for row_article, row_sentence, row_prediction in dev_rows)

# SLC task

## BERT

Modified from https://www.kaggle.com/yuval6967/toxic-bert-plain-vanila

### Load

In [0]:
!mkdir working
!wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
!unzip wwm_uncased_L-24_H-1024_A-16.zip 

In [0]:
BERT_MODEL_PATH = 'wwm_uncased_L-24_H-1024_A-16/'

# Sequence length = longest training sentence counted in space-separated words,
# capped at the 512 positions supported by the pre-trained BERT checkpoints
# (longer inputs cannot be embedded). Word-piece tokenisation usually yields
# more tokens than words, so convert_lines() may still truncate sentences.
BERT_MAX_POSITIONS = 512
MAX_SEQUENCE_LENGTH = min(BERT_MAX_POSITIONS,
                          max([len(sentence.split(' ')) for sentence in sentence_list]))
SEED = 1234
WORK_DIR = "working/"
num_to_load=int(len(datadf)/10 * 9)                       #Train size: nine tenths of the labelled sentences
valid_size= len(datadf) - num_to_load                          #Validation Size: the remaining sentences
EPOCHS = 1
# Which slice is held out for validation (see the split cell below):
# an int in 0..9, or 'all'.
fold = 6

In [0]:
# Translate model from tensorflow to pytorch:
# reads the checkpoint and its config from BERT_MODEL_PATH and writes
# pytorch_model.bin into WORK_DIR.
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
BERT_MODEL_PATH + 'bert_config.json',
WORK_DIR + 'pytorch_model.bin')
# Copy the config next to the converted weights; the training cell later loads
# the model from WORK_DIR.
shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')

In [0]:
# Model configuration used later to rebuild the network before loading the
# fine-tuned weights for validation and testing.
bert_config = BertConfig('wwm_uncased_L-24_H-1024_A-16/'+'bert_config.json')

In [0]:
# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(example, max_seq_length,tokenizer):
    """
    Encode every text of <example> as a fixed-length row of token ids.

    Each text is tokenised with <tokenizer>, cut to max_seq_length - 2
    tokens, wrapped in [CLS] ... [SEP] and right-padded with 0 so that
    every row has max_seq_length ids. The number of texts that had to be
    cut is printed. Returns the rows as a numpy array.
    """
    budget = max_seq_length - 2   # room left once [CLS] and [SEP] are added
    truncated = 0
    rows = []
    for text in tqdm(example):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > budget:
            pieces = pieces[:budget]
            truncated += 1
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        padding = [0] * (budget - len(pieces))
        rows.append(ids + padding)
    print(truncated)
    return np.array(rows)
# Tokenizer loaded from the downloaded model folder; input is lower-cased
# (do_lower_case=True).
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)


In [0]:
# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def get_unigrams_and_bigrams(example, max_seq_length,tokenizer):
    """
    Collect every distinct token and every distinct pair of adjacent tokens
    found in the texts of <example>, each encoded as a model input.

    Returns two dicts:
      unigrams -- token -> ids of "[CLS] token [SEP]"
      bigrams  -- (first token, second token) -> ids of "[CLS] first second [SEP]"
    Every id list is right-padded with 0 to max_seq_length entries. Pairs
    never span two texts.
    """
    max_seq_length -= 2   # positions left once [CLS] and [SEP] are added
    unigrams = {}
    bigrams = {}
    for text in tqdm(example):
        previous = None
        for token in tokenizer.tokenize(text):
            if token not in unigrams:
                unigrams[token] = tokenizer.convert_tokens_to_ids(["[CLS]"]+[token]+["[SEP]"])+[0] * (max_seq_length - 1)
            if previous is not None and (previous, token) not in bigrams:
                bigrams[(previous, token)] = tokenizer.convert_tokens_to_ids(["[CLS]"]+[previous, token]+["[SEP]"])+[0] * (max_seq_length - 2)
            previous = token
    return unigrams, bigrams

In [0]:
# train_df / dev_df are further names for the same frames (no copy is made),
# so column assignments made through them also show up in datadf / dev_datadf.
train_df = datadf
dev_df = dev_datadf
# Train, dev and test sentences stacked row-wise.
all_df = pd.concat([datadf, dev_datadf, test_datadf], axis=0)
print('loaded %d records' % len(train_df))

In [0]:
# Encode every distinct token and adjacent token pair of all three splits;
# they are scored by the fine-tuned model in the n-gram analysis further down.
unigrams, bigrams = get_unigrams_and_bigrams(all_df['text'].fillna('DUMMY_VALUE'),MAX_SEQUENCE_LENGTH,tokenizer)

In [0]:
# Make sure all comment_text values are strings
train_df['text'] = train_df['text'].astype(str) 
dev_df['text'] = dev_df['text'].astype(str)

# Random shuffle train list. The shuffle is seeded: the fold boundaries below
# are positions in this shuffled order, so every run (one per value of `fold`)
# must see the same permutation for the folds to partition the data.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [0]:
# Encode the training sentences as fixed-length id rows; row i of `sequences`
# corresponds to row i of train_df.
sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)
train_df=train_df.fillna(0)

# Same encoding for the development sentences.
dev_sequences = convert_lines(dev_df["text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)
dev_df=dev_df.fillna(0)

100%|██████████| 16956/16956 [00:06<00:00, 2779.29it/s]
 16%|█▌        | 347/2235 [00:00<00:00, 3464.44it/s]

9841


100%|██████████| 2235/2235 [00:00<00:00, 2993.52it/s]

1284





In [0]:
# The model is trained on the columns listed in y_columns; here a single
# column that copies the 'label' column.
train_df['target'] = train_df['label']
y_columns=['target']

In [0]:
# Hold one slice of the shuffled training data out for validation.
#   fold 0 / 'all' : the last valid_size rows are held out
#   fold 1..8      : the slice of valid_size rows ending valid_size*fold rows
#                    before the end is held out
#   fold 9         : everything before the last valid_size*9 rows is held out
# The development sentences are always used as X_test.
if fold == 0 or (fold == 'all'):
  X, X_val = sequences[:num_to_load], sequences[num_to_load:]
  y = train_df[y_columns].values[:num_to_load]
  y_val = train_df[y_columns].values[num_to_load:]
  X_test = dev_sequences
elif fold == 9:
  _val_stop = -valid_size*fold
  X, X_val = sequences[_val_stop:], sequences[:_val_stop]
  y = train_df[y_columns].values[_val_stop:]
  y_val = train_df[y_columns].values[:_val_stop]
  X_test = dev_sequences
elif type(fold) is int and (fold < 9):
  _val_start = num_to_load-valid_size*fold
  _val_stop = -valid_size*fold
  X = np.concatenate((sequences[:_val_start], sequences[_val_stop:]), axis=0)
  y = np.concatenate((train_df[y_columns].values[:_val_start],
                      train_df[y_columns].values[_val_stop:]),
                     axis=0)
  X_val = sequences[_val_start:_val_stop]
  y_val = train_df[y_columns].values[_val_start:_val_stop]
  X_test = dev_sequences

In [0]:
# The raw text is no longer needed once `sequences` has been built.
train_df = train_df.drop(['text'],axis=1)

In [0]:
# Training hyper-parameters.
batch_size = 32
lr=1e-5
# The optimizer steps once every `accumulation_steps` batches.
accumulation_steps=2
# Multiplier applied to the positive-class weight of the loss in the training loop.
alpha = 5
EPOCHS = 1

### Train

In [0]:
train_df=train_df.head(num_to_load)
# Token ids as long integers, targets as floats (the loss below is a binary
# cross-entropy on logits).
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long),
                                               torch.tensor(y,dtype=torch.float))



# The checkpoint name encodes every setting that distinguishes one run from another.
output_model_file = "bert_pytorch_{}_{}_{}_{}_{}_{}_{}.bin".format(lr, accumulation_steps,
                                                                  EPOCHS, SEED, alpha, fold,
                                                                  MAX_SEQUENCE_LENGTH)
# Seed numpy and torch (CPU and CUDA) and request deterministic cuDNN kernels.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load the converted pre-trained weights from WORK_DIR with one output per target column.
model = BertForSequenceClassification.from_pretrained(WORK_DIR,cache_dir=None,
                                                      num_labels=len(y_columns))
model.zero_grad()
model = model.to('cuda')
param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these strings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
train = train_dataset

# Number of optimizer steps over the whole training, given gradient accumulation.
num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

# apex mixed precision is initialised before the model is wrapped in DataParallel.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
model = torch.nn.DataParallel(model)

model=model.train()

<torch._C.Generator at 0x7ff44b068890>

In [0]:
# Sum of the training targets and number of training examples; the training
# loop derives the positive-class weight of the loss from them.
positives = float(sum(y))
total = float(len(y))

In [0]:
tq = tqdm(range(EPOCHS))
for ind, epoch in enumerate(tq):
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
    tk0 = tqdm(enumerate(train_loader),
                        total=len(train_loader),leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    # Running confusion-matrix counts over the epoch (threshold 0.5 on the sigmoid output).
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.

    pos_weight = [positives / (total - positives)*alpha]
    # The weight does not change inside the epoch: build the tensor once.
    pos_weight_tensor = torch.FloatTensor(pos_weight).to(device)
    for i,(x_batch, y_batch) in tk0:
        # Padding positions hold id 0, so the attention mask is simply x_batch > 0.
        y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)

        loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device), pos_weight=pos_weight_tensor)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.mean().backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()

        # Exponentially smoothed loss for the progress bar. The test is against
        # None so that a loss of exactly 0.0 does not reset the smoothing.
        if lossf is not None:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        avg_loss += loss.item() / len(train_loader)

        # Compute the thresholded predictions and targets once per batch.
        batch_probs = torch.sigmoid(y_pred[:,0])
        pred_pos = batch_probs > 0.5
        pred_neg = batch_probs < 0.5
        gold_pos = (y_batch[:,0]>0.5).to(device)
        gold_neg = (y_batch[:,0]<0.5).to(device)
        tp += torch.sum(pred_pos & gold_pos).item()
        fp += torch.sum(pred_pos & gold_neg).item()
        fn += torch.sum(pred_neg & gold_pos).item()
        tn += torch.sum(pred_neg & gold_neg).item()
        avg_accuracy += torch.mean((pred_pos == gold_pos).to(torch.float)).item()/len(train_loader)
        tk0.set_postfix(loss = lossf, tp=tp, fn=fn)
        
    # Guard the ratios: an epoch without any positive prediction (or without
    # any positive example) must not abort the run before the model is saved.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    print(precision, recall, avg_accuracy)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)


torch.save(model.state_dict(), output_model_file)

### Validation

In [0]:
# Run validation
# The following 3 lines are not strictly needed here, but show how to rebuild
# the network and load the saved weights for prediction. The model is wrapped
# in DataParallel before loading because the weights were saved from a
# DataParallel model.
valid_model = BertForSequenceClassification(bert_config,num_labels=len(y_columns))
valid_model = torch.nn.DataParallel(valid_model)
valid_model.load_state_dict(torch.load(output_model_file ))

valid_model.to(device)
# Inference only: freeze all parameters and switch to evaluation mode.
for param in valid_model.parameters():
    param.requires_grad=False
valid_model.eval()

In [0]:
# One probability per validation sentence, filled batch by batch.
valid_preds = np.zeros((len(X_val)))
valid = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long))
# shuffle=False keeps batch i aligned with rows i*batch_size .. (i+1)*batch_size of X_val.
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

tk0 = tqdm_notebook(valid_loader)
    
for i,(x_batch,)  in enumerate(tk0):
    pred = valid_model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
    pred = torch.sigmoid(pred)
    valid_preds[i*batch_size:(i+1)*batch_size]=pred[:,0].detach().cpu().squeeze().numpy()


In [0]:

# Confusion-matrix counts on the validation slice, thresholding both the
# predicted probabilities and the gold targets at 0.5.
tp = np.sum((valid_preds>0.5) & (y_val.flatten()>0.5))
fp = np.sum((valid_preds>0.5) & (y_val.flatten()<0.5))
fn = np.sum((valid_preds<0.5) & (y_val.flatten()>0.5))
tn = np.sum((valid_preds<0.5) & (y_val.flatten()<0.5))

precision = tp / (tp + fp)
recall = tp / (tp + fn)
# Harmonic mean of precision and recall.
f1 = 2 * (recall * precision) / (recall + precision)
print(precision, recall, f1)

0.48015122873345933 0.5427350427350427 0.5095285857572718


### Get result on testset

In [0]:
# Settings of the runs whose checkpoints are loaded below. They repeat the
# values assigned in the training section because they are part of the
# checkpoint file names.
lr=1e-5
accumulation_steps=2
alpha = 5
EPOCHS = 1
SEED = 1234
batch_size=32
bert_config = BertConfig('wwm_uncased_L-24_H-1024_A-16/'+'bert_config.json')

In [0]:
# Average the predictions of the per-fold models on X_test (the development sentences).
test_preds = np.zeros((len(X_test)))
folds = 10

# The inputs are the same for every fold: build the loader once.
test = torch.utils.data.TensorDataset(torch.tensor(X_test,dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

for fold in range(folds):
  # Must match the name used when the checkpoint was saved, which also
  # includes MAX_SEQUENCE_LENGTH.
  output_model_file = "bert_pytorch_{}_{}_{}_{}_{}_{}_{}.bin".format(lr, accumulation_steps,
                                                                   EPOCHS, SEED, alpha, fold,
                                                                   MAX_SEQUENCE_LENGTH)
  # Rebuild the network and load the weights of this fold.
  valid_model = BertForSequenceClassification(bert_config,num_labels=len(y_columns))
  valid_model = torch.nn.DataParallel(valid_model)
  valid_model.load_state_dict(torch.load(output_model_file ))

  valid_model.to(device)
  for param in valid_model.parameters():
    param.requires_grad=False
  valid_model.eval()

  tk0 = tqdm_notebook(test_loader)
    
  for i,(x_batch,)  in enumerate(tk0):
    pred = valid_model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
    pred = torch.sigmoid(pred)
    # Each fold contributes 1/folds of the final probability.
    test_preds[i*batch_size:(i+1)*batch_size]+=pred[:,0].detach().cpu().squeeze().numpy() / folds

# `task_SLC_output_file` is only assigned by the baseline cells further down;
# use it when it already exists, otherwise fall back to a dedicated file name
# so that the cell also works on a fresh kernel.
try:
  task_SLC_output_file
except NameError:
  task_SLC_output_file = "bert-output-SLC.txt"

output_labels = ['non-propaganda', 'propaganda']
with open(task_SLC_output_file, "w") as fout:
  for article_id, sentence_id, prediction in zip(dev_article_id_list, dev_sentence_id_list, test_preds):
    fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, output_labels[int(prediction > 0.5)]))

In [0]:
def run_model(X_test, num_labels=1):
  """
  Score <X_test> with the ensemble of per-fold models.

  <X_test> is a sequence of equally long token-id rows and <num_labels> the
  number of outputs of the saved models. The checkpoints of folds 0..9 are
  located through the notebook-level settings (lr, accumulation_steps,
  EPOCHS, SEED, alpha, MAX_SEQUENCE_LENGTH) and loaded one after the other.

  Returns a numpy array with one value per row: the sigmoid of the first
  output, averaged over the folds.
  """
  test_preds = np.zeros((len(X_test)))
  folds = 10

  # The inputs are the same for every fold: build the loader once.
  test = torch.utils.data.TensorDataset(torch.tensor(X_test,dtype=torch.long))
  test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

  for fold in range(folds):
    # Must match the name used when the checkpoint was saved, which also
    # includes MAX_SEQUENCE_LENGTH.
    output_model_file = "bert_pytorch_{}_{}_{}_{}_{}_{}_{}.bin".format(lr, accumulation_steps,
                                                                     EPOCHS, SEED, alpha, fold,
                                                                     MAX_SEQUENCE_LENGTH)
    # load model
    valid_model = BertForSequenceClassification(bert_config,num_labels=num_labels)
    valid_model = torch.nn.DataParallel(valid_model)
    valid_model.load_state_dict(torch.load(output_model_file ))

    valid_model.to(device)
    for param in valid_model.parameters():
      param.requires_grad=False
    valid_model.eval()

    tk0 = tqdm_notebook(test_loader)
    
    for i,(x_batch,)  in enumerate(tk0):
      pred = valid_model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
      pred = torch.sigmoid(pred)
      # Each fold contributes 1/folds of the final probability.
      test_preds[i*batch_size:(i+1)*batch_size]+=pred[:,0].detach().cpu().squeeze().numpy() / folds
  return test_preds

### Unigram and Bigram Analysis

In [0]:
def unigrams_and_bigrams(tokens, num_labels=1):
  """
  Score every n-gram of <tokens> with the fold ensemble.

  <tokens> maps an n-gram (a token or a pair of tokens) to its encoded id
  row. All rows are passed to run_model() in the dict's iteration order.
  Returns a list of (score, n-gram) tuples in that same order.
  """
  ngrams = list(tokens.keys())
  encoded = [tokens[ngram] for ngram in ngrams]

  scores = run_model(encoded, num_labels)

  return [(score, ngrams[position]) for position, score in enumerate(scores)]

In [0]:
# `test_seqs` is not produced by any earlier cell: encode the test sentences
# the same way as the train/dev sentences before scoring them.
test_seqs = convert_lines(test_datadf["text"].astype(str), MAX_SEQUENCE_LENGTH, tokenizer)
test_scores = run_model(test_seqs)

In [0]:
# Score every distinct token and token pair with the fold ensemble; each
# result is a list of (score, n-gram) tuples.
unigram_scores = unigrams_and_bigrams(unigrams)
bigram_scores = unigrams_and_bigrams(bigrams)

In [0]:
# Ascending by score (the first tuple element), so the highest-scoring
# n-grams end up at the end of each list.
unigram_scores = sorted(unigram_scores)
bigram_scores = sorted(bigram_scores)

In [0]:
','.join([u[1] for u in unigram_scores[-20:]])

'devastating,cruel,vile,irrational,absurd,brutal,vicious,stupid,coward,awful,ignorant,unbelievable,doomed,idiot,terrifying,disgusting,horrible,hideous,horrific,pathetic'

In [0]:
','.join([' '.join(u[1]) for u in bigram_scores[-20:]])

'shame ##less,totally insane,a horrible,utterly unacceptable,hysterical nonsense,the horrible,this horrific,absolutely disgusting,monumental stupidity,a pathetic,a disgusting,absolutely worthless,truly disgusting,utterly insane,this murderous,incredibly stupid,monstrous fraud,this lunatic,a disgrace,a hideous'

## Baseline




Baseline for Task SLC

Our baseline uses a logistic regression classifier on one feature only: the length of the sentence.

Requirements: sklearn, numpy


In [0]:
def train_main(train_folder, train_labels_folder, task_SLC_output_file,
               dev_labels_folder=None, dev_labels_file=None):
  """
  Sentence-length baseline for task SLC.

  Trains a logistic regression whose only feature is the length of the
  sentence on the articles of <train_folder> (gold labels in
  <train_labels_folder>), predicts the sentences of the global <dev_folder>
  and writes "article id<TAB>sentence id<TAB>label" rows to
  <task_SLC_output_file>.

  When <dev_labels_folder> is given, the dev gold labels are read from it and
  also written to <dev_labels_file> (which is then mandatory); otherwise the
  dev ids are read from <dev_labels_file> itself. The process exits when the
  ids of articles and labels are not aligned.
  """
  # loading articles' content from *.txt files in the train folder
  # (the reader returns five values; the offsets are not needed here)
  train_article_ids, train_sentence_ids, sentence_list, _, article_lens = read_articles_from_file_list(train_folder)

  # loading gold labels, articles ids and sentence ids from files *.task-SLC.labels in the train labels folder
  # (the reader takes four arguments and also returns the fragment-level labels, unused here)
  reference_articles_id, reference_sentence_id_list, gold_labels, _ = read_predictions_from_file_list(
      article_lens, train_labels_folder, "*.task-SLC.labels", "*.task-FLC.labels")

  # checking that the sentences of the raw training set and of the gold label files are aligned
  if not are_ids_aligned(train_article_ids, train_sentence_ids, reference_articles_id, reference_sentence_id_list):
    sys.exit("Exiting: training set article ids and gold labels are not aligned")
  print("Loaded %d sentences from %d articles" % (len(sentence_list), len(set(train_article_ids))))

  # compute one feature for each sentence: the length of the sentence and train the model
  train = np.array([ len(sentence) for sentence in sentence_list ]).reshape(-1, 1)
  model = LogisticRegression(penalty='l2', class_weight='balanced', solver="lbfgs")
  model.fit(train, gold_labels)

  # reading data from the development set (dev_folder is a notebook-level variable, not a parameter)
  dev_article_id_list, dev_sentence_id_list, dev_sentence_list, _, dev_article_lens = read_articles_from_file_list(dev_folder)
  if dev_labels_folder is not None:
    assert dev_labels_file is not None, "dev_labels_file is required when dev_labels_folder is given"
    reference_articles_id, reference_sentence_id_list, dev_labels, _ = read_predictions_from_file_list(
        dev_article_lens, dev_labels_folder, "*.task-SLC.labels", "*.task-FLC.labels")
    # writing dev gold labels to file
    with open(dev_labels_file, "w") as fout:
      for article_id, sentence_id, prediction in zip(reference_articles_id, reference_sentence_id_list, dev_labels):
        fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
  else: 
    reference_articles_id, reference_sentence_id_list, dev_labels = read_predictions_from_file(dev_labels_file)
  if not are_ids_aligned(dev_article_id_list, dev_sentence_id_list, reference_articles_id, reference_sentence_id_list):
    sys.exit("Exiting: development set article ids and gold labels are not aligned")
    
  # computing the predictions on the development set
  dev = np.array([ len(sentence) for sentence in dev_sentence_list ]).reshape(-1, 1)
  predictions = model.predict(dev)

  # writing predictions to file
  with open(task_SLC_output_file, "w") as fout:
    for article_id, sentence_id, prediction in zip(dev_article_id_list, dev_sentence_id_list, predictions):
       print("%s\t%s\t%s" % (article_id, sentence_id, prediction))
       fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
  print("Predictions written to file " + task_SLC_output_file)

In [0]:
# Split and evaluate

# Evaluate the baseline on a held-out part of the training data.
# These assignments replace the notebook-level folder variables set earlier;
# train_main() reads dev_folder as a global.
train_folder = "datasets/train-train-articles" 
dev_folder = "datasets/train-dev-articles"
train_labels_folder = "datasets/train-train-labels-SLC"
task_SLC_output_file = "baseline-output-SLC-eval.txt"

# The dev gold labels are read from the folder and written to "dev-labels-SLC"
# so that the scorer can compare them with the predictions.
train_main(train_folder, train_labels_folder, task_SLC_output_file,
               dev_labels_folder="datasets/train-dev-labels-SLC/", dev_labels_file="dev-labels-SLC")
%run tools/task-SLC_scorer -s baseline-output-SLC-eval.txt -r dev-labels-SLC

In [0]:
# Generate Answer on development set

# Train on the full training set and predict the development set.
# These assignments replace the notebook-level folder variables again.
train_folder = "datasets/train-articles" 
dev_folder = "datasets/dev-articles"
train_labels_folder = "datasets/train-labels-SLC"
task_SLC_output_file = "baseline-output-SLC.txt"

# No gold labels here: the template file only provides the article and
# sentence ids to check the alignment against.
train_main(train_folder, train_labels_folder, task_SLC_output_file,
               dev_labels_file="datasets/dev.template-output-SLC.out")

# FLC task

In [0]:
# Random baseline for task FLC: for every development article, write randomly
# placed fragments labelled with randomly chosen techniques.
dev_folder = "datasets/dev-articles"
propaganda_techniques_file = "tools/data/propaganda-techniques-names.txt"
task_FLC_output_file = "baseline-output-FLC.txt"

random.seed(10) # to make runs deterministic

# loading articles' content from *.txt files in the dev folder
# (sorted: the order returned by glob is arbitrary, and the seeded random
# sequence only gives identical output when the articles come in a fixed order)
file_list = sorted(glob.glob(os.path.join(dev_folder, "*.txt")))
articles_content, articles_id = ([], [])
for filename in file_list:
    with open(filename, "r", encoding="utf-8") as f:
        articles_content.append(f.read())
        articles_id.append(os.path.basename(filename).split(".")[0][7:])

with open(propaganda_techniques_file, "r") as f:
    propaganda_techniques_names = [ line.rstrip() for line in f.readlines() ]

with open(task_FLC_output_file, "w") as fout:
    for article_content, article_id in zip(articles_content, articles_id):
        start_fragment, end_fragment, article_length = (0, 0, len(article_content))
        # (technique, set of covered offsets) of every fragment written so far
        current_article_annotations = []
        while end_fragment < article_length:
            if end_fragment > 0:
                technique_name = propaganda_techniques_names[random.randint(0, len(propaganda_techniques_names)-1)]
                # check that there is no other annotation for the same article and technique that overlaps
                span_annotation = set(range(start_fragment, end_fragment))
                intersection_length = sum( [ len(span_annotation.intersection(previous_fragment))
                         for previous_technique, previous_fragment in current_article_annotations 
                         if previous_technique==technique_name ])
                # Keep the fragment only when it does NOT overlap an earlier
                # fragment of the same technique (always true for the first one).
                if intersection_length == 0:
                    fout.write("%s\t%s\t%s\t%s\n" % (article_id, technique_name, start_fragment, end_fragment))
                    current_article_annotations.append((technique_name, span_annotation))
            # Draw the next candidate: move the start forward by a random amount
            # and pick a length between 1 and 25, clipped to the article end.
            start_fragment += random.randint(0, max(1, article_length-start_fragment))
            end_fragment = min(start_fragment + random.randint(1,25), article_length)
        print("article %s: added %d fragments" % (article_id, len(current_article_annotations)))    

print("Predictions written to file " + task_FLC_output_file)

In [0]:
%run tools/task-FLC_scorer -s baseline-output-FLC.txt -r tools/data/FLC-sample-labels -t tools/data/propaganda-techniques-names.txt