In [9]:
import pandas as pd

# Read the four input tables in one pass: stance rows and article bodies for
# the training split, then the same pair for the test split. The stance and
# body tables are joined on 'Body ID' further down.
train_stances, train_bodies, test_stances, test_bodies = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_stances_small.csv',
        'train_bodies_google_pegasus-cnn_dailymail_cleaned.csv',
        'competition_test_stances.csv',
        'test_bodies_google_pegasus-cnn_dailymail_cleaned.csv',
    )
)

In [13]:
from sklearn.model_selection import train_test_split

# Integer class id for each stance string; these ids are the 'labels' column.
mapping = {'agree': 0, 'disagree':1, 'discuss':2, 'unrelated':3}


def build_pair_frame(stances, bodies, label_map=mapping):
    """Join stance rows to their article bodies and return a sentence-pair frame.

    Parameters
    ----------
    stances : pandas.DataFrame
        Must contain the columns 'Headline', 'Body ID' and 'Stance'.
    bodies : pandas.DataFrame
        Must contain the columns 'Body ID' and 'articleBody'.
    label_map : dict
        Maps each stance string to its integer class id.

    Returns
    -------
    pandas.DataFrame
        Columns 'text_a' (headline), 'text_b' (article body) and 'labels'
        (int64 class id), one row per merged stance/body pair.

    Raises
    ------
    ValueError
        If any 'Stance' value is missing from ``label_map``. Without this check
        ``Series.map`` would silently produce NaN labels.
    """
    pairs = pd.merge(stances, bodies, on='Body ID')
    labels = pairs['Stance'].map(label_map)

    unmapped = pairs.loc[labels.isna(), 'Stance'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            "Stance values without a class id: {}".format(list(unmapped))
        )

    # Build a brand-new frame instead of assigning into a column subset, which
    # is what can trigger pandas' SettingWithCopyWarning.
    return pd.DataFrame({
        'text_a': pairs['Headline'],
        'text_b': pairs['articleBody'],
        'labels': labels.astype('int64'),
    })


# Extract training data
train_df = build_pair_frame(train_stances, train_bodies)

# Extract testing data
test_df = build_pair_frame(test_stances, test_bodies)
labels_test = test_df['labels'].to_numpy()

In [15]:
import os
# Turn off the tokenizers library's own parallelism before the model stack is
# imported (presumably to avoid fork-related warnings when several worker
# processes are used -- see 'process_count' below).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from simpletransformers.classification import ClassificationModel

# Single switch for the device. Mixed precision (fp16) is a CUDA feature, so it
# follows this flag instead of being requested unconditionally on a CPU run.
use_cuda = False

# Set model arguments
model_args = {
    'learning_rate':1e-5,
    'num_train_epochs': 5,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 4,
    'eval_batch_size': 4,
    # Headline + body pairs are truncated to this many tokens.
    'max_seq_length': 64,
    'fp16': use_cuda
}
model = ClassificationModel('roberta', 'roberta-base', use_cuda=use_cuda, num_labels=4, args=model_args)

# Train roberta on our training set
model.train_model(train_df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

  0%|          | 0/8340 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/2085 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/2085 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/2085 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/2085 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/2085 [00:00<?, ?it/s]

(10425, 0.43092233065938007)

In [16]:
import numpy as np
# eval_model returns three values; only the middle one (the raw per-class
# model outputs for every test row) is used here.
_eval_result, model_outputs_test, _eval_extra = model.eval_model(test_df)

# Predicted class id = index of the largest output in each row.
preds_test = np.asarray(model_outputs_test).argmax(axis=1)

  0%|          | 0/25413 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/6354 [00:00<?, ?it/s]

In [17]:
from sklearn.metrics import f1_score

def calculate_f1_scores(y_true, y_predicted,
                        class_names=('agree', 'disagree', 'discuss', 'unrelated')):
    """Format the macro and per-class F1 scores as a percentage report.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth class ids.
    y_predicted : array-like of int
        Predicted class ids.
    class_names : sequence of str
        Display name for each class; integer label ``i`` is reported under
        ``class_names[i]``.

    Returns
    -------
    str
        One line per score in the form ``"F1 <name>: <pct>% \\n"``, starting
        with the macro average and followed by each class in order.
    """
    labels = list(range(len(class_names)))

    # Use the same explicit label set for both calls, so the macro score is
    # always the average over exactly the classes listed below, even when a
    # class is missing from both y_true and y_predicted.
    f1_macro = f1_score(y_true, y_predicted, average='macro', labels=labels)
    f1_classwise = f1_score(y_true, y_predicted, average=None, labels=labels)

    report_rows = [('macro', f1_macro)]
    report_rows.extend(zip(class_names, f1_classwise))

    return ''.join(
        "F1 {}: {:.3f}% \n".format(row_name, score * 100)
        for row_name, score in report_rows
    )

# Ground truth goes first: the signature is (y_true, y_predicted). F1 is
# symmetric in its two inputs, so the reported numbers are unchanged; the
# order is corrected so the call matches the function's contract.
calculate_f1_scores(labels_test, preds_test)

'F1 macro: 68.955% \nF1 agree: 59.361% \nF1 disagree: 41.296% \nF1 discuss: 77.016% \nF1 unrelated: 98.148% \n'