## Load kialo data from scratch 
#### (scroll down if want to use already processed kialo data)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [3]:
kialo_ds_path = '../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/kialo_data/'

In [4]:
train_kialo_df = pd.read_pickle(kialo_ds_path + '/kialo_train_df.pkl')
valid_kialo_df = pd.read_pickle(kialo_ds_path + '/kialo_valid_df.pkl')
test_kialo_df = pd.read_pickle(kialo_ds_path + '/kialo_test_df.pkl')

In [5]:
def create_df(df):
    
    df = df.groupby('conclusion_text').agg({
        'premises': lambda x: list(x)[0],
        'counter' : lambda x: list(x)
    }).reset_index()
    
    output_data = []

    for idx, row in df.iterrows():
        for premise in row['premises']:
            num_tokens = len(premise.split())
            if  num_tokens <= 200 and num_tokens > 3:
                output_data.append((row['conclusion_text'], premise, 0))

        for counter in row['counter']:
            num_tokens = len(counter.split())
            if  num_tokens <= 200 and num_tokens > 3:
                output_data.append((row['conclusion_text'], counter, 1))

    output_df = pd.DataFrame(output_data, columns=['claim1', 'claim2', 'label'])
    
    #Balancing the dataframe
    g = output_df.groupby('label')
    output_df = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))
    
    return output_df

In [6]:
train_df = create_df(train_kialo_df)
valid_df = create_df(valid_kialo_df)
test_df  = create_df(test_kialo_df)

In [7]:
train_df.label.value_counts()

1    47832
0    47832
Name: label, dtype: int64

In [8]:
valid_df.label.value_counts()

1    3858
0    3858
Name: label, dtype: int64

In [9]:
test_df.label.value_counts()

1    11227
0    11227
Name: label, dtype: int64

In [10]:
train_df.to_csv('../data/kialo_stance_classification_training_data.csv', index=False)
test_df.to_csv('../data/kialo_stance_classification_test_data.csv', index=False)
valid_df.to_csv('../data/kialo_stance_classification_valid_data.csv', index=False)

## Load already processed kialo data for tokenization and training for model

In [11]:
from datasets import Dataset
from transformers import TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification, TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import Trainer

In [12]:
train_df = pd.read_csv('../data/kialo_stance_classification_training_data.csv')
test_df  = pd.read_csv('../data/kialo_stance_classification_test_data.csv')
valid_df = pd.read_csv('../data/kialo_stance_classification_valid_data.csv')

### convert df into dataset

In [13]:
train_df['input_txt'] = train_df.apply(lambda x: x['claim1'] + ' </s> ' + x['claim2'], axis=1)
valid_df['input_txt'] = valid_df.apply(lambda x: x['claim1'] + ' </s> ' + x['claim2'], axis=1)

In [14]:
train_df.sample(10).head(n=5)

Unnamed: 0,claim1,claim2,label,input_txt
93467,Governments shouldn't subsidize ethanol.,"It is the nature of governments to ""subsidize"" or ""mandate"" preferences. They do it by establishing and enforcing laws They should promote those areas that are in the public best interest. Replacing fossil fuels which have historically benefited countries financially have come with devastating environmental and sociatal consequences. Government should therefore support and promote technologies that better address sociatal needs and minimize negative consequences.",1,"Governments shouldn't subsidize ethanol. </s> It is the nature of governments to ""subsidize"" or ""mandate"" preferences. They do it by establishing and enforcing laws They should promote those areas that are in the public best interest. Replacing fossil fuels which have historically benefited countries financially have come with devastating environmental and sociatal consequences. Government should therefore support and promote technologies that better address sociatal needs and minimize negative consequences."
57615,For economic reasons.,"Fossil Fuel cars have been a primary driver of economies world wide for nearly 100 years, driving innovation, employment, making transport cheaper and more efficient. It will take at least 20 years for EVs to gain significant market share to make such a claim.",1,"For economic reasons. </s> Fossil Fuel cars have been a primary driver of economies world wide for nearly 100 years, driving innovation, employment, making transport cheaper and more efficient. It will take at least 20 years for EVs to gain significant market share to make such a claim."
58404,Portraying gender equality in video games may make future games less violent which is beneficial.,There is no beneficial effect of making future games less violent. Games are already split up into adult games and children-friendly games. Showing violence is also a way to show the consequences of it. Games such as Heavy Rain are a massive success.,1,Portraying gender equality in video games may make future games less violent which is beneficial. </s> There is no beneficial effect of making future games less violent. Games are already split up into adult games and children-friendly games. Showing violence is also a way to show the consequences of it. Games such as Heavy Rain are a massive success.
69168,"The right to bodily autonomy gives individuals a right to consent to harm, or even death.","We regulate what people are allowed to do all of the time in the name of public safety or personal interest \(e.g. drinking age, medicine regulation\).",1,"The right to bodily autonomy gives individuals a right to consent to harm, or even death. </s> We regulate what people are allowed to do all of the time in the name of public safety or personal interest \(e.g. drinking age, medicine regulation\)."
32116,"The legalization of drugs means that they are taken, and disposed of, more safely and cautiously than in the current environment.",Dangerous substitute drugs will be contained.,0,"The legalization of drugs means that they are taken, and disposed of, more safely and cautiously than in the current environment. </s> Dangerous substitute drugs will be contained."


In [15]:
train_dataset = Dataset.from_pandas(train_df.sample(frac=1))
valid_dataset = Dataset.from_pandas(valid_df.sample(frac=1))

## Apply Roberta model

In [21]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    #precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = metric.compute(predictions=preds, references=labels)
    return {
        'accuracy': acc['accuracy'],
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
    }

In [17]:
# tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# model = RobertaForSequenceClassification.from_pretrained('roberta-large').cuda()
model = AutoModelForSequenceClassification.from_pretrained('roberta-base',num_labels=2).cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [18]:
tokenized_train = train_dataset.map(lambda a: tokenizer(a['input_txt'], padding='max_length', max_length=256, truncation=True),batched=True)
tokenized_valid = valid_dataset.map(lambda a: tokenizer(a['input_txt'], padding='max_length', max_length=256, truncation=True),batched=True)

  0%|          | 0/96 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [22]:
training_args = TrainingArguments('../data/output/stance_classification', 
                                  evaluation_strategy="epoch", 
                                  eval_steps=1000,
                                  save_steps=4000,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  save_total_limit=5,
                                  num_train_epochs=10 , 
                                  per_device_train_batch_size=8)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_train, eval_dataset=tokenized_valid, compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: input_txt, claim2, __index_level_0__, claim1.
***** Running training *****
  Num examples = 95664
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 59790


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6936,0.694453,0.5
2,0.6942,0.693455,0.5
3,0.6938,0.693465,0.5


Saving model checkpoint to ../data/output/stance_classification/checkpoint-4000
Configuration saved in ../data/output/stance_classification/checkpoint-4000/config.json
Model weights saved in ../data/output/stance_classification/checkpoint-4000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: input_txt, claim2, __index_level_0__, claim1.
***** Running Evaluation *****
  Num examples = 7716
  Batch size = 16
Saving model checkpoint to ../data/output/stance_classification/checkpoint-8000
Configuration saved in ../data/output/stance_classification/checkpoint-8000/config.json
Model weights saved in ../data/output/stance_classification/checkpoint-8000/pytorch_model.bin
Deleting older checkpoint [../data/output/stance_classification/checkpoint-4000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequ

KeyboardInterrupt: 