In [8]:
!python3

Python 3.6.9 (default, Jan 26 2021, 15:33:00) 
[GCC 8.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> 
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
KeyboardInterrupt
>>> 

## Load Kialo data from scratch
#### (scroll down if you want to use the already processed Kialo data)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.options.display.max_colwidth = None

In [5]:
import os

# Directory holding the pickled Kialo splits. Defaults to the original cluster
# location; set the KIALO_DS_PATH environment variable to point somewhere else.
kialo_ds_path = os.environ.get(
    'KIALO_DS_PATH',
    '/mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/multi-taks-counter-argument-generation/kialo_data/',
)

In [6]:
import os

# NOTE: pd.read_pickle unpickles arbitrary Python objects; only load files that
# come from a trusted source.
# os.path.join avoids the doubled '/' that plain concatenation produces when
# kialo_ds_path already ends with a separator.
train_kialo_df = pd.read_pickle(os.path.join(kialo_ds_path, 'kialo_train_df.pkl'))
valid_kialo_df = pd.read_pickle(os.path.join(kialo_ds_path, 'kialo_valid_df.pkl'))
test_kialo_df = pd.read_pickle(os.path.join(kialo_ds_path, 'kialo_test_df.pkl'))

In [7]:
def create_df(df, min_tokens=4, max_tokens=200, random_state=None):
    """Build a class-balanced stance-classification frame from raw Kialo rows.

    Rows are grouped by ``conclusion_text``. For each conclusion, every premise
    (taken from the first ``premises`` value of the group) yields a pair with
    label 0 and every ``counter`` yields a pair with label 1. A text is kept
    only if its whitespace-token count lies in ``[min_tokens, max_tokens]``.
    The larger label group is then randomly down-sampled to the size of the
    smaller one.

    Args:
        df: DataFrame with the columns ``conclusion_text``, ``premises``
            (an iterable of strings per row) and ``counter`` (a string per row).
        min_tokens: Smallest accepted token count (inclusive). The default of 4
            matches the previous ``> 3`` rule.
        max_tokens: Largest accepted token count (inclusive).
        random_state: Seed forwarded to ``DataFrame.sample`` so the balancing
            step can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        DataFrame with the columns ``claim1``, ``claim2`` and ``label``, indexed
        by (label, position within the label group). If no text passes the
        length filter, the empty frame with those three columns is returned.
    """
    grouped = df.groupby('conclusion_text').agg({
        'premises': lambda x: list(x)[0],
        'counter': lambda x: list(x),
    }).reset_index()

    def _has_valid_length(text):
        # Token count is a plain whitespace split, not a model tokenizer.
        return min_tokens <= len(text.split()) <= max_tokens

    output_data = []
    for _, row in grouped.iterrows():
        conclusion = row['conclusion_text']
        output_data.extend(
            (conclusion, premise, 0)
            for premise in row['premises'] if _has_valid_length(premise)
        )
        output_data.extend(
            (conclusion, counter, 1)
            for counter in row['counter'] if _has_valid_length(counter)
        )

    output_df = pd.DataFrame(output_data, columns=['claim1', 'claim2', 'label'])
    if output_df.empty:
        # Nothing to balance; sampling from an empty frame would fail.
        return output_df

    # Balance the labels. Sampling each group explicitly (instead of
    # groupby.apply) keeps the 'label' column in the result regardless of how
    # the installed pandas treats grouping columns inside apply.
    by_label = output_df.groupby('label')
    n_per_label = int(by_label.size().min())
    balanced = {
        label: rows.sample(n_per_label, random_state=random_state).reset_index(drop=True)
        for label, rows in by_label
    }
    return pd.concat(balanced, names=['label'])

In [8]:
# Build the balanced stance-classification frame for each split, in the order
# train, validation, test.
train_df, valid_df, test_df = [
    create_df(raw_split)
    for raw_split in (train_kialo_df, valid_kialo_df, test_kialo_df)
]

In [9]:
# Class distribution of the training split.
train_df['label'].value_counts()

1    47832
0    47832
Name: label, dtype: int64

In [10]:
# Class distribution of the validation split.
valid_df['label'].value_counts()

1    3858
0    3858
Name: label, dtype: int64

In [11]:
# Class distribution of the test split.
test_df['label'].value_counts()

1    11227
0    11227
Name: label, dtype: int64

In [12]:
# Write each split to CSV without the index column.
for split_frame, csv_path in (
    (train_df, '../data/kialo_stance_classification_training_data.csv'),
    (test_df, '../data/kialo_stance_classification_test_data.csv'),
    (valid_df, '../data/kialo_stance_classification_valid_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

## Load the already processed Kialo data for tokenization and model training

In [1]:
from datasets import Dataset
from transformers import TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification, TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import Trainer

In [2]:
import pandas as pd
# Read the three pre-processed splits from their CSV files.
train_df, test_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/kialo_stance_classification_training_data.csv',
        '../data/kialo_stance_classification_test_data.csv',
        '../data/kialo_stance_classification_valid_data.csv',
    )
)

### Convert the DataFrames into datasets

In [3]:
# Join both claims into a single input string, separated by ' </s> '.
# Column-wise string concatenation replaces the slower row-by-row
# DataFrame.apply(axis=1); a missing claim now yields a missing input_txt
# value instead of raising inside the lambda.
train_df['input_txt'] = train_df['claim1'] + ' </s> ' + train_df['claim2']
valid_df['input_txt'] = valid_df['claim1'] + ' </s> ' + valid_df['claim2']


In [4]:
# Preview five random training rows. Drawing ten and keeping the first five
# selects the same kind of random subset, so sample five directly.
train_df.sample(n=5)

Unnamed: 0,claim1,claim2,label,input_txt
60236,"From an Atheist perspective, there is no god a...",This is only true is atheism is presumed to be...,1,"From an Atheist perspective, there is no god a..."
54604,The existence of suffering \(natural evil is i...,The best possible world may still logically re...,1,The existence of suffering \(natural evil is i...
53594,Private schools preserve traditions that are a...,Children should not be forced to adopt traditi...,1,Private schools preserve traditions that are a...
4505,Men will never accept a robot as a substitute ...,Sex robots require a level of paraphilia \(Par...,0,Men will never accept a robot as a substitute ...
20963,Judaism,Jews have survived multiple attacks throughout...,0,Judaism </s> Jews have survived multiple attac...


In [5]:
# Shuffle the rows before converting to Dataset objects. The shuffle is seeded
# so that the row order, and therefore the training run, is the same on every
# execution of the notebook.
train_dataset = Dataset.from_pandas(train_df.sample(frac=1, random_state=42))
valid_dataset = Dataset.from_pandas(valid_df.sample(frac=1, random_state=42))

## Fine-tune the RoBERTa model

In [6]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(pred):
    """Return the accuracy of the argmax predictions for one evaluation pass.

    ``pred`` must expose ``predictions`` (per-class scores, reduced with
    ``argmax`` over the last axis) and ``label_ids`` (the reference labels).
    The result is a dict with the single key ``'accuracy'``.
    """
    predicted_labels = pred.predictions.argmax(-1)
    result = metric.compute(predictions=predicted_labels, references=pred.label_ids)
    return {'accuracy': result['accuracy']}

In [7]:
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 'roberta-base' with a two-class classification head. Swap the checkpoint name
# in both calls (e.g. 'roberta-large') to try a different model.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [8]:
def _tokenize_batch(batch):
    """Tokenize a batch of ``input_txt`` strings, padded and truncated to 256 tokens."""
    return tokenizer(batch['input_txt'], padding='max_length', max_length=256, truncation=True)


tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_valid = valid_dataset.map(_tokenize_batch, batched=True)


  0%|          | 0/96 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [9]:
# Evaluation and logging share one interval, so every evaluation row in the
# progress table also carries a training-loss value.
EVAL_EVERY_STEPS = 200

args = TrainingArguments(
    # Directory where the model checkpoints are written.
    output_dir='../data/output/stance_classification',
    # Evaluate every `eval_steps` optimisation steps rather than once per epoch.
    evaluation_strategy="steps",
    eval_steps=EVAL_EVERY_STEPS,
    # Log the training loss at the same cadence as the evaluations.
    logging_strategy="steps",
    logging_steps=EVAL_EVERY_STEPS,
    # Checkpoint on every third evaluation (every 600 steps). With
    # load_best_model_at_end=True the save interval must be a whole multiple
    # of the evaluation interval, which this expression guarantees.
    save_strategy="steps",
    save_steps=3 * EVAL_EVERY_STEPS,
    # Initial learning rate for the optimizer.
    learning_rate=2e-5,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # A single pass over the training data.
    num_train_epochs=1,
    # After training, reload the checkpoint with the best evaluation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    # The already instantiated model is fine-tuned directly; no model_init
    # factory is used, so re-running this cell continues from the current weights.
    model=model,
    args=args,
    train_dataset=tokenized_train,
    # The full validation set is evaluated, shuffled with the fixed seed 42.
    eval_dataset=tokenized_valid.shuffle(42),
    # The inputs are already padded to a fixed length; passing the tokenizer
    # also makes the Trainer save it next to each checkpoint.
    tokenizer=tokenizer,
    # Called after each evaluation with the collected predictions and labels.
    compute_metrics=compute_metrics
)

In [10]:
# Run fine-tuning with the configuration above; the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed as the cell output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: input_txt, claim2, __index_level_0__, claim1.
***** Running training *****
  Num examples = 95664
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1495


Step,Training Loss,Validation Loss,Accuracy
200,0.5795,0.501981,0.75985
400,0.4881,0.52386,0.75985
600,0.4718,0.48484,0.773458
800,0.4555,0.462039,0.780327
1000,0.4387,0.446397,0.791083
1200,0.4347,0.442719,0.796786
1400,0.4275,0.434321,0.796656


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: input_txt, claim2, __index_level_0__, claim1.
***** Running Evaluation *****
  Num examples = 7716
  Batch size = 64
Saving model checkpoint to ../data/output/stance_classification/checkpoint-200
Configuration saved in ../data/output/stance_classification/checkpoint-200/config.json
Model weights saved in ../data/output/stance_classification/checkpoint-200/pytorch_model.bin
tokenizer config file saved in ../data/output/stance_classification/checkpoint-200/tokenizer_config.json
Special tokens file saved in ../data/output/stance_classification/checkpoint-200/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: input_txt, claim2, __index_level_0__, claim1.
***** Running Evaluation *****
  Num examples = 7716
  Batch size = 

TrainOutput(global_step=1495, training_loss=0.4676041695585219, metrics={'train_runtime': 425.0499, 'train_samples_per_second': 225.065, 'train_steps_per_second': 3.517, 'total_flos': 1.258512799997952e+16, 'train_loss': 0.4676041695585219, 'epoch': 1.0})

In [11]:
# Write the trainer's current model (config and weights) together with the
# tokenizer files to ./saved/model.
trainer.save_model('./saved/model')

Saving model checkpoint to ./saved/model
Configuration saved in ./saved/model/config.json
Model weights saved in ./saved/model/pytorch_model.bin
tokenizer config file saved in ./saved/model/tokenizer_config.json
Special tokens file saved in ./saved/model/special_tokens_map.json


In [12]:
# Reload the fine-tuned weights from ./saved/model, on the GPU when one is
# available and on the CPU otherwise.
# NOTE: this only rebinds the name `model`; the existing `trainer` keeps the
# model object it was constructed with.
model = AutoModelForSequenceClassification.from_pretrained('./saved/model').to(
    torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)

loading configuration file ./saved/model/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ./saved/model/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForSequenceClassification.


In [13]:
# Build the test inputs in the same '<claim1> </s> <claim2>' format as the
# training inputs, using column-wise concatenation instead of a row-wise apply.
test_df['input_txt'] = test_df['claim1'] + ' </s> ' + test_df['claim2']
# Seed the shuffle so the order of the displayed predictions is reproducible.
test_dataset = Dataset.from_pandas(test_df.sample(frac=1, random_state=42))
tokenized_test = test_dataset.map(
    lambda a: tokenizer(a['input_txt'], padding='max_length', max_length=256, truncation=True),
    batched=True,
)

# Run inference with the trainer's model; the PredictionOutput holds the
# per-class scores, the reference labels and the test metrics.
pred = trainer.predict(tokenized_test)
pred

  0%|          | 0/23 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: input_txt, claim2, __index_level_0__, claim1.
***** Running Prediction *****
  Num examples = 22454
  Batch size = 64


PredictionOutput(predictions=array([[ 0.25865212, -0.24357605],
       [-1.9126097 ,  1.9990443 ],
       [ 0.03257468,  0.10451257],
       ...,
       [ 0.34530902, -0.31135717],
       [ 2.0949996 , -2.0372036 ],
       [-1.9549632 ,  2.1160939 ]], dtype=float32), label_ids=array([0, 1, 1, ..., 0, 0, 1]), metrics={'test_loss': 0.42118096351623535, 'test_accuracy': 0.8068050236038122, 'test_runtime': 27.4998, 'test_samples_per_second': 816.516, 'test_steps_per_second': 12.764})

In [14]:
# Predicted class id for each test example: the index of the larger score.
scores = pred.predictions.argmax(axis=1)

In [15]:
# Accuracy of the predicted labels against the reference labels returned with
# the predictions (the same labels as the test dataset's 'label' column).
metric.compute(predictions=scores, references=pred.label_ids)

{'accuracy': 0.8068050236038122}