In [3]:
import torch
import numpy as np
import pandas as pd
from transformers import BertModel
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, AutoTokenizer
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification, Trainer, TrainerCallback, TrainingArguments, DataCollatorWithPadding
import copy

In [4]:
train_data = pd.read_csv('input/arguments-training.tsv', sep="\t")
val_data = pd.read_csv('input/arguments-validation.tsv', sep="\t")
train_labels = pd.read_csv('input/labels-training.tsv', sep="\t")
val_labels = pd.read_csv('input/labels-validation.tsv', sep="\t")

In [5]:
train_data['combined1'] = train_data.Conclusion.str.cat(train_data.Stance, sep=' [SEP] ')
train_data['combined2'] = train_data.combined1.str.cat(train_data.Premise, sep=' [SEP] ')
val_data['combined1'] = val_data.Conclusion.str.cat(val_data.Stance, sep=' [SEP] ')
val_data['combined2'] = val_data.combined1.str.cat(val_data.Premise, sep=' [SEP] ')

In [6]:
combined = train_data.combined2.values.tolist()
val_combined = val_data.combined2.values.tolist()
label_list = train_labels.values.tolist()
val_label_list = val_labels.values.tolist()
[x[3:5] for x in [combined, val_combined, label_list, val_label_list]]

[['We should ban naturopathy [SEP] against [SEP] it provides a useful income for some people',
  'We should ban fast food [SEP] in favor of [SEP] fast food should be banned because it is really bad for your health and is costly.'],
 ["Surrogacy should be banned [SEP] against [SEP] Surrogacy should not be banned as it is the woman's right to choose if she wishes to do this for another couple and be compensated.",
  'Entrapment should be legalized [SEP] against [SEP] entrapment is gravely immoral and against human rights to coerce someone into a crime'],
 [['A01004', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  ['A01005', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [['A02002', 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  ['A02009', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1]]]

In [7]:
def gen_tokens(combined, val_combined): #best to functionalize, this function is quite slow so lets not call it on every kernel restart.
    symbols = []
    for each in combined:
        ids = tok(each, add_special_tokens=True, padding="max_length", return_tensors='pt')
        symbols.append(ids)

    val_symbols = []
    for each in val_combined:
        ids = tok(each, add_special_tokens=True, padding="max_length", return_tensors='pt')
        val_symbols.append(ids)

    bert = BertModel.from_pretrained("prajjwal1/bert-small")

    X = []
    for s in symbols:
        encoded_sequence = bert(**s)
        bert_output = encoded_sequence.pooler_output
        output = torch.unsqueeze(bert_output,1)
        X.append(output[0][0].tolist())
    print(len(X),X[0])
    X_val = []
    for s in val_symbols:
        encoded_sequence = bert(**s)
        bert_output = encoded_sequence.pooler_output
        output = torch.unsqueeze(bert_output,1)
        X_val.append(output[0][0].tolist())
    print(np.array(label_list).shape)
    
    clf = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X, np.array(label_list)) 
    p = clf.predict(X_val)
    print(classification_report(val_label_list, p))
    return p 
# gen_tokens(combined, val_combined)

Lets generate a list of labels to feed into our model.

In [8]:
#lets generate Dataset objects so we can feed it into huggingface.Trainer
valid_data = val_data.merge(val_labels, how = 'left', on = 'Argument ID')
t_data = train_data.merge(train_labels, how = 'left', on = 'Argument ID')

if 'Argument ID' in valid_data.columns: valid_data.drop(['Argument ID', 'Conclusion', 'Stance', 'Premise', 'combined1'], axis=1, inplace=True)
if 'Argument ID' in t_data.columns: t_data.drop(['Argument ID', 'Conclusion', 'Stance', 'Premise', 'combined1'],axis=1, inplace=True)
valid_data.columns, t_data.columns

(Index(['combined2', 'Self-direction: thought', 'Self-direction: action',
        'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance',
        'Power: resources', 'Face', 'Security: personal', 'Security: societal',
        'Tradition', 'Conformity: rules', 'Conformity: interpersonal',
        'Humility', 'Benevolence: caring', 'Benevolence: dependability',
        'Universalism: concern', 'Universalism: nature',
        'Universalism: tolerance', 'Universalism: objectivity'],
       dtype='object'),
 Index(['combined2', 'Self-direction: thought', 'Self-direction: action',
        'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance',
        'Power: resources', 'Face', 'Security: personal', 'Security: societal',
        'Tradition', 'Conformity: rules', 'Conformity: interpersonal',
        'Humility', 'Benevolence: caring', 'Benevolence: dependability',
        'Universalism: concern', 'Universalism: nature',
        'Universalism: tolerance', 'Universalism: objectivity

In [9]:
#huggingface dataset.from_pandas for ex.

train_to_dataset = Dataset.from_pandas(t_data)
val_to_dataset = Dataset.from_pandas(valid_data)
dataset = DatasetDict({ "train": train_to_dataset, "validation": val_to_dataset})
labels = list(valid_data.columns)
labels = labels[1:] #ignore combined2
print(labels)
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}


['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']


In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#from huggingface docu
def preprocess(items):
  combined = items['combined2']
  encoding = tokenizer(combined, padding="max_length", truncation=True, max_length=128)

  l_b = {k: items[k] for k in items.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  print(len(combined))
  labels_mat = np.zeros((len(combined), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_mat[:, idx] = l_b[label]

  encoding["labels"] = labels_mat.tolist()
  
  return encoding

In [26]:
dataset_cop = copy.deepcopy(dataset)
encoded_dataset = dataset_cop.map(preprocess, batched=True,remove_columns=dataset['train'].column_names)

  0%|          | 0/6 [00:00<?, ?ba/s]

1000
1000
1000
1000
1000
220


  0%|          | 0/2 [00:00<?, ?ba/s]

1000
896


dict

In [29]:
type(encoded_dataset['train'])

datasets.arrow_dataset.Dataset

In [13]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [56]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
def compute_metrics(eval):
    result = mlmetrics(
        pred=eval.predictions, 
        y_true=eval.label_ids)
    return result

def mlmetrics(pred, y_true):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(pred))

    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= 0.5)] = 1
    #can't sklearn.metrics.classification_report as huggingface expects string:int mapping, vs string:string.
    precision_micro_average = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_micro_average = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {'p': precision_micro_average,
               'r': recall_micro_average,
               'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

In [57]:
test_train, test_valid = t_data[0:2], valid_data[0:2]
td = Dataset.from_pandas(test_train)
vd = Dataset.from_pandas(test_valid)
datasetTest = DatasetDict({ "train": td, "validation": vd})
e_d = datasetTest.map(preprocess, batched=True,remove_columns=datasetTest['train'].column_names)
e_d
# datasetTest['train'].column_names

  0%|          | 0/1 [00:00<?, ?ba/s]

2


  0%|          | 0/1 [00:00<?, ?ba/s]

2


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})

In [64]:
LEARNING_RATE, BATCH_SIZE, EPOCHS = 1e-4, 8, 2

training_args = TrainingArguments(
    output_dir="camembert-fine-tuned",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [65]:
trainer.train()

***** Running training *****
  Num examples = 5220
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1306
  Number of trainable parameters = 109497620


  0%|          | 0/1306 [00:00<?, ?it/s]

{'loss': 0.3041, 'learning_rate': 6.171516079632466e-05, 'epoch': 0.77}


***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to camembert-fine-tuned/checkpoint-653
Configuration saved in camembert-fine-tuned/checkpoint-653/config.json


{'eval_loss': 0.29577159881591797, 'eval_p': 0.7581536760641239, 'eval_r': 0.43135713162446926, 'eval_f1': 0.5498646887842037, 'eval_roc_auc': 0.7018165208833668, 'eval_accuracy': 0.10917721518987342, 'eval_runtime': 76.3151, 'eval_samples_per_second': 24.844, 'eval_steps_per_second': 3.106, 'epoch': 1.0}


Model weights saved in camembert-fine-tuned/checkpoint-653/pytorch_model.bin


{'loss': 0.2641, 'learning_rate': 2.343032159264931e-05, 'epoch': 1.53}


***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to camembert-fine-tuned/checkpoint-1306
Configuration saved in camembert-fine-tuned/checkpoint-1306/config.json


{'eval_loss': 0.2574668824672699, 'eval_p': 0.7941642162406696, 'eval_r': 0.5521308381821041, 'eval_f1': 0.65139146567718, 'eval_roc_auc': 0.7616488923650294, 'eval_accuracy': 0.1719409282700422, 'eval_runtime': 77.0235, 'eval_samples_per_second': 24.616, 'eval_steps_per_second': 3.077, 'epoch': 2.0}


Model weights saved in camembert-fine-tuned/checkpoint-1306/pytorch_model.bin
Deleting older checkpoint [camembert-fine-tuned/checkpoint-2] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from camembert-fine-tuned/checkpoint-1306 (score: 0.65139146567718).


{'train_runtime': 1585.2675, 'train_samples_per_second': 6.586, 'train_steps_per_second': 0.824, 'train_loss': 0.2744884461757783, 'epoch': 2.0}


TrainOutput(global_step=1306, training_loss=0.2744884461757783, metrics={'train_runtime': 1585.2675, 'train_samples_per_second': 6.586, 'train_steps_per_second': 0.824, 'train_loss': 0.2744884461757783, 'epoch': 2.0})

In [66]:
results = trainer.evaluate()


***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

In [67]:
results

{'eval_loss': 0.2574668824672699,
 'eval_p': 0.7941642162406696,
 'eval_r': 0.5521308381821041,
 'eval_f1': 0.65139146567718,
 'eval_roc_auc': 0.7616488923650294,
 'eval_accuracy': 0.1719409282700422,
 'eval_runtime': 74.5473,
 'eval_samples_per_second': 25.434,
 'eval_steps_per_second': 3.179,
 'epoch': 2.0}