## Imports and setup

In [1]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    set_seed,
    DataCollator
)
from typing import Callable, Dict, Optional, List
import logging
import os
import random
import time
import math
from dataclasses import dataclass, field
import numpy as np
from torch.utils.tensorboard import SummaryWriter


# Emit INFO-level records so the transformers progress messages appear in the cell output.
logging.basicConfig(level=logging.INFO)


# Seed the random number generators (via transformers' set_seed helper) before the model is created.
set_seed(37)

## Hyperparameters

In [2]:
# Pretrained checkpoint name; used to load both the tokenizer and the classifier.
PRETRAINED_MODEL = 'bert-base-uncased'
# Number of output classes, passed as num_labels when the classifier is loaded.
NUM_LABELS = 2
# NOTE(review): not referenced by any cell visible in this notebook; the collator pads each
# batch to its own longest sequence instead. Confirm whether this constant is still needed.
PAD_MAX_LEN = 65
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 256
# Number of training epochs.
MAX_EPOCH = 5

## Instantiate tokenizer and model

In [3]:
# Load the tokenizer and the sequence-classification model from the same pretrained checkpoint;
# the classifier is configured with NUM_LABELS output classes.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=NUM_LABELS)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/tidarren1020/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/tidarren1020/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,

In [4]:
# short_jokes_path = './RedditHumorDetection/full_datasets/short_jokes/data/shortjokes.csv'
# short_jokes_train_path = './RedditHumorDetection/data/short_jokes/train.tsv'
# short_jokes_test_path = './RedditHumorDetection/data/short_jokes/test.tsv'
# short_jokes_dev_path = './RedditHumorDetection/data/short_jokes/dev.tsv'

## Dataset

In [5]:
# Locations of the puns splits, relative to the notebook's working directory.
# NOTE(review): puns_test_path is defined but not used by any cell visible here.
puns_train_path = './RedditHumorDetection/data/puns/train.tsv'
puns_test_path = './RedditHumorDetection/data/puns/test.tsv'
puns_dev_path = './RedditHumorDetection/data/puns/dev.tsv'

In [6]:
class HumorDataset(Dataset):
    """Map-style dataset that yields one tokenized (text, label) example per index.

    The file at ``data_path`` is read eagerly with ``pandas.read_csv`` as a
    header-less table with the columns ``id, label, a, text``; only ``label``
    and ``text`` are kept. Note that the file is parsed with pandas' default
    comma delimiter, regardless of its extension.

    Args:
        data_path: Path of the preprocessed data file.
        tokenizer: Object exposing ``encode_plus(text)`` that returns a
            dict-like encoding (e.g. a BERT tokenizer).
        dataAug: Augmentation mode. Only the string ``'None'`` (no
            augmentation) is implemented.
    """

    def __init__(self, data_path, tokenizer, dataAug='None'):
        self.data_path = data_path
        # The whole file is loaded into memory; for very large files consider
        # reading it in chunks (read_csv(..., iterator=True)).
        self.df = pd.read_csv(data_path, header=None, names=['id','label','a','text'])
        self.df = self.df[['label','text']]
        self.len = len(self.df)
        self.tokenizer = tokenizer
        self.dataAug = dataAug

    def __getitem__(self, idx):
        """Return the tokenizer encoding of row ``idx`` with its label under ``'label'``.

        Raises:
            NotImplementedError: If ``dataAug`` is anything other than ``'None'``.
                Previously such modes silently returned ``None``, which only
                failed later inside the collator.
        """
        if self.dataAug != 'None':
            raise NotImplementedError(
                "Unsupported dataAug mode {!r}; only 'None' is implemented.".format(self.dataAug)
            )

        text = self.df.loc[idx, 'text']
        label = self.df.loc[idx, 'label']

        # Use the tokenizer injected at construction time rather than a
        # notebook-global name, so the dataset works with any tokenizer instance.
        inputDict = self.tokenizer.encode_plus(text)
        inputDict['label'] = label
        return inputDict

    def __len__(self):
        """Return the number of rows loaded from the data file."""
        return self.len

In [7]:
def pad_seq(seq, max_batch_len, pad_value):
    """Return a new list: ``seq`` right-padded with ``pad_value`` up to ``max_batch_len`` items.

    A sequence that is already at least ``max_batch_len`` long comes back
    with no padding added (it is never truncated).
    """
    missing = max_batch_len - len(seq)
    filler = [pad_value] * missing
    return seq + filler

class Collator(DataCollator):
    """Collate tokenized examples into tensors, padding to the longest example in the batch."""

    def __init__(self, pad_token_id):
        # Token id appended to input_ids of examples shorter than the batch maximum.
        self.pad_token_id = pad_token_id

    def collate_batch(self, batch):
        """Build ``input_ids``, ``attention_mask`` and ``labels`` tensors for one batch.

        Each element of ``batch`` is read through its ``'input_ids'``,
        ``'attention_mask'`` and ``'label'`` entries. Input ids are padded with
        ``self.pad_token_id`` and attention masks with 0.
        """
        longest = max([len(example['input_ids']) for example in batch])

        padded_ids = [
            pad_seq(example['input_ids'], longest, self.pad_token_id)
            for example in batch
        ]
        padded_masks = [
            pad_seq(example['attention_mask'], longest, 0)
            for example in batch
        ]
        targets = [example['label'] for example in batch]

        collated = {
            "input_ids": torch.tensor(padded_ids, dtype=torch.long),
            "attention_mask": torch.tensor(padded_masks, dtype=torch.long),
            "labels": torch.tensor(targets, dtype=torch.long),
        }
        return collated

In [8]:
# Build the training and evaluation datasets from the puns train and dev files;
# both share the tokenizer loaded above.
train_dataset = HumorDataset(puns_train_path, tokenizer)
eval_dataset = HumorDataset(puns_dev_path, tokenizer)

## Trainer

In [9]:
def compute_metrics(p: EvalPrediction):
    """Compute accuracy: the fraction of rows whose argmax over axis 1 equals the label id.

    Returns a dict with the single key ``"acc"``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    is_correct = predicted_ids == p.label_ids
    accuracy = is_correct.mean()
    return {"acc": accuracy}

In [10]:
NUM_TRAINSET = len(train_dataset)
# Batches per epoch (rounded up). Used as the logging and checkpoint interval so that,
# on a single device without gradient accumulation, both fire once per epoch.
LOGGING_STEPS = math.ceil(NUM_TRAINSET/BATCH_SIZE)

training_args = TrainingArguments(
    output_dir="./models/baseline",  # checkpoints are written below this directory
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=MAX_EPOCH,
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,  # also log (and, per the run output, evaluate) after the very first step
    save_steps=LOGGING_STEPS,
    evaluate_during_training=True,  # evaluation runs alongside each logging event in the run output
    logging_dir="./logs",
    # Learning rate is left at the library default; uncomment to override.
    #learning_rate=2e-5,
)

In [11]:
# Wire the model, datasets, padding collator and accuracy metric into a Trainer.
# The TensorBoard writer targets 'logs', the same directory as training_args.logging_dir.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # Pad each batch with the tokenizer's own pad token id.
        data_collator=Collator(pad_token_id=tokenizer.pad_token_id),
        compute_metrics=compute_metrics,
        tb_writer=SummaryWriter(log_dir='logs', flush_secs=10),
    )

INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


## Train

In [12]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

INFO:transformers.trainer:***** Running training *****
INFO:transformers.trainer:  Num examples = 3619
INFO:transformers.trainer:  Num Epochs = 5
INFO:transformers.trainer:  Instantaneous batch size per device = 256
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 256
INFO:transformers.trainer:  Gradient Accumulation steps = 1
INFO:transformers.trainer:  Total optimization steps = 75


HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=15, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


{"loss": 0.045486176013946535, "learning_rate": 4.933333333333334e-05, "epoch": 0.06666666666666667, "step": 1}


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…

{"eval_loss": 0.6839236815770467, "eval_acc": 0.5373134328358209, "epoch": 0.06666666666666667, "step": 1}


INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


{"loss": 0.3989637017250061, "learning_rate": 4e-05, "epoch": 1.0, "step": 15}


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/baseline/checkpoint-15
INFO:transformers.configuration_utils:Configuration saved in ./models/baseline/checkpoint-15/config.json


{"eval_loss": 0.29926353693008423, "eval_acc": 0.8855721393034826, "epoch": 1.0, "step": 15}


INFO:transformers.modeling_utils:Model weights saved in ./models/baseline/checkpoint-15/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=15, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


{"loss": 0.20255417625109354, "learning_rate": 3e-05, "epoch": 2.0, "step": 30}


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/baseline/checkpoint-30
INFO:transformers.configuration_utils:Configuration saved in ./models/baseline/checkpoint-30/config.json


{"eval_loss": 0.24340736865997314, "eval_acc": 0.9104477611940298, "epoch": 2.0, "step": 30}


INFO:transformers.modeling_utils:Model weights saved in ./models/baseline/checkpoint-30/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=15, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


{"loss": 0.11246620814005534, "learning_rate": 2e-05, "epoch": 3.0, "step": 45}


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/baseline/checkpoint-45
INFO:transformers.configuration_utils:Configuration saved in ./models/baseline/checkpoint-45/config.json


{"eval_loss": 0.2704348564147949, "eval_acc": 0.912106135986733, "epoch": 3.0, "step": 45}


INFO:transformers.modeling_utils:Model weights saved in ./models/baseline/checkpoint-45/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=15, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


{"loss": 0.05951941758394241, "learning_rate": 1e-05, "epoch": 4.0, "step": 60}


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/baseline/checkpoint-60
INFO:transformers.configuration_utils:Configuration saved in ./models/baseline/checkpoint-60/config.json


{"eval_loss": 0.2797339806954066, "eval_acc": 0.9187396351575456, "epoch": 4.0, "step": 60}


INFO:transformers.modeling_utils:Model weights saved in ./models/baseline/checkpoint-60/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=15, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


{"loss": 0.04176846531530221, "learning_rate": 0.0, "epoch": 5.0, "step": 75}


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/baseline/checkpoint-75
INFO:transformers.configuration_utils:Configuration saved in ./models/baseline/checkpoint-75/config.json


{"eval_loss": 0.29341379801432294, "eval_acc": 0.912106135986733, "epoch": 5.0, "step": 75}


INFO:transformers.modeling_utils:Model weights saved in ./models/baseline/checkpoint-75/pytorch_model.bin
INFO:transformers.trainer:

Training completed. Do not forget to share your model on huggingface.co/models =)







TrainOutput(global_step=75, training_loss=0.17215162900586922)

In [13]:
# Evaluate the trained model on the Trainer's eval_dataset (the puns dev split) and keep the metrics dict.
result = trainer.evaluate()

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 256


HBox(children=(IntProgress(value=0, description='Evaluation', max=3, style=ProgressStyle(description_width='in…


{"eval_loss": 0.29341379801432294, "eval_acc": 0.912106135986733, "epoch": 5.0, "step": 75}


In [14]:
# short_jokes_dev_path = './RedditHumorDetection/data/short_jokes/dev.tsv'
# short_joke_dev = HumorDataset(short_jokes_dev_path, tokenizer)

In [15]:
# trainer.evaluate(short_joke_dev)

## Load checkpoint
**TODO:** Should the tokenizer be saved alongside the model checkpoint as well?

In [21]:
# Reload the fine-tuned classifier from the step-30 checkpoint written during training
# (step 30 has the lowest eval_loss in the training log; presumably that is why it was chosen).
# NOTE(review): this only rebinds the name `model`; the existing `trainer` still holds the
# model object it was constructed with.
model = BertForSequenceClassification.from_pretrained('./models/baseline/checkpoint-30/')

INFO:transformers.configuration_utils:loading configuration file ./models/baseline/checkpoint-30/config.json
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file ./models/baseline/checkpoint-30/pytorch_model.bin
