## Imports and Setup

In [1]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    set_seed,
    DataCollator
)
from typing import Callable, Dict, Optional, List
import logging
import os
import random
import time
import math
from dataclasses import dataclass, field
import numpy as np
from torch.utils.tensorboard import SummaryWriter


# Show INFO-level log records (the transformers loading / training messages
# captured in the cell outputs are emitted at this level).
logging.basicConfig(level=logging.INFO)


# Seed through transformers' `set_seed` helper so repeated runs are comparable.
set_seed(37)

## Hyperparameters

In [2]:
SAVED_DIR_NAME = 'DP_EDA'  # sub-directory of ./models/ used as the Trainer output_dir
PRETRAINED_MODEL = 'bert-base-uncased'  # checkpoint name shared by the tokenizer and the model
NUM_LABELS = 2  # number of output classes passed to the classification model
PAD_MAX_LEN = 65  # NOTE(review): not referenced by any visible cell; padding is per-batch in Collator
BATCH_SIZE = 128  # per-device batch size for both training and evaluation
MAX_EPOCH = 5  # number of training epochs

## Tokenizer and Model Instances

In [3]:
# Load the pretrained BERT tokenizer and a sequence-classification model with
# NUM_LABELS output classes from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=NUM_LABELS)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/tidarren1020/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/tidarren1020/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,

In [4]:
# short_jokes_path = './RedditHumorDetection/full_datasets/short_jokes/data/shortjokes.csv'
# short_jokes_train_path = './RedditHumorDetection/data/short_jokes/train.tsv'
# short_jokes_test_path = './RedditHumorDetection/data/short_jokes/test.tsv'
# short_jokes_dev_path = './RedditHumorDetection/data/short_jokes/dev.tsv'

## Dataset

In [5]:
# Locations of the puns splits, relative to the notebook's working directory.
# Only puns_dev_path is read by the visible cells (to build the evaluation set).
puns_train_path = './RedditHumorDetection/data/puns/train.tsv'
puns_test_path = './RedditHumorDetection/data/puns/test.tsv'
puns_dev_path = './RedditHumorDetection/data/puns/dev.tsv'

## Data Augmentation: Paragraph Decomposition

In [6]:
def load_data(data_path):
    """Read a header-less four-column file and keep only label and text.

    The file is parsed with ``pd.read_csv`` using its default (comma)
    delimiter; the four columns are named ``id``, ``label``, ``a`` and
    ``text`` in that order.

    Args:
        data_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the columns ``label`` and ``text``.
    """
    column_names = ['id', 'label', 'a', 'text']
    raw = pd.read_csv(data_path, names=column_names, header=None)
    return raw[['label', 'text']]

In [7]:
def paragraph_decmposition(data_path):
    """Augment a dataset by splitting every text into (prefix, suffix) pairs.

    Each text loaded from ``data_path`` is tokenised on whitespace and cut at
    every position ``i`` with ``2 <= i < len(tokens)``: the first ``i`` tokens
    become ``text_a`` and the remaining tokens become ``text_b``. Every pair
    keeps the label of its source row and stores that row's index as
    ``origin_id``. Texts with two tokens or fewer yield no pairs.

    The label counts before and after augmentation are printed.

    Args:
        data_path: Path handed to ``load_data``.

    Returns:
        pandas.DataFrame with the columns ``text_a``, ``text_b``, ``label``
        and ``origin_id``. The columns are present even when no pair could
        be generated.
    """
    df = load_data(data_path)
    columns = ['text_a', 'text_b', 'label', 'origin_id']

    dataAug = []
    for _id, row in df.iterrows():
        tokens = row['text'].split()
        label = row['label']

        # Start at 2 so the prefix always holds at least two tokens, and stop
        # before len(tokens) so the suffix is never empty.
        for i in range(2, len(tokens)):
            dataAug.append({
                'text_a': ' '.join(tokens[:i]),
                'text_b': ' '.join(tokens[i:]),
                'label': label,
                'origin_id': _id,
            })

    # Naming the columns explicitly keeps the schema when dataAug is empty;
    # without it the frame has no `label` column and the summary below raises
    # AttributeError.
    df_dataAug = pd.DataFrame(dataAug, columns=columns)

    print('=== Data Augmentation: paragraph decmposition ===')
    print('[Before]')
    print('# of label=0:', int((df.label == 0).sum()))
    print('# of label=1:', int((df.label == 1).sum()))
    print('\n[After]')
    print('# of label=0:', int((df_dataAug.label == 0).sum()))
    print('# of label=1:', int((df_dataAug.label == 1).sum()))
    print('=== end ===')
    return df_dataAug

In [8]:
class HumorDataset(Dataset):
    """Map-style dataset that tokenises rows of a DataFrame on access.

    In ``'train'`` mode each row must provide ``text_a`` and ``text_b``, which
    are encoded together as a sentence pair. In any other mode the row must
    provide ``text``, which is encoded on its own. Both modes need a
    ``label`` column.

    Rows are looked up with ``df.loc[idx, ...]``, i.e. by index *label*, so
    the frame is expected to carry a default 0..n-1 index (for example after
    ``reset_index(drop=True)``).
    """

    # Keep the preprocessed frame and the settings needed to encode its rows.
    def __init__(self, df, tokenizer, mode='train'):
        self.df = df
        self.len = len(self.df)
        self.tokenizer = tokenizer  # the BERT tokenizer in this notebook
        self.mode = mode

    # Return one encoded training / evaluation example.
    def __getitem__(self, idx):
        # Use the tokenizer given to this instance. Previously the module-level
        # `tokenizer` was used, which silently ignored the constructor argument.
        if self.mode == 'train':
            text_a = self.df.loc[idx, 'text_a']
            text_b = self.df.loc[idx, 'text_b']
            inputDict = self.tokenizer.encode_plus(text_a, text_b)
        else:
            text_a = self.df.loc[idx, 'text']
            inputDict = self.tokenizer.encode_plus(text_a)

        # Attach the target so the collator can build the `labels` tensor.
        inputDict['label'] = self.df.loc[idx, 'label']

        return inputDict

    def __len__(self):
        return self.len

In [9]:
def pad_seq(seq, max_batch_len, pad_value):
    """Return ``seq`` right-padded with ``pad_value`` up to ``max_batch_len``.

    A new list is returned; ``seq`` itself is not modified. If ``seq`` is
    already at least ``max_batch_len`` long, no padding is added.
    """
    n_missing = max_batch_len - len(seq)
    padding = [pad_value] * n_missing  # empty when n_missing <= 0
    return seq + padding

class Collator(DataCollator):
    """Collator that pads each example to the longest sequence in its batch."""

    def __init__(self, pad_token_id):
        # Token id written into the padded tail of `input_ids`.
        self.pad_token_id = pad_token_id

    def collate_batch(self, batch):
        """Turn a list of encoded examples into a dict of long tensors.

        Each example must provide ``input_ids``, ``attention_mask``,
        ``token_type_ids`` (lists) and ``label``. ``input_ids`` is padded with
        ``self.pad_token_id``; the mask and the token-type ids are padded
        with 0.

        Returns:
            dict with the keys ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``.
        """
        longest = max(len(example['input_ids']) for example in batch)

        input_rows, mask_rows, type_rows, label_values = [], [], [], []
        for example in batch:
            input_rows.append(pad_seq(example['input_ids'], longest, self.pad_token_id))
            mask_rows.append(pad_seq(example['attention_mask'], longest, 0))
            type_rows.append(pad_seq(example['token_type_ids'], longest, 0))
            label_values.append(example['label'])

        tensors = {}
        tensors["input_ids"] = torch.tensor(input_rows, dtype=torch.long)
        tensors["attention_mask"] = torch.tensor(mask_rows, dtype=torch.long)
        tensors["token_type_ids"] = torch.tensor(type_rows, dtype=torch.long)
        tensors["labels"] = torch.tensor(label_values, dtype=torch.long)
        return tensors

## Load Augmented Data

In [10]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action
import os

## Synonym Replacement

In [11]:
# Demo: request five WordNet synonym-replacement variants of a single sentence.
text = 'an elevator makes ghosts happy because it lifts'

aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text, n=5)
print("Original:")
print(text)
print("Augmented Text:")
# In the recorded run augment(..., n=5) produced five strings; print one per line.
for t in augmented_text:
    print(t)

Original:
an elevator makes ghosts happy because it lifts
Augmented Text:
an elevator makes ghosts happy because information technology lifts
an elevator makes ghost happy because information technology lifts
an elevator makes ghosts glad because information technology lifts
an elevator makes ghosts felicitous because information technology lifts
an lift have ghosts happy because it lifts


In [12]:
def synonym_replacement(punchline_csv='df_puns_punchline.csv', n_aug=5):
    """Create synonym-replaced copies of every row of a punchline CSV.

    For each row, ``text_a`` is passed to a WordNet ``SynonymAug`` augmenter
    and one output row is emitted per augmented variant. All other columns
    of the source row are copied unchanged.

    Args:
        punchline_csv: CSV file to read; it must contain a ``text_a`` column.
            Defaults to the file the notebook originally hard-coded.
        n_aug: Number of variants requested from the augmenter for each row.

    Returns:
        pandas.DataFrame with the same columns as the input CSV and one row
        per augmented variant.
    """
    aug = naw.SynonymAug(aug_src='wordnet')
    df_puns_punchline = pd.read_csv(punchline_csv)

    data_synonym_replacement = []
    for _, row in df_puns_punchline.iterrows():
        augmented_text = aug.augment(row['text_a'], n=n_aug)
        # NOTE(review): some nlpaug versions may hand back a bare string
        # instead of a list. Wrap it so the loop below does not iterate over
        # single characters.
        if isinstance(augmented_text, str):
            augmented_text = [augmented_text]

        base = row.to_dict()
        for text in augmented_text:
            d_tmp = dict(base)
            d_tmp['text_a'] = text
            data_synonym_replacement.append(d_tmp)

    # Reuse the source columns so the schema survives an empty result.
    return pd.DataFrame(data_synonym_replacement, columns=df_puns_punchline.columns)

In [13]:
# Read the synonym-replacement rows from disk rather than recomputing them.
# NOTE(review): df_puns_sr.csv is presumably the saved output of
# synonym_replacement(); the cell that writes it is not in this notebook.
# df_puns_sr = synonym_replacement()
df_puns_sr = pd.read_csv('df_puns_sr.csv')

In [14]:
# Preview the first 30 synonym-replacement rows.
df_puns_sr.head(30)

Unnamed: 0,label,origin_id,punchline_idx,text_a,text_b
0,0,0,7,One m hoping they ll come and see this,and say We have to have this
1,0,0,7,I m skip they ll come and see this,and say We have to have this
2,0,0,7,1 m hop skip they ll come and see this,and say We have to have this
3,0,0,7,Iodin m hoping they ll come and see this,and say We have to have this
4,0,0,7,I m hoping they ll come and take in this,and say We have to have this
5,1,1,15,a man world health organization cannot take th...,illiterate
6,1,1,15,a piece world health organization cannot read ...,illiterate
7,1,1,15,a world world health organization cannot show ...,illiterate
8,1,1,15,a human race world health organization cannot ...,illiterate
9,1,1,15,a man who cannot scan the sign that warn peopl...,illiterate


In [17]:
# Read the paragraph-decomposition rows from disk and summarise the frame.
# NOTE(review): df_puns_pd.csv is presumably the saved output of
# paragraph_decmposition(); no visible cell creates it — confirm.
df_puns_pd = pd.read_csv('df_puns_pd.csv')
df_puns_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41681 entries, 0 to 41680
Data columns (total 4 columns):
label        41681 non-null int64
origin_id    41681 non-null int64
text_a       41681 non-null object
text_b       41681 non-null object
dtypes: int64(2), object(2)
memory usage: 1.3+ MB


In [18]:
# Stack both augmented sets into one training frame with a fresh 0..n-1 index.
# `punchline_idx` exists only in df_puns_sr, so it is NaN for the df_puns_pd rows.
df_concat = pd.concat([df_puns_pd, df_puns_sr], sort=False).reset_index(drop=True)
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58834 entries, 0 to 58833
Data columns (total 5 columns):
label            58834 non-null int64
origin_id        58834 non-null int64
text_a           58834 non-null object
text_b           58834 non-null object
punchline_idx    17153 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.2+ MB


In [19]:
# Preview the first rows of the combined training frame.
df_concat.head()

Unnamed: 0,label,origin_id,text_a,text_b,punchline_idx
0,0,0,I m,hoping they ll come and see this and say We ha...,
1,0,0,I m hoping,they ll come and see this and say We have to h...,
2,0,0,I m hoping they,ll come and see this and say We have to have this,
3,0,0,I m hoping they ll,come and see this and say We have to have this,
4,0,0,I m hoping they ll come,and see this and say We have to have this,


In [20]:
# Evaluation data: the un-augmented dev split (columns `label` and `text`).
df_puns_dev = load_data(puns_dev_path)

# Training uses the default 'train' mode (text_a / text_b pairs); evaluation
# uses a non-train mode so that the single `text` column is encoded instead.
train_dataset = HumorDataset(df_concat, tokenizer)
eval_dataset = HumorDataset(df_puns_dev, tokenizer, mode='dev')

## Trainer

In [21]:
def compute_metrics(p: EvalPrediction):
    """Compute accuracy for the Trainer.

    Args:
        p: Object exposing ``predictions`` (per-class scores, arg-maxed over
            axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with the single key ``"acc"``: the fraction of examples whose
        arg-max class equals the label.
    """
    predicted = np.argmax(p.predictions, axis=1)
    hits = predicted == p.label_ids
    accuracy = hits.mean()
    return {"acc": accuracy}

In [22]:
# One logging interval equals the number of batches per epoch, so logging and
# checkpointing (which share LOGGING_STEPS) happen once per epoch.
NUM_TRAINSET = len(train_dataset)
LOGGING_STEPS = math.ceil(NUM_TRAINSET/BATCH_SIZE)

# Checkpoints go to ./models/<SAVED_DIR_NAME>. The learning rate is left at the
# library default because the override below is commented out.
training_args = TrainingArguments(
    output_dir="./models/{}".format(SAVED_DIR_NAME),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=MAX_EPOCH,
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,
    save_steps=LOGGING_STEPS,
    evaluate_during_training=True,
    logging_dir="./logs",
    #learning_rate=2e-5,
)

In [23]:
# Wire the model, datasets, batch-padding collator and accuracy metric into a
# Trainer; TensorBoard events are written to ./logs and flushed every 10 s.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Collator(pad_token_id=tokenizer.pad_token_id),
        compute_metrics=compute_metrics,
        tb_writer=SummaryWriter(log_dir='logs', flush_secs=10),
    )

INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


## Train

In [24]:
# Fine-tune the model; the log below shows one evaluation and checkpoint per epoch.
trainer.train()

INFO:transformers.trainer:***** Running training *****
INFO:transformers.trainer:  Num examples = 58834
INFO:transformers.trainer:  Num Epochs = 5
INFO:transformers.trainer:  Instantaneous batch size per device = 128
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 128
INFO:transformers.trainer:  Gradient Accumulation steps = 1
INFO:transformers.trainer:  Total optimization steps = 2300


HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=460, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.0014368480962255728, "learning_rate": 4.9978260869565216e-05, "epoch": 0.002173913043478261, "step": 1}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

{"eval_loss": 0.6584379196166992, "eval_acc": 0.5920398009950248, "epoch": 0.002173913043478261, "step": 1}


INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.07724789623677245, "learning_rate": 4e-05, "epoch": 1.0, "step": 460}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/DP_EDA/checkpoint-460
INFO:transformers.configuration_utils:Configuration saved in ./models/DP_EDA/checkpoint-460/config.json


{"eval_loss": 0.5746870756149292, "eval_acc": 0.8971807628524047, "epoch": 1.0, "step": 460}


INFO:transformers.modeling_utils:Model weights saved in ./models/DP_EDA/checkpoint-460/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=460, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.0040636494875640064, "learning_rate": 3e-05, "epoch": 2.0, "step": 920}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/DP_EDA/checkpoint-920
INFO:transformers.configuration_utils:Configuration saved in ./models/DP_EDA/checkpoint-920/config.json


{"eval_loss": 0.5140550553798675, "eval_acc": 0.9237147595356551, "epoch": 2.0, "step": 920}


INFO:transformers.modeling_utils:Model weights saved in ./models/DP_EDA/checkpoint-920/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=460, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.0015376685281808023, "learning_rate": 2e-05, "epoch": 3.0, "step": 1380}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/DP_EDA/checkpoint-1380
INFO:transformers.configuration_utils:Configuration saved in ./models/DP_EDA/checkpoint-1380/config.json


{"eval_loss": 0.520938640832901, "eval_acc": 0.9286898839137645, "epoch": 3.0, "step": 1380}


INFO:transformers.modeling_utils:Model weights saved in ./models/DP_EDA/checkpoint-1380/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=460, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.00019834798336966703, "learning_rate": 1e-05, "epoch": 4.0, "step": 1840}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/DP_EDA/checkpoint-1840
INFO:transformers.configuration_utils:Configuration saved in ./models/DP_EDA/checkpoint-1840/config.json


{"eval_loss": 0.636886739730835, "eval_acc": 0.9220563847429519, "epoch": 4.0, "step": 1840}


INFO:transformers.modeling_utils:Model weights saved in ./models/DP_EDA/checkpoint-1840/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=460, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.00018896343141497473, "learning_rate": 0.0, "epoch": 5.0, "step": 2300}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/DP_EDA/checkpoint-2300
INFO:transformers.configuration_utils:Configuration saved in ./models/DP_EDA/checkpoint-2300/config.json


{"eval_loss": 0.628925359249115, "eval_acc": 0.9203980099502488, "epoch": 5.0, "step": 2300}


INFO:transformers.modeling_utils:Model weights saved in ./models/DP_EDA/checkpoint-2300/pytorch_model.bin
INFO:transformers.trainer:

Training completed. Do not forget to share your model on huggingface.co/models =)







TrainOutput(global_step=2300, training_loss=0.016934674752705495)

In [25]:
# Evaluate the final (last-epoch) weights on the dev set and keep the metrics.
result = trainer.evaluate()

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…


{"eval_loss": 0.628925359249115, "eval_acc": 0.9203980099502488, "epoch": 5.0, "step": 2300}
