## Imports and Setup

In [1]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    set_seed,
    DataCollator
)
from typing import Callable, Dict, Optional, List
import logging
import os
import random
import time
import math
from dataclasses import dataclass, field
import numpy as np
from torch.utils.tensorboard import SummaryWriter


# Show INFO-level log records; the transformers messages in the cell outputs below rely on this.
logging.basicConfig(level=logging.INFO)


# Fix the seed via transformers' set_seed helper before any model or data work happens.
set_seed(37)

## Hyperparameters

In [2]:
# Sub-directory of ./models/ that receives the training checkpoints.
SAVED_DIR_NAME = 'parapragh_decomposition'
# Pretrained checkpoint name used for both the tokenizer and the classifier.
PRETRAINED_MODEL = 'bert-base-uncased'
# Number of output classes requested for the classification head.
NUM_LABELS = 2
# NOTE(review): not referenced anywhere else in the visible notebook.
PAD_MAX_LEN = 65
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 128
# Number of training epochs.
MAX_EPOCH = 5

## instance

In [3]:
# Load the pretrained tokenizer and a BERT sequence classifier with NUM_LABELS output classes.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=NUM_LABELS)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/tidarren1020/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/tidarren1020/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,

In [4]:
# short_jokes_path = './RedditHumorDetection/full_datasets/short_jokes/data/shortjokes.csv'
# short_jokes_train_path = './RedditHumorDetection/data/short_jokes/train.tsv'
# short_jokes_test_path = './RedditHumorDetection/data/short_jokes/test.tsv'
# short_jokes_dev_path = './RedditHumorDetection/data/short_jokes/dev.tsv'

## Dataset

In [5]:
# Locations of the puns splits. Only the train and dev paths are used in the visible notebook.
puns_train_path = './RedditHumorDetection/data/puns/train.tsv'
puns_test_path = './RedditHumorDetection/data/puns/test.tsv'
puns_dev_path = './RedditHumorDetection/data/puns/dev.tsv'

## Data Augmentation: Paragraph Decomposition

In [6]:
def load_data(data_path):
    """Read a header-less data file and keep only its label and text columns.

    The file is parsed with ``pandas.read_csv`` using its default separator and
    the column names ``id``, ``label``, ``a`` and ``text``.

    Args:
        data_path: path (or anything ``pandas.read_csv`` accepts) of the file.

    Returns:
        pandas.DataFrame with the two columns ``label`` and ``text``.
    """
    column_names = ['id', 'label', 'a', 'text']
    frame = pd.read_csv(data_path, names=column_names, header=None)
    return frame[['label', 'text']]

In [7]:
def paragraph_decmposition(data_path):
    """Augment a data set by splitting every text into (prefix, suffix) pairs.

    Each text is split on whitespace; for every cut position from 2 up to (but
    not including) the number of tokens, one record is produced whose
    ``text_a`` is the tokens before the cut and ``text_b`` the tokens from the
    cut onward. The record inherits the row's label and remembers the row's
    index as ``origin_id``. Texts with two tokens or fewer yield no records.

    Label counts before and after augmentation are printed.

    Args:
        data_path: path handed to ``load_data``.

    Returns:
        pandas.DataFrame built from the augmented records.
    """
    source = load_data(data_path)

    records = []
    for origin_id, row in source.iterrows():
        words = row['text'].split()
        label = row['label']
        # One record per admissible cut point; the prefix always has >= 2 tokens.
        records.extend(
            {'text_a': ' '.join(words[:cut]),
             'text_b': ' '.join(words[cut:]),
             'label': label,
             'origin_id': origin_id}
            for cut in range(2, len(words))
        )

    augmented = pd.DataFrame(records)

    print('=== Data Augmentation: paragraph decmposition ===')
    for heading, frame in (('[Before]', source), ('\n[After]', augmented)):
        print(heading)
        for value in (0, 1):
            print('# of label={}:'.format(value), sum(frame.label == value))
    print('=== end ===')
    return augmented

In [8]:
class HumorDataset(Dataset):
    """Dataset that tokenizes rows of a DataFrame on access.

    In ``'train'`` mode each row must provide ``text_a``, ``text_b`` and
    ``label``; the two texts are encoded together as a pair. In any other mode
    each row must provide ``text`` and ``label``; the text is encoded alone.

    Args:
        df: pandas.DataFrame holding the rows described above.
        tokenizer: object exposing ``encode_plus(text_a[, text_b])`` that
            returns a dict-like encoding.
        mode: ``'train'`` for pair inputs, anything else for single texts.
    """

    # Store the already-loaded frame and initialise a few parameters.
    def __init__(self, df, tokenizer, mode='train'):
        self.df = df
        self.len = len(self.df)
        self.tokenizer = tokenizer  # the notebook passes the BERT tokenizer here
        self.mode = mode

    # Return a single training / evaluation example.
    def __getitem__(self, idx):
        def cell(column):
            # Positional lookup: samplers hand out positions 0..len-1, so this
            # works for any DataFrame index and raises IndexError past the end.
            return self.df[column].iloc[idx]

        # Use the tokenizer given to the constructor, not a module-level global.
        if self.mode == 'train':
            inputDict = self.tokenizer.encode_plus(cell('text_a'), cell('text_b'))
        else:
            inputDict = self.tokenizer.encode_plus(cell('text'))

        inputDict['label'] = cell('label')

        return inputDict

    def __len__(self):
        return self.len

In [9]:
def pad_seq(seq, max_batch_len, pad_value):
    """Return a new list: ``seq`` right-padded with ``pad_value`` to ``max_batch_len``.

    If ``seq`` is already at least ``max_batch_len`` long, an unpadded copy is
    returned (nothing is truncated).
    """
    missing = max_batch_len - len(seq)
    filler = [pad_value] * missing  # a non-positive count yields an empty list
    return seq + filler

class Collator(DataCollator):
    """Collator that pads every example of a batch to the batch's longest input.

    Args:
        pad_token_id: value used to pad ``input_ids``; attention masks and
            token type ids are padded with 0.
    """

    def __init__(self, pad_token_id):
        self.pad_token_id = pad_token_id

    def collate_batch(self, batch):
        """Turn a list of encoded examples into a dict of ``torch.long`` tensors.

        Each example must provide ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``label``. The result has the keys
        ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``labels``.
        """
        longest = max([len(example['input_ids']) for example in batch])

        def padded(field, fill):
            # Pad one field of every example to the common length.
            return [pad_seq(example[field], longest, fill) for example in batch]

        token_ids = padded('input_ids', self.pad_token_id)
        masks = padded('attention_mask', 0)
        segments = padded('token_type_ids', 0)
        targets = [example['label'] for example in batch]

        return {"input_ids": torch.tensor(token_ids, dtype=torch.long),
                "attention_mask": torch.tensor(masks, dtype=torch.long),
                "token_type_ids": torch.tensor(segments, dtype=torch.long),
                "labels": torch.tensor(targets, dtype=torch.long)
                }

In [10]:
# Training data: every (prefix, suffix) split of the training texts.
df_puns_train_dataAug = paragraph_decmposition(puns_train_path)
# Dev data: the original, un-split texts.
df_puns_dev = load_data(puns_dev_path)

# Default mode ('train') encodes text_a/text_b pairs; mode='dev' encodes the single 'text' column.
train_dataset = HumorDataset(df_puns_train_dataAug, tokenizer)
eval_dataset = HumorDataset(df_puns_dev, tokenizer, mode='dev')

=== Data Augmentation: paragraph decmposition ===
[Before]
# of label=0: 1810
# of label=1: 1809

[After]
# of label=0: 21403
# of label=1: 20278
=== end ===


## Trainer

In [11]:
def compute_metrics(p: EvalPrediction):
    """Compute classification accuracy for the Trainer.

    Args:
        p: object whose ``predictions`` holds per-class scores (one row per
            example) and whose ``label_ids`` holds the reference labels.

    Returns:
        dict with the single key ``"acc"``: the fraction of rows whose
        highest-scoring class equals the reference label.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    hits = predicted_classes == p.label_ids
    return {"acc": hits.mean()}

In [12]:
# Number of optimisation steps in one pass over the training set (last partial batch included).
NUM_TRAINSET = len(train_dataset)
LOGGING_STEPS = math.ceil(NUM_TRAINSET/BATCH_SIZE)

# Logging and checkpoint saving both use LOGGING_STEPS, i.e. they happen once per epoch.
training_args = TrainingArguments(
    output_dir="./models/{}".format(SAVED_DIR_NAME),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=MAX_EPOCH,
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,
    save_steps=LOGGING_STEPS,
    evaluate_during_training=True,
    logging_dir="./logs",
    #learning_rate=2e-5,
)

In [13]:
# Wire model, data sets, padding collator and accuracy metric into the Trainer.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # Pad with the tokenizer's own pad id so padding matches the vocabulary.
        data_collator=Collator(pad_token_id=tokenizer.pad_token_id),
        compute_metrics=compute_metrics,
        # TensorBoard events go to 'logs', flushed every 10 seconds.
        tb_writer=SummaryWriter(log_dir='logs', flush_secs=10),
    )

INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


## train

In [14]:
# Fine-tune the classifier; the cell output is the returned TrainOutput.
trainer.train()

INFO:transformers.trainer:***** Running training *****
INFO:transformers.trainer:  Num examples = 41681
INFO:transformers.trainer:  Num Epochs = 5
INFO:transformers.trainer:  Instantaneous batch size per device = 128
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 128
INFO:transformers.trainer:  Gradient Accumulation steps = 1
INFO:transformers.trainer:  Total optimization steps = 1630


HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=326, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.00234516449501178, "learning_rate": 4.996932515337424e-05, "epoch": 0.003067484662576687, "step": 1}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

{"eval_loss": 0.6520037651062012, "eval_acc": 0.6517412935323383, "epoch": 0.003067484662576687, "step": 1}


INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.08084941734816947, "learning_rate": 4e-05, "epoch": 1.0, "step": 326}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/parapragh_decomposition/checkpoint-326
INFO:transformers.configuration_utils:Configuration saved in ./models/parapragh_decomposition/checkpoint-326/config.json


{"eval_loss": 0.45084678530693056, "eval_acc": 0.9187396351575456, "epoch": 1.0, "step": 326}


INFO:transformers.modeling_utils:Model weights saved in ./models/parapragh_decomposition/checkpoint-326/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=326, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.0023290376606817996, "learning_rate": 3e-05, "epoch": 2.0, "step": 652}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/parapragh_decomposition/checkpoint-652
INFO:transformers.configuration_utils:Configuration saved in ./models/parapragh_decomposition/checkpoint-652/config.json


{"eval_loss": 0.5679295063018799, "eval_acc": 0.9137645107794361, "epoch": 2.0, "step": 652}


INFO:transformers.modeling_utils:Model weights saved in ./models/parapragh_decomposition/checkpoint-652/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=326, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.0008036018569584674, "learning_rate": 2e-05, "epoch": 3.0, "step": 978}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/parapragh_decomposition/checkpoint-978
INFO:transformers.configuration_utils:Configuration saved in ./models/parapragh_decomposition/checkpoint-978/config.json


{"eval_loss": 0.6377912640571595, "eval_acc": 0.9170812603648425, "epoch": 3.0, "step": 978}


INFO:transformers.modeling_utils:Model weights saved in ./models/parapragh_decomposition/checkpoint-978/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=326, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 0.00028497528941766933, "learning_rate": 1e-05, "epoch": 4.0, "step": 1304}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/parapragh_decomposition/checkpoint-1304
INFO:transformers.configuration_utils:Configuration saved in ./models/parapragh_decomposition/checkpoint-1304/config.json


{"eval_loss": 0.6359874367713928, "eval_acc": 0.9220563847429519, "epoch": 4.0, "step": 1304}


INFO:transformers.modeling_utils:Model weights saved in ./models/parapragh_decomposition/checkpoint-1304/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Iteration', max=326, style=ProgressStyle(description_width='i…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


{"loss": 9.977922872741966e-05, "learning_rate": 0.0, "epoch": 5.0, "step": 1630}


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…

INFO:transformers.trainer:Saving model checkpoint to ./models/parapragh_decomposition/checkpoint-1630
INFO:transformers.configuration_utils:Configuration saved in ./models/parapragh_decomposition/checkpoint-1630/config.json


{"eval_loss": 0.6318127870559692, "eval_acc": 0.9220563847429519, "epoch": 5.0, "step": 1630}


INFO:transformers.modeling_utils:Model weights saved in ./models/parapragh_decomposition/checkpoint-1630/pytorch_model.bin
INFO:transformers.trainer:

Training completed. Do not forget to share your model on huggingface.co/models =)







TrainOutput(global_step=1630, training_loss=0.01734239517579332)

In [15]:
# Final evaluation on eval_dataset; the returned metrics are kept in `result`.
result = trainer.evaluate()

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 603
INFO:transformers.trainer:  Batch size = 128


HBox(children=(IntProgress(value=0, description='Evaluation', max=5, style=ProgressStyle(description_width='in…


{"eval_loss": 0.6318127870559692, "eval_acc": 0.9220563847429519, "epoch": 5.0, "step": 1630}


## Find out Punchline

In [88]:
def collate_fn(batch, pad_token_id=0):
    """Pad a list of encoded examples to a common length and stack them as tensors.

    Function counterpart of ``Collator.collate_batch`` for use with a plain
    ``DataLoader``.

    Args:
        batch: list of dicts, each providing ``input_ids``, ``attention_mask``,
            ``token_type_ids`` (lists) and ``label``.
        pad_token_id: value used to pad ``input_ids``. Defaults to 0, the value
            this function previously hard-coded; bind another value with
            ``functools.partial`` when the tokenizer's pad id differs.

    Returns:
        dict of ``torch.long`` tensors with the keys ``input_ids``,
        ``attention_mask``, ``token_type_ids`` and ``labels``.
    """
    max_size = max([len(ex['input_ids']) for ex in batch])

    def padded(field, fill):
        # Pad one field of every example to the longest input in the batch.
        return [pad_seq(item[field], max_size, fill) for item in batch]

    batch_inputs = padded('input_ids', pad_token_id)
    # Padding positions are masked out (0) and assigned to segment 0.
    batch_attention_masks = padded('attention_mask', 0)
    batch_token_type_ids = padded('token_type_ids', 0)
    labels = [item['label'] for item in batch]

    return {"input_ids": torch.tensor(batch_inputs, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long),
            "token_type_ids": torch.tensor(batch_token_type_ids, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
            }

In [185]:
def get_punchline(model, dataloader, compute_acc=False):
    """Return the position of the example the model scores as most likely label 1.

    Every batch from ``dataloader`` is run through ``model`` in evaluation mode
    and without gradient tracking. The softmax probability of class 1 is
    collected for each example, and the position of the largest probability is
    returned. Positions are counted across all batches in iteration order, so
    the result is correct whether the data arrives in one batch or several.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids=...,
            token_type_ids=..., attention_mask=...)`` whose first output holds
            per-example logits with at least two columns.
        dataloader: iterable of dicts of tensors providing ``input_ids``,
            ``token_type_ids`` and ``attention_mask``.
        compute_acc: unused; kept so existing calls keep working.

    Returns:
        int: index of the example with the highest class-1 probability.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    # Follow the model wherever it lives instead of assuming "cuda:0".
    device = next(model.parameters()).device

    # Dropout must be off for deterministic scores; remember the previous mode
    # so the caller's model is handed back in the state it arrived in.
    was_training = model.training
    model.eval()

    class_one_probs = []
    try:
        with torch.no_grad():
            # Walk through the whole data set.
            for data in dataloader:
                # Move every tensor to the model's device.
                data = {key: val.to(device) for key, val in data.items() if val is not None}

                outputs = model(input_ids=data['input_ids'],
                                token_type_ids=data['token_type_ids'],
                                attention_mask=data['attention_mask'])

                logits = outputs[0]
                # Column 1 of the row-wise softmax is P(label == 1).
                class_one_probs.append(torch.softmax(logits, dim=1)[:, 1])
    finally:
        model.train(was_training)

    if not class_one_probs:
        raise ValueError('dataloader yielded no batches')

    return int(torch.argmax(torch.cat(class_one_probs)))

In [200]:
from tqdm import tqdm

In [216]:
def id2dict(_id):
    """Pick, for one original text, the split the model rates most likely label 1.

    All augmented rows of ``df_puns_train_dataAug`` whose ``origin_id`` equals
    ``_id`` are scored in a single batch by the notebook-level ``model``; the
    best-scoring row is returned as a plain dict.

    Args:
        _id: ``origin_id`` value identifying the original text.

    Returns:
        dict with the keys ``label``, ``text_a``, ``text_b``, ``origin_id``
        (copied from the chosen row) and ``punchline_idx`` (position of the
        chosen row among the rows for ``_id``).

    Raises:
        ValueError: if no augmented rows exist for ``_id``.
    """
    df_tmp = df_puns_train_dataAug[df_puns_train_dataAug.origin_id==_id].reset_index(drop=True)
    if df_tmp.empty:
        # Fail here with the offending id instead of letting DataLoader reject
        # batch_size=0 with a message that hides which text was affected.
        raise ValueError('no augmented rows for origin_id {}'.format(_id))

    dataset_tmp = HumorDataset(df_tmp, tokenizer)
    # One batch holding every split of this text, so scores are directly comparable.
    dataloader = DataLoader(dataset_tmp, batch_size=len(dataset_tmp),
                            collate_fn=collate_fn)

    punchline_idx = get_punchline(model, dataloader, compute_acc=True)

    dataDict = {column: df_tmp.loc[punchline_idx, column]
                for column in ('label', 'text_a', 'text_b', 'origin_id')}
    dataDict['punchline_idx'] = punchline_idx
    return dataDict

In [222]:
%%time

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

data_punchline = []

# Iterate over the origin ids actually present in the augmented frame. The
# previous loop ranged over len(df_puns_train), a name no cell in this notebook
# defines, so it failed on a fresh kernel. unique() keeps first-appearance
# order, which is the original row order, and texts too short to have produced
# any split are skipped.
for _id in df_puns_train_dataAug.origin_id.unique():
    dataDict = id2dict(_id)
    data_punchline.append(dataDict)

CPU times: user 48.4 s, sys: 1.12 s, total: 49.6 s
Wall time: 49.5 s


In [223]:
# One row per original text: the split chosen by id2dict.
df_puns_punchline = pd.DataFrame(data_punchline)
df_puns_punchline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3619 entries, 0 to 3618
Data columns (total 5 columns):
label            3619 non-null int64
origin_id        3619 non-null int64
punchline_idx    3619 non-null int64
text_a           3619 non-null object
text_b           3619 non-null object
dtypes: int64(3), object(2)
memory usage: 141.4+ KB


In [227]:
# Preview the first few selected splits.
df_puns_punchline.head()

Unnamed: 0,label,origin_id,punchline_idx,text_a,text_b
0,0,0,7,I m hoping they ll come and see this,and say We have to have this
1,1,1,15,a man who cannot read the sign that warns peop...,illiterate
2,1,2,8,i fired the floor refinishers they simply coul...,their lacquer
3,1,3,6,an elevator makes ghosts happy because it lifts,the spirits
4,1,4,4,the first drinking establishment in alaska,was a polar bar


### Output augmented data

In [226]:
# Persist the selected split per text, without the DataFrame index column.
df_puns_punchline.to_csv('df_puns_punchline.csv', index=False)

In [229]:
# Persist the full set of augmented (prefix, suffix) rows, without the DataFrame index column.
df_puns_train_dataAug.to_csv('df_puns_pd.csv',index=False)