## Humor detection challenge from Kaggle

In [None]:
!wget https://github.com/ravi-ilango/acm-dec-2020-nlp/blob/main/lab5/humor_data.zip?raw=true -O humor_data.zip

!unzip humor_data.zip

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import os 

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split


## LAB


- Use the disaster_detection_bert as a reference 

- Load input data, tokenize and prepare tensors for training

- Instantiate a pretrained XLNet model

- Predict humor for a few sentences

Note: Use the first 10K rows of the dataset,
      a train/validation split of 70/30,
      a batch_size of 32,
      and train for 3 epochs.

### Check the sample code from Hugging Face illustrating the use of XLNet
[XLNetForSequenceClassification documentation](https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforsequenceclassification)

In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
# AdamW comes from PyTorch: the copy that used to ship with transformers is
# deprecated in favor of torch.optim.AdamW and is not importable from recent
# transformers releases.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01 (the transformers
# version defaulted to 0.0) -- pass weight_decay explicitly if that matters.
from torch.optim import AdamW

# Smoke test of the API used in this lab: one sentence, one label, one forward
# pass through the pretrained checkpoint.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
# Labels are passed along with the encodings so the output carries a loss
# in addition to the logits.
outputs = model(**inputs, labels=labels)

loss = outputs.loss
logits = outputs.logits

### Load data


In [None]:
# Read the humor dataset and keep only its first 10,000 rows.
#   text   -> values of the `text` column (the raw sentences)
#   labels -> 0/1 targets derived from the `humor` column (1 when truthy)
df = pd.read_csv('./humor_data/dataset.csv').head(10000)

text = df['text'].values
labels = df['humor'].map(lambda flag: int(bool(flag))).values

print(text.shape)

### Plot target distribution

In [None]:

%matplotlib inline

plt.hist(labels)
plt.xlabel('humor')
plt.ylabel('count')
plt.title('target distribution')
plt.xticks(np.arange(len(np.unique(labels))));


### Prepare Data


In [None]:
# Tokenize with XLNet tokenizer.
# 'xlnet-base-cased' is the *cased* checkpoint, so the text keeps its original
# casing (no do_lower_case) -- the same tokenizer configuration as the sample
# cell that first loads 'xlnet-base-cased'.
# NOTE: weights saved from an earlier run that lower-cased its input were
# trained on different token ids; retrain rather than reuse them.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# padding=True pads every sequence to the length of the longest one in the
# batch, so all rows of input_ids end up with the same length.
padded_sequences = tokenizer(list(text), padding=True)
print (f"tokenized inputs {padded_sequences['input_ids'][0]}")


In [None]:
# Show which fields the tokenizer call returned for the batch.
padded_sequences.keys()

### Split into training and validation datasets

In [None]:
# include <your code>



### Convert to tensors

In [None]:
# include <your code>


### Prepare training data generators

In [None]:
# include <your code>



### Load pretrained XLNet model, setup optimizer


In [None]:
# Use the GPU when PyTorch can see one (and report which); otherwise fall back
# to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print (torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

# include <your code>


In [None]:
model_path = './xlnet_humor_detection_state_dict.pth'

# Warm start: when a checkpoint file is present, copy its weights into the
# model (tensors are mapped onto `device` while the file is read).
if os.path.exists(model_path):
    print ("Loading weights from saved model...")
    _saved_state = torch.load(model_path, map_location=device)
    model.load_state_dict(_saved_state)
    del _saved_state  # do not keep a second copy of the weights alive

### Model Train function 

In [None]:
# include <your code>


### Model Evaluate function

In [None]:
# include <your code>



### Train the model

In [None]:
from tqdm import trange

# include <your code>



### Predict

In [None]:
# Pick up saved weights from disk when a checkpoint file exists.
if os.path.exists(model_path):
    print ("Loading weights from saved model...")
    _saved_state = torch.load(model_path, map_location=device)
    model.load_state_dict(_saved_state)
    del _saved_state  # do not keep a second copy of the weights alive

# The predictions below run on the CPU with the model in evaluation mode.
model.to('cpu').eval()

def predict(model, sentence):
    """Classify a single sentence as humor or not.

    Args:
        model: callable that accepts the tokenizer's encodings as keyword
            arguments and returns an object exposing ``.logits``.
        sentence: the raw text to classify.

    Returns:
        'Humor' when the highest-scoring class index is 1, otherwise
        'Not a humor'.

    Uses the notebook-level ``tokenizer`` to encode the sentence.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        output = model(**inputs)
    logits = output.logits.detach().cpu().numpy()

    # Index of the largest logit for the single sentence in the batch.
    pred = np.argmax(logits, axis=1)[0]
    return 'Humor' if pred == 1 else 'Not a humor'

In [None]:
# Example 1: a question-and-punchline sentence.
predict(model, sentence="Where do eskimos keep their money? in snowbanks.")


In [None]:
# Example 2: another question-and-punchline sentence.
predict(model, sentence="How many optometrists does it take to screw in a lightbulb? one... or two? one... or two?")


In [None]:
# Example 3: a news-style headline (presumably a non-humor case -- confirm against the output).
predict(model, sentence="Mcdonald's will officially kick off all-day breakfast on october 6")

In [None]:
# Example 4: another headline-style sentence (presumably a non-humor case -- confirm against the output).
predict(model, sentence="The journey to labor land: how women can reclaim their birth rights")