In [1]:
!pip install transformers==3

Collecting transformers==3
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |▍                               | 10kB 17.8MB/s eta 0:00:01[K     |▉                               | 20kB 17.1MB/s eta 0:00:01[K     |█▎                              | 30kB 10.9MB/s eta 0:00:01[K     |█▊                              | 40kB 7.5MB/s eta 0:00:01[K     |██▏                             | 51kB 8.0MB/s eta 0:00:01[K     |██▋                             | 61kB 8.3MB/s eta 0:00:01[K     |███                             | 71kB 9.4MB/s eta 0:00:01[K     |███▌                            | 81kB 9.0MB/s eta 0:00:01[K     |████                            | 92kB 7.9MB/s eta 0:00:01[K     |████▍                           | 102kB 8.6MB/s eta 0:00:01[K     |████▊                           | 112kB 8.6MB/s eta 0:00:01[K     |█████▏                          | 122kB

In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetModel, XLNetTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
# Setting up GPU

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

print (device)

cuda


In [4]:
df = pd.read_csv('Sarcasm_Final_Dataset_Benchmark - Sheet1.csv')
df.head()

Unnamed: 0,article_link,Text,Label
0,https://www.huffingtonpost.com/entry/trump-par...,donald trump insists he has the 'complete powe...,0
1,https://entertainment.theonion.com/woman-who-a...,woman who admits to having watched golden glob...,1
2,https://entertainment.theonion.com/eva-longori...,eva longoria tans self out of visible spectrum,1
3,https://www.theonion.com/horrified-pope-calls-...,horrified pope calls philadelphia humanity's g...,1
4,https://www.huffingtonpost.com/entry/the-enigm...,the enigmatic art of josef koudelka,0


In [5]:
df.describe()

Unnamed: 0,Label
count,3999.0
mean,0.574894
std,0.494421
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [6]:
#Preparing Dataset and Dataloader

# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', truncation=True, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




In [7]:
class TweetData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Text
        self.targets = self.data.Label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [8]:
train_size = 0.8
train_data=df.sample(frac=train_size,random_state=200)
test_data=df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = TweetData(train_data, tokenizer, MAX_LEN)
testing_set = TweetData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (3999, 3)
TRAIN Dataset: (3199, 3)
TEST Dataset: (800, 3)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
#Base XLNET model
class XLNETClass(torch.nn.Module):
    def __init__(self):
        super(XLNETClass, self).__init__()
        self.l1 = XLNetModel.from_pretrained("xlnet-base-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
model = XLNETClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




XLNETClass(
  (l1): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [12]:
#Finetuning XLNETClass model

# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [13]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [14]:
# Defining the training function on the 80% of the dataset

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [15]:
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Training Loss per 5000 steps: 1.6831034421920776
Training Accuracy per 5000 steps: 37.5


400it [02:27,  2.70it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 0: 60.05001562988434
Training Loss Epoch: 0.7248537065088749
Training Accuracy Epoch: 60.05001562988434
Training Loss per 5000 steps: 0.6286426782608032
Training Accuracy per 5000 steps: 62.5


400it [02:27,  2.70it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 1: 78.99343544857769
Training Loss Epoch: 0.4596515394747257
Training Accuracy Epoch: 78.99343544857769
Training Loss per 5000 steps: 0.19663070142269135
Training Accuracy per 5000 steps: 87.5


400it [02:28,  2.70it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 2: 86.2144420131291
Training Loss Epoch: 0.3258100157778244
Training Accuracy Epoch: 86.2144420131291
Training Loss per 5000 steps: 0.35777127742767334
Training Accuracy per 5000 steps: 75.0


400it [02:27,  2.70it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 3: 91.37230384495155
Training Loss Epoch: 0.21705983938998544
Training Accuracy Epoch: 91.37230384495155
Training Loss per 5000 steps: 0.0916532576084137
Training Accuracy per 5000 steps: 100.0


400it [02:28,  2.70it/s]

The Total Accuracy for Epoch 4: 94.46702094404502
Training Loss Epoch: 0.14154517507879064
Training Accuracy Epoch: 94.46702094404502





In [16]:
#Testing the trained model

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [17]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

0it [00:00, ?it/s]

Validation Loss per 100 steps: 0.0007678581168875098
Validation Accuracy per 100 steps: 100.0


200it [00:12, 15.56it/s]

Validation Loss Epoch: 0.3889051413959532
Validation Accuracy Epoch: 88.625
Accuracy on test data = 88.62%



