In [1]:
!pip3 install transformers==3.5.1

Collecting transformers==3.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 8.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 52.6MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 50.4MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB

# Import Packages

In [2]:
import json
import re
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification

# Select the compute device: use the GPU when PyTorch can see one, otherwise
# fall back to the CPU so later `.to(device)` calls do not fail on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Cleaning

In [3]:
""" Remove special characters and numbers """

def clean_text(x):
    """Return ``x`` with every innermost parenthesised group removed.

    A group is an opening parenthesis, any run of characters that are not
    parentheses, and a closing parenthesis; the brackets are removed together
    with their content.  The substitution runs once, so for nested
    parentheses only the innermost group disappears.
    """
    innermost_parens = re.compile(r'\([^()]*\)')
    return innermost_parens.sub('', x)

def clean_numbers(x):
    """Mask runs of ASCII digits in ``x`` with ``#`` characters.

    Runs of five or more digits collapse to ``#####``; runs of exactly four,
    three or two digits become the same number of ``#``.  A lone digit is
    left as it is.
    """
    # Nothing to mask when the text contains no digit at all.
    if re.search(r'\d', x) is None:
        return x

    # Longest runs first, so each shorter pattern only sees what is left over.
    masking_steps = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    masked = x
    for digit_run, mask in masking_steps:
        masked = re.sub(digit_run, mask, masked)
    return masked

# Data Loading

In [8]:
""" Load Dataset and Split train dataset into train, validation and test sets"""

def _iter_jsonl(path):
    """Yield one decoded JSON object for every non-blank line of ``path``."""
    # Binary mode is kept from the original code; json.loads accepts bytes.
    with open(path, 'rb') as infile:
        # Stream the file line by line instead of materialising it with readlines().
        for line in infile:
            if line.strip():  # tolerate blank lines such as a trailing newline
                yield json.loads(line)


def _entry_to_text(entry):
    """Build the model input for one record: the response, then the context turns, cleaned."""
    joined = entry['response'] + ' ' + ' '.join(entry['context'])
    return clean_numbers(clean_text(joined))


# Labelled examples: 1 for 'SARCASM', 0 for any other label value.
train_text = []
train_labels = []
for entry in _iter_jsonl('/content/data/train.jsonl'):
    train_text.append(_entry_to_text(entry))
    train_labels.append(1 if entry['label'] == 'SARCASM' else 0)

# Hold out 10% of the labelled data for validation; the split is stratified on
# the labels and seeded so it is reproducible.
train_text, val_text, train_labels, val_labels = train_test_split(train_text, train_labels,
                                                                    random_state=2020,
                                                                    test_size=0.1,
                                                                    stratify=train_labels)

# Test examples: no label is read from the file, so every entry gets the
# placeholder label 1; the ids are kept for writing the answer file.
test_text = []
test_labels = []
test_ids = []
for entry in _iter_jsonl('/content/data/test.jsonl'):
    test_text.append(_entry_to_text(entry))
    test_labels.append(1)
    test_ids.append(entry['id'])

# Import BERT Model and BERT Tokenizer

In [9]:
# Load the pretrained 'bert-base-uncased' checkpoint with a two-label
# sequence-classification head; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Fast tokenizer for the same checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




# Tokenization

In [10]:
# Upper bound on the number of tokens kept per example.
max_seq_len = 40


def _encode(texts):
    """Tokenize and encode ``texts`` with the settings shared by every split."""
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_seq_len,
        padding=True,
        truncation=True,
        return_token_type_ids=False,
    )


# Encode the training, validation and test texts with identical settings.
tokens_train = _encode(train_text)
tokens_val = _encode(val_text)
tokens_test = _encode(test_text)

# Convert Integer Sequences to Tensors

In [11]:
# Training split: token ids and attention mask from the encoding, plus the labels.
train_seq, train_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
train_y = torch.tensor(train_labels)

# Validation split.
val_seq, val_mask = (
    torch.tensor(tokens_val[field]) for field in ('input_ids', 'attention_mask')
)
val_y = torch.tensor(val_labels)

# Test split (its labels are the placeholders assigned when the file was loaded).
test_seq, test_mask = (
    torch.tensor(tokens_test[field]) for field in ('input_ids', 'attention_mask')
)
test_y = torch.tensor(test_labels)

# Create DataLoaders

In [12]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch, shared by both loaders.
batch_size = 32

# Bundle ids, masks and labels so each dataset item is one (ids, mask, label) triple.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training examples are drawn with a random sampler; validation examples with a
# sequential one.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# Loaders that yield batches of `batch_size` examples using the samplers above.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

# Initialize model, optimizer, loss function

In [13]:
# Move the model's parameters to the selected device.
model = model.to(device)

# optimizer from hugging face transformers
from transformers import AdamW

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Find Class Weights

from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights computed from the training labels.  The arguments
# are passed by keyword because newer scikit-learn releases no longer accept
# `classes` and `y` positionally; the keyword form also works on older releases.
class_wts = compute_class_weight(class_weight='balanced',
                                 classes=np.unique(train_labels),
                                 y=train_labels)

# Convert the class weights to a float tensor on the same device as the model.
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)

# Weighted loss criterion.  The model emits raw, unnormalised scores, so
# CrossEntropyLoss (which applies log-softmax itself) is the matching criterion;
# NLLLoss would require log-probabilities as input.
# NOTE(review): train() and evaluate() use the loss returned by the model
# itself, so this criterion only takes effect where it is called explicitly.
cross_entropy = nn.CrossEntropyLoss(weight=weights)

# Number of passes over the training data.
epochs = 5

# Function for Training


In [16]:
"""# Fine-Tune BERT"""

# function for training the model
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``device`` and
    ``train_dataloader``.  For every batch it computes the loss with the
    model's own labelled forward pass, back-propagates, clips the gradient
    norm to 1.0 and takes one optimizer step.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the summed
        batch loss divided by the number of batches, and ``total_preds`` is a
        NumPy array holding the model's second output for every batch,
        concatenated along axis 0.
    """
    print("\nTraining Process Start...")
    model.train()

    total_loss = 0

    # per-batch model predictions, concatenated after the loop
    total_preds = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 40 batches
        if step % 40 == 0 and not step == 0:
            print('\nBatch round {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # move the batch tensors to the model's device
        sent_id, mask, labels = [r.to(device) for r in batch]

        # clear previously calculated gradients
        model.zero_grad()

        # Forward pass with labels, so the model also returns the loss.
        # The output is indexed rather than tuple-unpacked so this keeps working
        # if the model returns a dict-like output object instead of a plain tuple.
        outputs = model(sent_id,
                        token_type_ids=None,
                        attention_mask=mask,
                        labels=labels)
        loss, preds = outputs[0], outputs[1]

        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # detach from the graph and copy to host memory as a NumPy array
        total_preds.append(preds.detach().cpu().numpy())

    # mean batch loss for the epoch
    avg_loss = total_loss / len(train_dataloader)

    # join the per-batch arrays into a single array along the first axis
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

# Function for Evaluating

In [17]:
# function for evaluating the model
def evaluate():
    """Run one evaluation pass over ``val_dataloader`` without updating the model.

    Uses the module-level ``model``, ``device`` and ``val_dataloader``.  The
    model is switched to evaluation mode and every forward pass runs under
    ``torch.no_grad()``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the summed
        batch loss divided by the number of batches, and ``total_preds`` is a
        NumPy array holding the model's second output for every batch,
        concatenated along axis 0.
    """
    print("\nEvaluating Process Start...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # per-batch model predictions, concatenated after the loop
    total_preds = []

    for step, batch in enumerate(val_dataloader):

        # progress update every 40 batches
        if step % 40 == 0 and not step == 0:
            print('  Batch Time {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # move the batch tensors to the model's device
        sent_id, mask, labels = [t.to(device) for t in batch]

        # no gradients are needed for evaluation
        with torch.no_grad():
            # Forward pass with labels, so the model also returns the loss.
            # The output is indexed rather than tuple-unpacked so this keeps
            # working if the model returns a dict-like output object instead
            # of a plain tuple.
            outputs = model(sent_id,
                            token_type_ids=None,
                            attention_mask=mask,
                            labels=labels)
            loss, preds = outputs[0], outputs[1]

            total_loss = total_loss + loss.item()

            total_preds.append(preds.detach().cpu().numpy())

    # mean batch loss over the validation set
    avg_loss = total_loss / len(val_dataloader)

    # join the per-batch arrays into a single array along the first axis
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

# Model Training

In [18]:
"""# Start Model Training"""

# Lowest validation loss seen so far; starting at infinity means any finite
# loss from the first epoch counts as an improvement.
best_valid_loss = float('inf')

# Per-epoch loss history for the training and validation passes.
train_losses, valid_losses = [], []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # One pass over the training data, then one over the validation data.
    # Both calls also return prediction arrays, which are not needed here.
    train_loss = train()[0]
    valid_loss = evaluate()[0]

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # Record this epoch's losses.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 5

Training Process Start...

Batch round    40  of    141.

Batch round    80  of    141.

Batch round   120  of    141.

Evaluating Process Start...

Training Loss: 0.564
Validation Loss: 0.513

 Epoch 2 / 5

Training Process Start...

Batch round    40  of    141.

Batch round    80  of    141.

Batch round   120  of    141.

Evaluating Process Start...

Training Loss: 0.434
Validation Loss: 0.474

 Epoch 3 / 5

Training Process Start...

Batch round    40  of    141.

Batch round    80  of    141.

Batch round   120  of    141.

Evaluating Process Start...

Training Loss: 0.279
Validation Loss: 0.636

 Epoch 4 / 5

Training Process Start...

Batch round    40  of    141.

Batch round    80  of    141.

Batch round   120  of    141.

Evaluating Process Start...

Training Loss: 0.143
Validation Loss: 0.642

 Epoch 5 / 5

Training Process Start...

Batch round    40  of    141.

Batch round    80  of    141.

Batch round   120  of    141.

Evaluating Process Start...

Trai

# Prediction

In [19]:
"""# Load Saved Model"""

# Restore the checkpoint written during training.  map_location lets weights
# saved from a GPU session load onto whatever device this session uses.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

# Switch to evaluation mode explicitly (deactivates dropout) instead of relying
# on whichever cell last changed the model's mode.
model.eval()

# Get Predictions for Test Data

# Run the test set through the model in chunks of `batch_size` rather than one
# single forward pass, so memory use does not grow with the size of the test set.
print("Predication Start...")
batch_outputs = []
with torch.no_grad():
    for start in range(0, len(test_seq), batch_size):
        stop = start + batch_size
        outputs = model(test_seq[start:stop].to(device),
                        attention_mask=test_mask[start:stop].to(device))
        # No labels are passed, so the first output holds the per-class scores.
        batch_outputs.append(outputs[0].detach().cpu().numpy())
preds = np.concatenate(batch_outputs, axis=0)
print(preds)

# Pick the highest-scoring class per example and write one "<id>,<label>" line each.
preds = np.argmax(preds, axis=1)
with open('answer.txt', 'w') as f:
    for example_id, pred in zip(test_ids, preds):
        label = 'SARCASM' if pred == 1 else 'NOT_SARCASM'
        f.write(example_id + ',' + label + '\n')

Predication Start...
[[-1.1962693   1.0824906 ]
 [-0.8169593   0.36274964]
 [-1.1163764   1.0659422 ]
 ...
 [-0.7134321   0.34902835]
 [ 2.5065098  -2.1981132 ]
 [ 1.736444   -1.7474476 ]]
