Configuring PyTorch to use GPU

In [None]:
# Mount Google Drive into the Colab filesystem at ROOT; later cells use paths
# under this mount point.
from google.colab import drive
ROOT = '/content/drive'  # mount point handed to drive.mount below
drive.mount(ROOT)

In [None]:
import torch

# Pick the compute device once: the CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU. `device` is used later to place the model.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    # Report the name of the first GPU (index 0).
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
from os.path import join  

# path to your project on Google Drive
MY_GOOGLE_DRIVE_PATH = 'MyDrive/Github/' 
# replace with your Github username 
GIT_USERNAME = "rohandas14" 
# definitely replace with your
GIT_TOKEN = "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"  
# Replace with your github repository in this case we want 
GIT_REPOSITORY = "SemEval-2022" 

PROJECT_PATH = join(ROOT, MY_GOOGLE_DRIVE_PATH)
print("PROJECT_PATH: ", PROJECT_PATH)   
!mkdir "{PROJECT_PATH}"    
GIT_PATH = "https://" + GIT_TOKEN + "@github.com/" + GIT_USERNAME + "/" + GIT_REPOSITORY + ".git"
print("GIT_PATH: ", GIT_PATH)

In [None]:
# Change into the project folder on Drive and clone the repository there.
# NOTE(review): GIT_PATH embeds the access token, and git normally records the
# clone URL as the remote in .git/config - confirm this is acceptable for a
# folder stored on Drive.
%cd "{PROJECT_PATH}"
!git clone "{GIT_PATH}"

In [None]:
# Install transformers pinned to version 2.8.0.
# NOTE(review): presumably the version the rest of this notebook was written against - confirm before upgrading.
!pip install transformers=='2.8.0'

In [None]:
# List the current working directory (presumably to check that the clone is present).
%ls

In [None]:
import sys
# Put the repository's pcl/ folder first on the module search path so that
# modules stored there can be imported, then make it the working directory.
sys.path.insert(0, '/content/drive/MyDrive/Github/SemEval-2022/pcl')
%cd /content/drive/MyDrive/Github/SemEval-2022/pcl

In [None]:
from dont_patronize_me import DontPatronizeMe

# Initialize a dpm (Don't Patronize Me) object.
# It takes two arguments as input:
# (1) Path to the directory containing the training set files (here the
#     repository's dataset/ folder on Drive).
# (2) Path to the test set, which will be released when the evaluation phase begins. In this example,
# we use the dataset for Subtask 1, which the code will load without labels.
dpm = DontPatronizeMe('/content/drive/MyDrive/Github/SemEval-2022/dataset/', '.dontpatronizeme_pcl.tsv')

## Load Subtask 1 Data

In [None]:
# This method loads the Subtask 1 data into the dpm object,
dpm.load_task1()
# which we can then access as a dataframe; head() previews its first rows.
dpm.train_task1_df.head()

In [None]:
import pandas as pd
import re
import os
import math
import torch
from torch.nn import BCEWithLogitsLoss, NLLLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, TFXLNetModel, XLNetLMHeadModel, XLNetConfig, XLNetForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm, trange

# Text column (model inputs) and label column (targets) of the Subtask 1
# training dataframe.
X = dpm.train_task1_df['text']
y = dpm.train_task1_df['label']


To feed our text to XLNet, it must be split into tokens, and then these tokens must be mapped to their index in the tokenizer vocabulary.

In [None]:
# "xlnet-base-cased" has a case-sensitive vocabulary, so the input text must
# keep its original casing; lower-casing it would feed the model text that no
# longer matches what it saw during pre-training.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=False)

## Tokenization

XLNet requires specifically formatted inputs. For each tokenized input sentence, we need to create:

    input ids: a sequence of integers identifying each input token to its index number in the XLNet tokenizer vocabulary
    segment mask: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence
    attention mask: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens
    labels: a single value of 1 or 0. In our task 1 means “PCL” and 0 means “No PCL”

Although we can have variable-length input sentences, XLNet requires our input arrays to be the same size. We address this by first choosing a maximum sentence length, and then padding and truncating our inputs until every input sequence is of the same length.

To “pad” our inputs in this context means that if a sentence is shorter than the maximum sentence length, we simply add 0s to the end of the sequence until it is the maximum sentence length.

If a sentence is longer than the maximum sentence length, then we simply truncate the end of the sequence, discarding anything that does not fit into our maximum sentence length.

We pad and truncate our sequences so that they all become of length maxlen (“post” indicates that we want to pad and truncate at the end of the sequence, as opposed to the beginning). pad_sequences is a utility function that we’re borrowing from Keras. It simply handles the truncating and padding of Python lists.


In [None]:
def tokenize_inputs(text_list, tokenizer, num_embeddings=120):
    """
    Convert raw texts into a fixed-width array of token ids.

    Each text is tokenized and cut to ``num_embeddings - 2`` tokens, which
    leaves room for the special tokens added afterwards by
    ``tokenizer.build_inputs_with_special_tokens`` (assumed to add two ids
    per sequence - confirm for the tokenizer in use). The id sequences are
    then padded with 0 / truncated at the end to exactly ``num_embeddings``
    entries.

    Args:
        text_list: iterable of input strings.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``build_inputs_with_special_tokens``.
        num_embeddings: length of every returned id sequence.

    Returns:
        The array produced by ``pad_sequences`` (dtype "long"), one row per
        input text.
    """
    # Reserve two positions for the special tokens.
    max_tokens = num_embeddings - 2

    encoded = []
    for text in text_list:
        tokens = tokenizer.tokenize(text)[:max_tokens]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        encoded.append(tokenizer.build_inputs_with_special_tokens(token_ids))

    # "post": pad and truncate at the end of each sequence.
    return pad_sequences(encoded, maxlen=num_embeddings, dtype="long", truncating="post", padding="post")

def create_attn_masks(input_ids):
    """
    Build one attention mask per sequence of token ids.

    A position gets 1.0 when its id is greater than 0 and 0.0 otherwise, so
    positions filled with the padding value 0 are excluded from attention.

    Args:
        input_ids: iterable of id sequences.

    Returns:
        A list of lists of floats, parallel to ``input_ids``.
    """
    return [
        [float(token_id > 0) for token_id in sequence]
        for sequence in input_ids
    ]

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.

input_ids = tokenize_inputs(X, tokenizer, num_embeddings=120)
attention_masks = create_attn_masks(input_ids)

# Convert the id array, the mask lists and the labels into tensors.
input_ids = torch.from_numpy(input_ids)
attention_masks = torch.tensor(attention_masks)
y = torch.tensor(y)

# Print the sentence at index 1: its original text, then its token IDs and
# attention mask.
print('Original: ', X[1])
print('Token IDs:', input_ids[1])
print('Attention Masks:', attention_masks[1])

## Generating Data Splits

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle ids, masks and labels so that each sample is an (ids, mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, y)

# 75-15-10 train-validation-test split:
# 75% goes to training, 60% of the remaining quarter (15% of the total) to
# validation, and whatever is left (about 10%) to the test set.
n_samples = len(dataset)
train_size = int(0.75 * n_samples)
val_size = round(0.6 * (n_samples - train_size))
test_size = n_samples - train_size - val_size

# Show the total and the three split sizes, one per line.
print(n_samples, train_size, val_size, test_size, sep="\n")

# Assign samples to the three subsets at random.
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])


We'll also create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it
# here. Batch size of 16 or 32.
batch_size = 16

# Training samples are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,  # The training samples.
    sampler=RandomSampler(train_dataset),  # Select batches randomly.
    batch_size=batch_size,  # Trains with this batch size.
)

# For validation the order doesn't matter, so we just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,  # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size,  # Evaluate with this batch size.
)

# The test loader must wrap test_dataset itself. Wrapping val_dataset with a
# sampler sized for test_dataset (as this cell did before) silently evaluates
# on validation samples instead of the held-out test samples.
test_dataloader = DataLoader(
    test_dataset,  # The held-out test samples.
    sampler=SequentialSampler(test_dataset),  # Pull out batches sequentially.
    batch_size=batch_size,  # Evaluate with this batch size.
)

# Load pretrained XLNet model

In [None]:
from transformers import XLNetForSequenceClassification
# Load the pretrained 'xlnet-base-cased' checkpoint as a sequence classifier
# configured for 2 labels.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 2)
# Move the model to the device selected earlier (GPU if available, else CPU).
model = model.to(device)

# Setting up Hyperparameters


In [None]:
# The scheduler factory is not among the names imported earlier in this
# notebook, so import it here to keep the cell runnable.
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3
# NOTE(review): the DataLoaders in this notebook are built with
# batch_size = 16, not with this constant - confirm which value is intended.
BATCH_SIZE = 32

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
# XLNet names its layer-norm modules "layer_norm" (not "LayerNorm" as in
# BERT), so that spelling is listed too; otherwise the layer-norm weights
# would be decayed along with everything else.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    # Every other parameter: regularised with weight decay.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # Biases and layer-norm weights: no weight decay.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

# Number of batches per epoch times the number of epochs; assumes the
# training loop calls scheduler.step() once per batch.
total_steps = len(train_dataloader) * EPOCHS

# Linear decay of the learning rate to 0 over total_steps, with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)