In [2]:
import sys
# Check if we are in Google Colab: detected by the `google.colab` module
# already being present in `sys.modules`.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  from google.colab import drive
  # Mounting the Gdrive (force_remount=True re-mounts even if it is already mounted)
  drive.mount('/content/drive', force_remount=True)
  # NOTE: GDRIVE_ROOT_DIR is only defined on Colab; any later cell that uses it
  # must branch on IN_COLAB first.
  GDRIVE_ROOT_DIR = '/content/drive/MyDrive'
  # NOTE(review): the packages below are installed without version pins, so a
  # re-run may pick up different releases.
  !pip install transformers
  !pip install sentencepiece

Mounted at /content/drive


In [17]:
import os
import time
import random
import argparse
from tqdm.notebook import tqdm

import torch
import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split
# from keras.preprocessing.sequence import pad_sequences

from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification

In [4]:
# Run configuration: data/output locations (relative to the Drive root on
# Colab) and the fine-tuning hyper-parameters.
args_dict = dict(
    data_path='KESKIA/data/limjiayi_hateful_memes_expanded_test.tsv',
    out_dir='KESKIA/models/',
    log_dir='KESKIA/runs/',
    model_name='camembert_mad_test',
    train_ratio=0.8,
    batch_size=32,
    num_labels=2,
    max_len=64,
    epochs=4,
    lr=2e-5,
    eps=1e-8,
    device_id=0,
)

# Expose the same settings with attribute access (args.lr, args.epochs, ...).
args = argparse.Namespace()
vars(args).update(args_dict)

In [5]:
# Pick the compute device: the CUDA device numbered `args.device_id` when CUDA
# is available, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(args.device_id))
    # An integer index is interpreted by torch.device as a CUDA ordinal.
    device = torch.device(args.device_id)
print(f'Device ID -> {device}')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
Device ID -> cuda:0


In [6]:
# Seed every random number generator used by the notebook (PyTorch CPU and
# CUDA, NumPy, Python's `random`) with the same value for repeatable runs.
SEED_VAL = 999
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)
np.random.seed(SEED_VAL)
random.seed(SEED_VAL)

In [7]:
# Load the dataset.
# On Colab the configured path is relative to the mounted Drive root;
# elsewhere it is used as given.
if IN_COLAB:
  data_path = os.path.join(GDRIVE_ROOT_DIR, args.data_path)
else:
  data_path = args.data_path

# Tab-separated file; malformed rows are skipped rather than aborting the load.
data = pd.read_csv(
  data_path,
  sep='\t',
  on_bad_lines='skip'
)
print(f'Size of dataset: {len(data)}')

text_list = data['text'].to_numpy()
label_list = data['label'].to_numpy()

# Character length of the longest text. Each value is cast to `str` (as the
# tokenization cell does) so a non-string cell such as NaN cannot raise a
# TypeError, and `default=0` keeps an empty dataset from crashing `max`.
max_text_len = max((len(str(t)) for t in text_list), default=0)
print(f'Maximum length of text: {max_text_len}')

Size of dataset: 3000
Maximum length of text: 216


In [24]:
# Initialize the TensorBoard writer.
# GDRIVE_ROOT_DIR only exists on Colab, so the log directory is resolved the
# same way as the data and model paths: under the Drive root on Colab, and
# relative to the working directory otherwise.
if IN_COLAB:
  tensorboard_dir = os.path.join(GDRIVE_ROOT_DIR, args.log_dir, args.model_name)
else:
  tensorboard_dir = os.path.join(args.log_dir, args.model_name)
os.makedirs(tensorboard_dir, exist_ok=True)
writer = SummaryWriter(log_dir=tensorboard_dir)
print(f'Writer directory: {tensorboard_dir}')

Writer directory: /content/drive/MyDrive/KESKIA/runs/camembert_mad_test


In [9]:
# Load the pretrained tokenizer and a sequence-classification model from the
# same CamemBERT-large checkpoint (the printed module tree shows 24 encoder
# layers). The classification head is newly initialized and must be trained.
MODEL_NAME = 'camembert/camembert-large'
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
model = CamembertForSequenceClassification.from_pretrained(
    MODEL_NAME,                  # Checkpoint to load.
    output_hidden_states=False,  # Do not return all hidden states.
    output_attentions=False,     # Do not return attention weights.
    num_labels=args.num_labels,  # Size of the classification head.
)
# Move the weights to the selected device; as the cell's last expression this
# also displays the module tree.
model.to(device)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-23): 24 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=1024, out_features=10

In [10]:
# In previous versions we made the padding and attention_masks manually

# Tokenize all of the texts and map the tokens to their word IDs.
# input_ids = []
# for text in text_list:
#     encoded_text = tokenizer.encode(
#         text,                      # Sentence to encode.
#         add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#     )
#     # Add the encoded text to the list.
#     input_ids.append(encoded_text)

# # Print text 0, now as a list of IDs.
# print('Original: ', data['text'][0])
# print('Token IDs:', input_ids[0])
# print('Max token sequence length: ', max([len(encoded_text) for encoded_text in input_ids]))

# print(f'Padding token: "{tokenizer.pad_token}", ID: {tokenizer.pad_token_id}')
# input_ids = pad_sequences(
#     input_ids,
#     maxlen=args.max_len,
#     dtype="long",
#     value=tokenizer.pad_token_id,
#     truncating="post",
#     padding="post"
# )

# # Create attention masks
# # Note: tokenizer.pad_token_id = 1
# attention_masks = []
# for encoded_text in input_ids:
#     # Create the attention mask.
#     #   - If a token ID is 1, then it's padding, set the mask to False => 0.
#     #   - If a token ID is != 1, then it's a real token, set the mask to True => 1.
#     att_mask = np.asarray([int(token_id != tokenizer.pad_token_id) for token_id in encoded_text])
#     # Store the attention mask for this sentence.
#     attention_masks.append(att_mask)
# attention_masks = np.asarray(attention_masks)

# # Convert all inputs and labels into torch tensors
# input_ids = torch.from_numpy(input_ids)
# attention_masks = torch.from_numpy(attention_masks)
# labels = torch.from_numpy(label_list)

Original:  branlettes vendues séparément
Token IDs: [5, 30266, 2571, 18112, 20013, 6]
Max token sequence length:  49
Padding token: "<pad>", ID: 1


In [11]:
# Encode every text into fixed-length token-id and attention-mask tensors.
input_ids, attention_masks = [], []
for text in text_list:
    # For one text, `encode_plus` tokenizes it, adds the model's special
    # tokens, pads or truncates to `args.max_len`, and builds the matching
    # attention mask. `str(text)` guards against non-string cells.
    encoded_dict = tokenizer.encode_plus(
        str(text),
        return_tensors='pt',         # PyTorch tensors.
        return_attention_mask=True,  # Mask separating padding from real tokens.
        truncation=True,             # Cut sequences longer than max_length.
        padding='max_length',        # Pad shorter sequences up to max_length.
        max_length=args.max_len,
        add_special_tokens=True,
    )

    # Collect the ids and the mask for this text; they are concatenated into
    # single tensors in the next cell.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

In [12]:
# Turn the labels into a tensor and concatenate the per-text encodings along
# the first dimension (torch.cat's default) into one tensor each.
# NOTE: this rebinds `input_ids` / `attention_masks` from lists to tensors, so
# the tokenization cell must be re-run before re-running this one.
labels = torch.tensor(label_list)
attention_masks = torch.cat(attention_masks)
input_ids = torch.cat(input_ids)

# Show the first text next to its padded token-id row.
print('Original: ', text_list[0])
print('Token IDs:', input_ids[0])

Original:  branlettes vendues séparément
Token IDs: tensor([    5, 30266,  2571, 18112, 20013,     6,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])


In [11]:
# Bundle ids, masks and labels so that each sample is an (ids, mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, labels)

# `train_ratio` of the samples go to training; the remainder is for validation.
train_size = int(len(dataset) * args.train_ratio)
val_size = len(dataset) - train_size

# Random split; uses PyTorch's global RNG, which was seeded earlier.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'{train_size:>5,} training samples')
print(f'{val_size:>5,} validation samples')

  240 training samples
   60 validation samples


In [12]:
# Build the DataLoaders. Both use `args.batch_size`; the BERT authors
# recommend 16 or 32 for fine-tuning.

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation order is irrelevant, so the samples are simply read sequentially.
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=args.batch_size,
    sampler=SequentialSampler(val_dataset),
)

In [13]:
# Optimizer & Scheduler
# For the purposes of fine-tuning, the authors recommend choosing from the following values:
# - Batch size: 16, 32  (We chose 32 when creating our DataLoaders).
# - Learning rate (Adam): 5e-5, 3e-5, 2e-5  (We'll use 2e-5).
# - Number of epochs: 2, 3, 4  (We'll use 4).
# - The epsilon parameter `eps = 1e-8` is "a very small number to prevent any
#   division by zero in the implementation".

# `eps` must come from `args.eps`; passing the learning rate here would make
# Adam's denominator term thousands of times larger than intended.
optimizer = AdamW(model.parameters(), lr=args.lr, eps=args.eps)

# Create the learning rate scheduler: linear decay over every optimizer step
# of the whole run (batches per epoch * epochs), with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=len(train_dataloader) * args.epochs
)

In [14]:
def train(model, train_dataloader, optimizer, scheduler, device, desc) -> float:
  """Run one training epoch and return the mean per-batch loss.

  Args:
    model: Module called as ``model(ids, token_type_ids=None,
      attention_mask=..., labels=...)``; the first element of its output
      must be the loss.
    train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
      tensor batches.
    optimizer: Optimizer stepped once per batch.
    scheduler: Learning-rate scheduler stepped once per batch.
    device: Device every batch tensor is moved to.
    desc: Label shown on the progress bar.

  Returns:
    The sum of the batch losses divided by the number of batches.

  Raises:
    ValueError: If ``train_dataloader`` yields no batches.
  """
  model.train()
  # Tracking variables
  running_loss = 0.0
  num_batches = 0
  progress_bar = tqdm(train_dataloader, desc=desc)
  for batch in progress_bar:
    # Move the batch to the target device and unpack it.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Clear gradients left over from the previous step; PyTorch accumulates
    # them by default.
    model.zero_grad()

    # Forward pass. Because labels are supplied, the model returns the loss
    # as the first output element.
    outputs = model(
      b_input_ids,
      token_type_ids=None,
      attention_mask=b_input_mask,
      labels=b_labels
    )
    loss = outputs[0]
    # Accumulate the training loss over all of the batches.
    running_loss += loss.item()
    num_batches += 1
    # Backward pass to calculate the gradients.
    loss.backward()
    # Clip the gradient norm to 1.0 to guard against exploding gradients.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # Update the parameters, then the learning rate.
    optimizer.step()
    scheduler.step()

  if num_batches == 0:
    raise ValueError('train_dataloader yielded no batches')

  # Average over the number of batches actually processed. The enumerate index
  # of the last batch is one less than that count, so it must not be used as
  # the divisor.
  mean_train_loss = running_loss / num_batches
  return mean_train_loss

In [15]:
def eval(model, val_dataloader, device, desc) -> float:
  """Evaluate the model and return its accuracy over the whole dataloader.

  Note: this function shadows Python's built-in ``eval`` in this notebook.

  Args:
    model: Module called as ``model(ids, token_type_ids=None,
      attention_mask=...)``; the first element of its output must be the
      logits, with one row per sample.
    val_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
      tensor batches.
    device: Device every batch tensor is moved to.
    desc: Label shown on the progress bar.

  Returns:
    Correct predictions divided by the total number of samples, in [0, 1].

  Raises:
    ValueError: If ``val_dataloader`` yields no samples.
  """
  model.eval()
  # Tracking variables
  num_correct = 0
  num_samples = 0
  progress_bar = tqdm(val_dataloader, desc=desc)
  for batch in progress_bar:
    # Move the batch to the target device and unpack it.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for evaluation.
    with torch.no_grad():
      # No labels are passed, so the first output element is the logits.
      outputs = model(
        b_input_ids,
        token_type_ids=None,
        attention_mask=b_input_mask
      )
    logits = outputs[0]

    # Predicted class = index of the largest logit in each row.
    predictions = torch.argmax(logits, dim=1).reshape(-1)
    labels_flat = b_labels.reshape(-1)
    # Count samples rather than averaging per-batch accuracies, so a smaller
    # final batch is weighted correctly and the result cannot exceed 1.0.
    num_correct += (predictions == labels_flat).sum().item()
    num_samples += labels_flat.numel()

  if num_samples == 0:
    raise ValueError('val_dataloader yielded no samples')

  mean_val_accuracy = num_correct / num_samples
  return mean_val_accuracy

In [None]:
# Main loop: one training pass and one validation pass per epoch, timed
# together, reported on the console and logged to TensorBoard.
for epoch in range(args.epochs):
  tic = time.time()

  mean_train_loss = train(
      model=model,
      train_dataloader=train_dataloader,
      optimizer=optimizer,
      scheduler=scheduler,
      device=device,
      desc=f'[{str(device).upper()}] Train Epoch {epoch+1}/{args.epochs}',
  )
  mean_val_accuracy = eval(
      model=model,
      val_dataloader=val_dataloader,
      device=device,
      desc=f'[{str(device).upper()}] Validate Epoch {epoch+1}/{args.epochs}',
  )

  # Wall-clock duration of the epoch (training + validation) in minutes.
  tt = (time.time() - tic)/60

  # tqdm.write prints without breaking any active progress bars.
  tqdm.write(
      f'[Epoch {epoch+1}/{args.epochs}] '
      f'Train Loss: {mean_train_loss:.3E} '
      f'| Accuracy: {mean_val_accuracy:.3E}'
      f'| Time taken: {tt:.2f} mins'
  )

  # The zero-based epoch index is used as the TensorBoard step.
  writer.add_scalar('Loss/train', mean_train_loss, epoch)
  writer.add_scalar('Accuracy', mean_val_accuracy, epoch)

In [None]:
# Persist the fine-tuned model, its tokenizer and the run arguments.
# The target is <out_dir>/<model_name>, placed under the Drive root on Colab.
output_dir = os.path.join(args.out_dir, args.model_name)
if IN_COLAB:
  output_dir = os.path.join(GDRIVE_ROOT_DIR, output_dir)

# Make sure the destination exists.
os.makedirs(output_dir, exist_ok=True)

# If the model is wrapped (distributed/parallel training exposes the real
# model as `.module`), save the wrapped model; otherwise save it directly.
model_to_save = getattr(model, 'module', model)
# `save_pretrained()` writes the weights and configuration, and the tokenizer files.
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Keep the training arguments next to the trained model.
torch.save(args, os.path.join(output_dir, 'training_args.bin'))