In [None]:
# install required packages
!pip install transformers torchtext

In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

SEED = 77

# make runs repeatable: feed the same seed to every RNG this notebook touches
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# turn off cuDNN benchmark mode and request deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# show which CUDA device (index 0) is visible, if there is one
gpu_available = torch.cuda.is_available()
if gpu_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(gpu_name)

In [None]:
# If using colab
# from google.colab import drive
# drive.mount('/content/drive/')

### Specify Model Name and Path

In [None]:
# Checkpoint name used throughout the notebook: it selects the model class,
# is passed to `from_pretrained` for both model and tokenizer, and is part of
# the output directory and file names.
# Keep exactly one of the assignments below active.
pretrained_model = 'bert-base-uncased'
# pretrained_model = 'xlnet-base-cased'
# pretrained_model = 'roberta-base'
# pretrained_model = 'albert-base-v2'

In [None]:
# if using colab
# data_dir = '/content/drive/My Drive/EPFL/Machine Learning/ML_course/projects/project2/project_text_classification/Datasets/twitter-datasets'

# maybe you'll need to change this
data_dir = 'Datasets/twitter-datasets'

# checkpoints are written under seq/<pretrained_model>/model
model_dir = os.path.join('seq', pretrained_model, 'model')
# exist_ok=True creates the directory tree only when it is missing,
# without a separate (race-prone) existence check
os.makedirs(model_dir, exist_ok=True)

# input files, all located inside data_dir
train_pos_dir = os.path.join(data_dir, 'train_pos_full.txt')
train_neg_dir = os.path.join(data_dir, 'train_neg_full.txt')
test_data_dir = os.path.join(data_dir, 'test_data.txt')
sample_submission_dir = os.path.join(data_dir, 'sample_submission.csv')

### Get the Model (**CHANGE THE IMPORTED MODEL HERE**)

In [None]:
# get the model we want
# every branch binds the matching class to the same alias, so the rest of the
# notebook does not depend on which architecture was chosen
if pretrained_model == 'bert-base-uncased':
    from transformers import BertForSequenceClassification as SequenceClassificationModel
elif pretrained_model == 'xlnet-base-cased':
    from transformers import XLNetForSequenceClassification as SequenceClassificationModel
elif pretrained_model == 'roberta-base':
    from transformers import RobertaForSequenceClassification as SequenceClassificationModel
elif pretrained_model == 'albert-base-v2':
    from transformers import AlbertForSequenceClassification as SequenceClassificationModel
else:
    # fail here with a clear message instead of a NameError (or a stale class
    # left over from an earlier run) on the line below
    raise ValueError(f'Unsupported pretrained_model: {pretrained_model!r}')

model = SequenceClassificationModel.from_pretrained(pretrained_model)

In [None]:
def count_parameters(model):
    """Return the number of scalar weights in `model` that are trainable.

    Only parameters whose `requires_grad` flag is set contribute to the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# report the trainable-parameter count, formatted with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

### Transform Data

In [None]:
from transformers import AutoTokenizer

# load tokenizer
# loaded from the same checkpoint name as the model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

In [None]:
# check vocabulary size
# (informational only; the value is not used elsewhere in this notebook)
print(tokenizer.vocab_size)

In [None]:
# these are all the special tokens
# ids (not strings) of the tokenizer's special tokens; they are handed to the
# torchtext Field below, which works on ids because use_vocab=False
# NOTE(review): the Field puts init_token first and eos_token last; checkpoints
# that expect a different special-token layout (XLNet, presumably) may need
# different handling — confirm.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

In [None]:
# check max length of the model input
# The per-checkpoint size table may be absent, may not list the checkpoint, or
# may map it to None (no fixed limit). A hard lookup would then raise here or
# break the `max_input_length - 2` slice in tokenize_and_cut, so fall back to
# the tokenizer's own `model_max_length`.
_known_input_sizes = getattr(tokenizer, 'max_model_input_sizes', None) or {}
max_input_length = _known_input_sizes.get(pretrained_model) or tokenizer.model_max_length

print(max_input_length)

In [None]:
def tokenize_and_cut(sentence):
    """Split `sentence` into tokenizer tokens and truncate them to fit the model.

    Two positions of the `max_input_length` budget are held back for the
    cls and sep tokens.
    """
    pieces = tokenizer.tokenize(sentence)
    room_for_text = max_input_length - 2
    return pieces[:room_for_text]

In [None]:
from torchtext import data

# Field handles the conversion to Tensor (tokenizing)
# use_vocab=False: no torchtext vocabulary is built; tokens are mapped to ids
# by the Hugging Face tokenizer (`preprocessing`), so the special tokens are
# given as ids as well
TEXT = data.Field(batch_first=True,
                  use_vocab=False,
                  tokenize=tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  init_token=init_token_idx,
                  eos_token=eos_token_idx,
                  pad_token=pad_token_idx,
                  unk_token=unk_token_idx)

# labels are already integers (0 / 1), so no vocabulary is needed here either
LABEL = data.LabelField(dtype=torch.long, use_vocab=False)

In [None]:
# read data
# the encoding is passed explicitly so decoding does not depend on the
# platform's default locale
with open(train_pos_dir, encoding='utf-8') as f:
    pos_lines = [line.rstrip('\n') for line in f]
with open(train_neg_dir, encoding='utf-8') as f:
    neg_lines = [line.rstrip('\n') for line in f]
with open(test_data_dir, encoding='utf-8') as f:
    # drop everything up to and including the first comma (presumably a leading
    # id column — confirm against the file); a line without a comma is kept whole
    test_lines = [line.rstrip('\n').split(',', 1)[-1] for line in f]

# load data into dataframe
pos_df = pd.DataFrame(pos_lines, columns=['text'])
pos_df['label'] = 1
neg_df = pd.DataFrame(neg_lines, columns=['text'])
neg_df['label'] = 0
test_df = pd.DataFrame(test_lines, columns=['text'])
# because the model input required some label
# we won't use this though
test_df['label'] = 1

df = pd.concat([pos_df, neg_df], ignore_index=True)
# shuffle with an explicit seed so the train/validation split is the same even
# when cells are re-run or executed out of order
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [None]:
class DataFrameDataset(data.Dataset):
    """torchtext Dataset built from a DataFrame with `text` and `label` columns."""

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        """Turn every row of `df` into a torchtext Example.

        df: DataFrame providing a `text` and a `label` column.
        text_field / label_field: torchtext fields applied to the two columns.
        is_test: accepted for interface compatibility; not used.
        **kwargs: forwarded to the parent Dataset constructor.
        """
        fields = [('text', text_field), ('label', label_field)]
        # walk the two columns directly: iterrows() builds a Series object per
        # row, which is very slow on large frames
        rows = zip(df['text'], df['label'])
        examples = [
            data.Example.fromlist([text, label], fields)
            for text, label in tqdm(rows, total=df.shape[0])
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort examples by the length of their `text`."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, val_df=None, test_df=None, **kwargs):
        """Build one dataset per DataFrame that is not None.

        Returns a tuple in (train, validation, test) order that contains only
        the datasets whose DataFrame was given.
        """
        # the constructor only reads from the frames, so no defensive copies are made
        train_data, val_data, test_data = (None, None, None)
        if train_df is not None:
            train_data = cls(train_df, text_field, label_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df, text_field, label_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df, text_field, label_field, is_test=True, **kwargs)
        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [None]:
# total number of labelled rows (before the validation part is held out)
train_size = df.shape[0]
# fraction of the labelled rows held out for validation
val_per = 0.05
val_size = int(val_per * train_size)
# split on an explicit non-negative index: with `df[:-val_size]` / `df[-val_size:]`
# a val_size of 0 would leave the training set empty and put every row into validation
split_at = train_size - val_size
# transform DataFrame into torchtext Dataset
train_data, valid_data, test_data = DataFrameDataset.splits(
    text_field=TEXT, label_field=LABEL,
    train_df=df[:split_at], val_df=df[split_at:], test_df=test_df)

# use the following two lines for small scale testing
# train_data, valid_data, test_data = DataFrameDataset.splits(
# text_field=TEXT, label_field=LABEL, train_df=df[:100], val_df=df[100:200], test_df=test_df[:100])

# use the following two lines only for final testing
# train_data, valid_data, test_data = DataFrameDataset.splits(
# text_field=TEXT, label_field=LABEL, train_df=df[:1], val_df=df[1:2], test_df=test_df)

In [None]:
# sanity check: size of each split
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
# this line reports the test set (it was previously labelled "validation")
print(f"Number of testing examples: {len(test_data)}")

### Training Preparation

In [None]:
# number of examples per training / validation batch
BATCH_SIZE = 32

# get gpu if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# get torchtext Iterator
# no sort_key is passed here; presumably BucketIterator picks up
# DataFrameDataset.sort_key to batch examples of similar length — confirm
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    device=device)

In [None]:
import torch.optim as optim
from transformers import AdamW

# move the model to the target device first and only then build the optimizer,
# so the optimizer is constructed over the parameters that are actually trained
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
def binary_accuracy(preds, y):
    """Return the accuracy of one batch.

    preds: 2-D tensor of per-class scores, one row per example.
    y: 1-D tensor with the integer class label of each row.
    Returns a 0-dim float tensor: the fraction of rows whose highest-scoring
    class equals the label.
    """
    # softmax keeps the ordering of the scores, so the arg-max of the raw
    # scores already is the predicted class
    final_preds = preds.argmax(dim=1)
    # convert into float for division
    correct = (final_preds == y).float()
    return correct.sum() / len(correct)

In [None]:
def train(model, iterator, optimizer):
    """Run one training epoch.

    model: sequence-classification model called as `model(ids, labels=...)`;
        its first two outputs are taken as (loss, logits).
    iterator: yields batches exposing `.text` (token ids) and `.label`.
    optimizer: optimizer stepped once per batch.
    Returns (mean loss, mean accuracy), both averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    # id used for padding, read from the model config when it has one
    # (assumes the iterator pads with this same id — TODO confirm for new checkpoints)
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    pbar = tqdm(iterator)

    for batch_ctn, batch in enumerate(pbar, start=1):
        optimizer.zero_grad()
        # mask out padding so the model does not attend to pad positions;
        # None keeps the model's default (attend to every position)
        attention_mask = None if pad_id is None else (batch.text != pad_id).long()
        loss, logits = model(batch.text, attention_mask=attention_mask, labels=batch.label)[:2]
        acc = binary_accuracy(logits, batch.label)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

        # running averages over the batches seen so far
        pbar.set_description(f'loss: {epoch_loss / batch_ctn:.3f} | accu: {epoch_acc / batch_ctn * 100:.2f}%')

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator):
    """Evaluate the model on `iterator` without updating it.

    model: sequence-classification model called as `model(ids, labels=...)`;
        its first two outputs are taken as (loss, logits).
    iterator: yields batches exposing `.text` (token ids) and `.label`.
    Returns (mean loss, mean accuracy), both averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    # id used for padding, read from the model config when it has one
    # (assumes the iterator pads with this same id — TODO confirm for new checkpoints)
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    pbar = tqdm(iterator)

    # gradients are not needed for evaluation
    with torch.no_grad():
        for batch_ctn, batch in enumerate(pbar, start=1):
            # mask out padding so the model does not attend to pad positions;
            # None keeps the model's default (attend to every position)
            attention_mask = None if pad_id is None else (batch.text != pad_id).long()
            loss, logits = model(batch.text, attention_mask=attention_mask, labels=batch.label)[:2]
            acc = binary_accuracy(logits, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            # running averages over the batches seen so far
            pbar.set_description(f'loss: {epoch_loss / batch_ctn:.3f} | accu: {epoch_acc / batch_ctn * 100:.2f}%')

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import time

def epoch_time(start_time, end_time):
    """Express the span between two timestamps (in seconds) as minutes and seconds.

    Returns a `(minutes, seconds)` tuple of ints; fractions of a second are dropped.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# load model to resume training (0: don't resume)
# resumed_epoch is the number in the checkpoint file name to load
# (files are written as ...-e<NN>-state.pt by the training loop)
resumed_epoch = 0
# the path is built with os.path.join, matching how the checkpoints are saved
# state = torch.load(os.path.join(model_dir, f'{pretrained_model}-e{resumed_epoch:02}-state.pt'), map_location=device)
# model.load_state_dict(state['state_dict'])
# optimizer.load_state_dict(state['optimizer'])

### Start Training

In [None]:
# number of epochs to run in this session
N_EPOCHS = 5

# per-epoch validation metrics of this session, in training order
valid_losses = []
valid_accues = []

for session_step in range(N_EPOCHS):
    # 0-based index of the epoch being trained. A run resumed from checkpoint
    # e<resumed_epoch> continues at index resumed_epoch, so its first result is
    # saved as e<resumed_epoch + 1> and does not overwrite the loaded checkpoint.
    epoch = resumed_epoch + session_step

    start_time = time.time()

    train_loss, train_accu = train(model, train_iterator, optimizer)
    valid_loss, valid_accu = evaluate(model, valid_iterator)

    valid_losses.append(valid_loss)
    valid_accues.append(valid_accu)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # two files per epoch: the bare weights (loaded by the testing section) and
    # a full state with the optimizer (used to resume training)
    torch.save(model.state_dict(), os.path.join(model_dir, f'{pretrained_model}-e{epoch + 1:02}-model.pt'))
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, os.path.join(model_dir, f'{pretrained_model}-e{epoch + 1:02}-state.pt'))

    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Accu: {train_accu * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Accu: {valid_accu * 100:.2f}%')

print()
# the lists only cover this session, so shift by resumed_epoch to report the
# same epoch numbers that appear in the checkpoint file names
print(f'Best Val. loss epoch: {resumed_epoch + np.argmin(valid_losses) + 1:02} | Val. loss: {min(valid_losses):.3f}')
print(f'Best Val. accu epoch: {resumed_epoch + np.argmax(valid_accues) + 1:02} | Val. accu: {max(valid_accues) * 100:.2f}%')

### Testing

In [None]:
# pick an epoch
# NOTE: the training loop names its checkpoints with `epoch + 1`, so the lowest
# file it writes is e01; the value 0 below is a placeholder that matches no
# saved file and must be changed to a trained epoch before running this cell.
selected_epoch = 0
# map_location lets weights saved on one device be loaded onto the current one
model.load_state_dict(torch.load(os.path.join(model_dir, f'{pretrained_model}-e{selected_epoch:02}-model.pt'), map_location=device))
model = model.eval()

In [None]:
def test(model, iterator):
    """Predict a class index for every example in `iterator`.

    model: sequence-classification model; called without labels, its first
        output is taken as the logits.
    iterator: yields batches exposing `.text` (token ids).
    Returns a flat list of predicted class indices in iteration order.
    """
    predictions = []
    model.eval()

    # id used for padding, read from the model config when it has one
    # (assumes the iterator pads with this same id — TODO confirm for new checkpoints)
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    # gradients are not needed for inference
    with torch.no_grad():
        for batch in tqdm(iterator):
            # mask out padding so the model does not attend to pad positions;
            # None keeps the model's default (attend to every position)
            attention_mask = None if pad_id is None else (batch.text != pad_id).long()
            # no labels are passed: the loss is not needed here, so the dummy
            # test labels never reach the model
            logits = model(batch.text, attention_mask=attention_mask)[0]
            # the arg-max of the raw logits is the predicted class (softmax
            # would not change the ordering)
            predictions.extend(logits.argmax(dim=1).tolist())

    return predictions

In [None]:
# get testing data iterator
TEST_BATCH_SIZE = 32
# shuffle and sort are both disabled — presumably so batches follow the order
# of test_data, which matters because the predictions are later written into
# the submission frame by position (confirm against torchtext's Iterator docs)
test_iterator = data.Iterator(test_data, batch_size=TEST_BATCH_SIZE, device=device, shuffle=False, sort=False, train=False)

In [None]:
# get predictions of test data
# `predictions` is a flat list with one class index per test example
predictions = test(model, test_iterator)

In [None]:
# translate the model's class indices (0 / 1) back to the -1 / 1 labels
label_map = {0: -1, 1: 1}
corrected_predictions = [label_map[class_idx] for class_idx in predictions]

In [None]:
# load data into dataframe
# the sample submission serves as the template for the output file
submission = pd.read_csv(sample_submission_dir)
# assumes the template already has a `Prediction` column with one row per test
# example, in the same order as the predictions — TODO confirm
submission.Prediction = corrected_predictions
# written next to the model directory as seq/<pretrained_model>/<pretrained_model>-e<NN>.csv
submission.to_csv(os.path.join('seq', pretrained_model, f'{pretrained_model}-e{selected_epoch:02}.csv'), index=False)