In [None]:
import warnings
# Silence every Python warning for the rest of the session (the 'ignore' action applies to all categories).
warnings.filterwarnings('ignore')

In [None]:
import random, copy, spacy, numpy as np, pandas as pd, os, pickle, time, datetime
import torchtext, torch, torch.nn as nn, torch.optim, torch.nn.functional as F, torch.autograd as autograd, transformers


from sklearn.model_selection import train_test_split
from torchtext import data, datasets
from torchtext.vocab import FastText
from torch.optim import Adam, Adamax, AdamW, Adadelta, Adagrad
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler,SequentialSampler

from transformers import (
    BertTokenizer, 
    BertModel, 
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
)

from spacy.tokenizer import Tokenizer

# Utility Functions and Classes

### Preprocessing function

In [None]:
#Preprocessing function
def data_preprocess(x, y, ref):
    """
    Build the classification text of every title by appending the titles it references.

    :param x: DataFrame of texts; must contain the columns ``id`` and ``title``
    :param y: DataFrame with the ids to keep (train labels or test ids); must contain ``id``
    :param ref: DataFrame of references; ``id`` is the referencing title id and ``id.1`` the
        referenced title id
    :return: ``y`` inner-joined with ``x`` on ``id`` plus a new column ``AllCombined`` holding
        ``title + " " + <referenced titles joined by spaces>`` (just ``title + " "`` when the
        title has no usable reference)
    """
    df = y.merge(x, left_on="id", right_on="id", how="inner")   # Merge the label and text into one df

    # Resolve each referenced id to its title. Both frames own an ``id`` column, so pandas
    # renames them ``id_x`` (referencing id, from ``ref``) and ``id_y`` (from ``x``).
    ref_merge = ref.merge(x, left_on="id.1", right_on="id", how="left")

    # The left join leaves NaN titles for references that are absent from ``x``; they are
    # skipped here because joining a NaN with strings would raise a TypeError.
    known_refs = ref_merge.dropna(subset=["title"])

    # One pass over the references instead of re-filtering the whole frame for every id.
    # groupby keeps the original row order inside each group.
    refs_by_id = known_refs.groupby("id_x")["title"].agg(
        lambda titles: " ".join(map(str, titles))
    )

    ref_text = df["id"].map(refs_by_id).fillna("")
    df["AllCombined"] = df["title"] + " " + ref_text
    return df

In [None]:
def generate_n_grams(text, n=2):
    """
    Append the distinct n-grams of a token list to that list.

    :param text: List of tokens; it is extended in place and also returned
    :param n: Size of the n-grams to build (defaults to 2, i.e. bi-grams)
    :return: The same list, followed by every distinct n-gram joined with a single space,
        in order of first occurrence
    """
    # dict.fromkeys de-duplicates like a set but keeps insertion order, so the appended
    # n-grams come out in the same order on every run (set iteration order is not stable
    # for strings across interpreter sessions).
    n_grams = dict.fromkeys(zip(*[text[i:] for i in range(n)]))
    text.extend(' '.join(gram) for gram in n_grams)
    return text

In [None]:
# Load the spaCy "en_core_web_lg" pipeline and build a bare Tokenizer from its vocabulary.
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)

def spacy_tokenize(x):
    """
    Tokenize a string with the module-level spaCy ``tokenizer``
    :param x: Text to split
    :return: List with the text of every token, in order
    """
    pieces = []
    for tok in tokenizer(x):
        pieces.append(tok.text)
    return pieces

In [None]:
def format_time(elapsed):
    """
    Render a duration in seconds as text
    :param elapsed: Duration in seconds (int or float)
    :return: ``str()`` of the matching ``datetime.timedelta`` rounded to whole seconds,
        e.g. ``"1:01:01"``
    """
    whole_seconds = int(round(elapsed))  # timedelta would otherwise print microseconds
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

### CNN Model class definition

In [None]:
class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout):
        """
        Convolutional text classifier: one convolution per filter size over the embedded
        tokens, max-pooled over the sequence and fed to a linear layer
        :param vocab_size: Number of rows of the embedding table
        :param embedding_dim: Size of each embedding vector
        :param n_filters: Number of output channels of every convolution
        :param filter_sizes: Iterable of kernel heights (number of consecutive tokens covered)
        :param output_dim: Number of classes
        :param dropout: Dropout probability applied to the pooled features
        """
        super().__init__()
        self.n_filters = n_filters
        self.filter_sizes = filter_sizes
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        conv_layers = []
        for width in self.filter_sizes:
            # Padding of (width - 1) rows on both ends keeps the convolution valid even
            # when the sentence is shorter than the kernel.
            conv_layers.append(
                nn.Conv2d(in_channels=1,
                          out_channels=self.n_filters,
                          kernel_size=[width, embedding_dim],
                          padding=(width - 1, 0))
            )
        self.convs = nn.ModuleList(conv_layers)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(len(self.filter_sizes) * self.n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """
        Forward pass
        :param text: Token indices; dim 0 is treated as the batch by the convolutions,
            i.e. [batch_size, sent_len]
        :return: Logits, [batch_size, output_dim]
        """
        # [batch, sent_len] -> [batch, sent_len, emb_dim] -> [batch, 1, sent_len, emb_dim]
        tokens = self.embedding(text).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # The kernel spans the full embedding width, so the last dim collapses to 1:
            # [batch, n_filters, sent_len + filter_size - 1]
            feature_map = self.relu(conv(tokens)).squeeze(3)
            # Max over the whole sequence axis -> [batch, n_filters]
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        features = torch.cat(pooled, dim=1)  # [batch, n_filters * len(filter_sizes)]
        return self.fc(self.dropout(features))

### LSTM Model Class definition

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, num_layers, embedding_dim, hidden_dim, num_label, padding_idx, device,
                 dropout=0.1, bidirectional=True):
        """
        Multi-layer, optionally bidirectional LSTM text classifier
        :param vocab_size: Vocabulary size (int)
        :param num_layers: Number of stacked LSTM layers (int)
        :param embedding_dim: Embedding dimension (int)
        :param hidden_dim: Hidden dimension of each LSTM direction (int)
        :param num_label: Number of classes, i.e. size of the output layer (int)
        :param padding_idx: Index of the padding token in the vocabulary (int)
        :param device: Stored on the instance as ``self.device``; not used by the layers themselves
        :param dropout: Dropout probability (float); used after the embedding, on the final
            hidden state and, when ``num_layers >= 2``, between LSTM layers
        :param bidirectional: Whether the LSTM runs in both directions (bool)
        """
        super(LSTMClassifier, self).__init__()
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_label = num_label
        self.padding_idx = padding_idx
        self.bidirectional = bidirectional
        self.device = device

        # Layers initialisation
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=self.bidirectional,
            # nn.LSTM only applies dropout between layers, so it is disabled for a single layer
            dropout=0 if num_layers < 2 else dropout,
            batch_first=True)

        self.fc = nn.Linear(
            self.hidden_dim * 2 if self.bidirectional else self.hidden_dim,
            self.num_label
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """
        Forward pass
        :param text: Padded token indices, [batch_size, sent_len] (the LSTM is batch_first)
        :param text_lengths: True length of every sequence (tensor on any device, or a list);
            the batch does not need to be sorted by length
        :return: Logits, [batch_size, num_label]
        """
        res = self.embedding(text)  # Embedding layer
        res = self.dropout(res)  # dropout layer after the embedding

        # pack_padded_sequence requires the lengths as a CPU int64 tensor; callers in this
        # notebook pass them on the GPU, so they are moved here.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        # enforce_sorted=False lets PyTorch sort internally and return the hidden states in
        # the original batch order, so unsorted batches work as well.
        res = nn.utils.rnn.pack_padded_sequence(
            res, lengths, batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(res)  # We only need the last hidden state

        if self.bidirectional:
            # hn[-2] / hn[-1] are the forward / backward states of the last layer
            hidden = self.dropout(torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hn[-1, :, :])

        res = self.fc(hidden)  # The fully connected layer
        return res



### Training function

In [None]:
def train_cnn(model, iterator, optimizer, criterion, scheduler=None):
    """
    Run one training epoch for a model that is called with the text tensor only
    :param model: Module mapping ``batch.AllCombined`` to logits
    :param iterator: Sized iterable of batches exposing ``AllCombined`` and ``label`` tensors
    :param optimizer: Optimizer stepped once per batch
    :param criterion: Loss called as ``criterion(logits, labels)``
    :param scheduler: Optional learning-rate scheduler stepped once per batch
    :return: (mean loss per batch, accuracy over all seen examples)
    """
    # Follow the device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_loss, total_correct, total_prediction = 0.0, 0.0, 0.0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        labels = batch.label.to(device)
        logits = model(batch.AllCombined.to(device))
        predictions = torch.max(logits, dim=-1)[1]
        loss = criterion(logits, labels)

        loss.backward()
        # Cap the global gradient norm at 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()
        total_correct += torch.eq(predictions, labels).sum().item()
        total_prediction += labels.size(0)
    return total_loss / len(iterator), total_correct / total_prediction

In [None]:
def train_lstm(model, iterator, optimizer, criterion, scheduler=None):
    """
    Run one training epoch for a model that is called with (text, lengths)
    :param model: Module mapping ``(text, text_length)`` to logits
    :param iterator: Sized iterable of batches whose ``AllCombined`` is a ``(text, lengths)``
        pair and whose ``label`` is a tensor
    :param optimizer: Optimizer stepped once per batch
    :param criterion: Loss called as ``criterion(logits, labels)``
    :param scheduler: Optional learning-rate scheduler stepped once per batch
    :return: (mean loss per batch, accuracy over all seen examples)
    """
    # Follow the device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_loss, total_correct, total_prediction = 0.0, 0.0, 0.0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()

        text, text_length = batch.AllCombined
        labels = batch.label.to(device)

        logits = model(text.to(device), text_length.to(device))
        predictions = torch.max(logits, dim=-1)[1]
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()
        total_correct += torch.eq(predictions, labels).sum().item()
        total_prediction += labels.size(0)
    return total_loss / len(iterator), total_correct / total_prediction

### Evaluation function

In [None]:
def evaluate_cnn(model, iterator, criterion):
    """
    Evaluate a model that is called with the text tensor only (no gradient, eval mode)
    :param model: Module mapping ``batch.AllCombined`` to logits
    :param iterator: Sized iterable of batches exposing ``AllCombined`` and ``label`` tensors
    :param criterion: Loss called as ``criterion(logits, labels)``
    :return: (mean loss per batch, accuracy over all seen examples)
    """
    # Follow the device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_loss, total_correct, total_prediction = 0.0, 0.0, 0.0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            labels = batch.label.to(device)
            logits = model(batch.AllCombined.to(device))
            predictions = torch.max(logits, dim=-1)[1]
            loss = criterion(logits, labels)

            total_loss += loss.item()
            total_correct += torch.eq(predictions, labels).sum().item()
            total_prediction += labels.size(0)
    return total_loss / len(iterator), total_correct / total_prediction

In [None]:
def evaluate_lstm(model, iterator, criterion):
    """
    Evaluate a model that is called with (text, lengths) (no gradient, eval mode)
    :param model: Module mapping ``(text, text_length)`` to logits
    :param iterator: Sized iterable of batches whose ``AllCombined`` is a ``(text, lengths)``
        pair and whose ``label`` is a tensor
    :param criterion: Loss called as ``criterion(logits, labels)``
    :return: (mean loss per batch, accuracy over all seen examples)
    """
    # Follow the device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_loss, total_correct, total_prediction = 0.0, 0.0, 0.0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_length = batch.AllCombined
            labels = batch.label.to(device)

            logits = model(text.to(device), text_length.to(device))
            predictions = torch.max(logits, dim=-1)[1]
            loss = criterion(logits, labels)

            total_loss += loss.item()
            total_correct += torch.eq(predictions, labels).sum().item()
            total_prediction += labels.size(0)
    return total_loss / len(iterator), total_correct / total_prediction

# Model training

In [None]:
torch.cuda.empty_cache()
SEED = 42
# Seed every random number generator used by the notebook so that data splits,
# weight initialisation and dropout are repeatable from one run to the next.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [None]:
# Read the raw CSV files from the local data/ directory.
# `text` and `reference` feed data_preprocess; `label` and `test` supply the ids to keep.
reference = pd.read_csv("data/reference.csv")
sample = pd.read_csv("data/sample.csv")
test = pd.read_csv("data/test.csv")
text = pd.read_csv("data/text.csv")
label = pd.read_csv("data/train.csv")

In [None]:
# Build the training texts and persist them; train_df.csv is the file the torchtext datasets load.
# The frame is then read back from disk (presumably so the in-memory copy matches the file exactly).
train_df = data_preprocess(x=text, y=label, ref=reference)
train_df.to_csv("train_df.csv", index=None)
train_df = pd.read_csv("train_df.csv")

In [None]:
# Build the test texts the same way as the training ones and persist them.
# (No spaCy reload here: preprocessing is pure pandas and `nlp` is already loaded above.)
test_preprocessed = data_preprocess(x=text,y=test,ref=reference)
test_preprocessed.to_csv("test_prepared.csv",index=None)
test_preprocessed.head()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

# CNN Model

In [None]:
# Pre-trained FastText vectors (constructor defaults); reused to initialise both the CNN and the LSTM vocabularies
fast_text_vec = FastText()

In [None]:
# Text field: lower-cased, tokenized with spacy_tokenize, batches returned batch-first.
TEXTcnn = data.Field(tokenize = spacy_tokenize, batch_first = True,sequential=True,lower=True)
# Label field: no vocabulary is built, so labels are used as they appear in the file.
LABELcnn = data.LabelField(use_vocab=False,sequential=False)

# One entry per CSV column, in file order; columns mapped to None are ignored.
fields_cnn = [("id",None),("label",LABELcnn),("title",None),("AllCombined",TEXTcnn)]

In [None]:
# Load the training CSV as a TabularDataset; the header row is skipped because the
# column-to-field mapping comes from fields_cnn.
cnnDataset = torchtext.data.TabularDataset(
    path="train_df.csv",
    format="CSV",
    fields=fields_cnn,
    skip_header=True
)

In [None]:
# torchtext's `split_ratio` is the share kept for TRAINING (the rest goes to the second
# split), so 0.8 gives the intended 80/20 train/validation split.
# random.seed() returns None, so the generator is seeded first and its state passed explicitly.
random.seed(SEED)
X_train_cnn, X_val_cnn = cnnDataset.split(
    split_ratio=0.8,
    random_state=random.getstate()
)

In [None]:
# Build the vocabulary from the training split only and attach the FastText vectors;
# words without a pre-trained vector are initialised with torch.Tensor.normal_.
TEXTcnn.build_vocab(
    X_train_cnn,
    vectors = fast_text_vec,
    unk_init = torch.Tensor.normal_
)

In [None]:
# CNN hyper-parameters and run configuration.
INPUT_DIM = len(TEXTcnn.vocab)  # vocabulary size -> rows of the embedding table
EMBEDDING_DIM = 300  # must equal the width of the pre-trained vectors copied into the embedding below
N_FILTERS = 256  # output channels per convolution
FILTER_SIZES = [1,2,3,4,5]  # kernel heights, one convolution each
OUTPUT_DIM = 5  # number of classes
DROPOUT = 0.3
DEVICE = torch.device('cuda')  # NOTE(review): hard-coded GPU; this cell assumes CUDA is available
BATCH_SIZE = 5
EPOCHS = 25  # also reused by the LSTM section to size its scheduler
LR = 1e-4

In [None]:
# Instantiate the CNN with the hyper-parameters defined above.
ModelCNN = CNNClassifier(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

In [None]:
# Display the class name of the model (sanity check).
ModelCNN.__class__.__name__

In [None]:
# Bucket iterators for both splits: sort_key is the token count of AllCombined,
# examples are sorted inside each batch, and batches are placed on DEVICE.
X_train_iter_cnn, X_val_iter_cnn = data.BucketIterator.splits(
    (X_train_cnn, X_val_cnn),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.AllCombined),
    sort_within_batch=True,
    device = DEVICE
)

In [None]:
# Transfer the FastText pre-trained vectors into the model's embedding table
UNK_IDX_CNN = TEXTcnn.vocab.stoi[TEXTcnn.unk_token]
PAD_IDX_CNN = TEXTcnn.vocab.stoi[TEXTcnn.pad_token]

pretrained_embeddings = TEXTcnn.vocab.vectors
ModelCNN.embedding.weight.data.copy_(pretrained_embeddings)

# The unknown and padding tokens carry no pre-trained meaning: start them at zero
ModelCNN.embedding.weight.data[UNK_IDX_CNN] = torch.zeros(EMBEDDING_DIM)
ModelCNN.embedding.weight.data[PAD_IDX_CNN] = torch.zeros(EMBEDDING_DIM)

In [None]:
# Move the model to DEVICE and create the optimizer over all of its parameters.
ModelCNN.to(DEVICE)
optimizer = torch.optim.Adam(ModelCNN.parameters(), lr=LR)

# Scheduler for the optimizer: linear decay over one step per batch per epoch, no warm-up.
# NOTE(review): the CNN training cell does not hand this scheduler to its training call,
# so it only takes effect if it is passed explicitly.
total_steps = len(X_train_iter_cnn) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

# Cross-entropy over the class logits, moved to the GPU.
criterion = nn.CrossEntropyLoss()
criterion.cuda()

In [None]:
best_valid_acc = 0.0
best_epoch = 0

PATH = "ModelCNN.pt" #50.8 %
# Resume from the best checkpoint of a previous run, if there is one
if os.path.exists(PATH):
    print("Loading model from last checkpoint...")
    state = torch.load(PATH)
    ModelCNN.load_state_dict(state['best_state_dict'])
    best_valid_acc = state['best_valid_acc']
    best_epoch = state['epoch']
    has_checkpoint = True
print(f"Best Validations Accuracy so far: {best_valid_acc:.3f} at Epoch {best_epoch}\n")

for epoch in range(EPOCHS):
    # Epoch numbering continues from the checkpoint that was loaded
    real_epoch = best_epoch + epoch

    # The CNN-specific helpers are the ones defined in this notebook
    # (there is no plain `train` / `evaluate`).
    train_loss, train_acc = train_cnn(ModelCNN, X_train_iter_cnn, optimizer, criterion)
    valid_loss, valid_acc = evaluate_cnn(ModelCNN, X_val_iter_cnn, criterion)

    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        # Deep copy so later training steps cannot alter the saved weights
        best_state_dict = copy.deepcopy(ModelCNN.state_dict())
        print(f'Epoch {real_epoch: <{5}} | Train loss {train_loss:8.3f}| Train acc {train_acc:8.3f} | Valid loss {valid_loss:8.3f} | Valid acc {valid_acc:8.3f} | + ')

        # Checkpoint: best weights plus the hyper-parameters needed to rebuild the model.
        # 'embedding_dim' holds the integer dimension, not the nn.Embedding module itself.
        checkpoint = {
            'epoch': real_epoch,
            'best_valid_acc': best_valid_acc,
            'best_state_dict': best_state_dict,
            'embedding_dim':ModelCNN.embedding.embedding_dim,
            'n_filters':ModelCNN.n_filters,
            'filter_sizes':ModelCNN.filter_sizes
        }
        torch.save(checkpoint, PATH)
    else:
        print(f'Epoch {real_epoch: <{5}} | Train loss {train_loss:8.3f}| Train acc {train_acc:8.3f} | Valid loss {valid_loss:8.3f} | Valid acc {valid_acc:8.3f} |')

print(f"The best Model Accuracy: {best_valid_acc:.3f}")
print("The best Model has been saved")

## CNN Predictions

## Import test set

In [None]:
# Rebuild the test texts so this section can run on its own.
# (No spaCy reload here: preprocessing is pure pandas and `nlp` is already loaded above.)
test_preprocessed = data_preprocess(x=text,y=test,ref=reference)
test_preprocessed.to_csv("test_prepared.csv",index=None)
test_preprocessed.head()

### Load the best CNN model saved in the checkpoint

In [None]:
# Restore the best weights written by the training loop (PATH still points at the CNN checkpoint).
state = torch.load(PATH)
ModelCNN.load_state_dict(state['best_state_dict'])
best_valid_acc = state['best_valid_acc']
print(best_valid_acc)

In [None]:
# Re-score the restored model on the validation split with the CNN evaluation helper
# (there is no plain `evaluate` function in this notebook).
_, acc = evaluate_cnn(ModelCNN, X_val_iter_cnn, criterion)
print(f"CNN Validation Accuracy: {acc:.3f}")

### Make CNN Prediction

In [None]:
def predict_lstm(test,field, model):
    """
    Predict the class of a single raw text with a (text, lengths) model
    :param test: Raw text to classify
    :param field: Field used for training; ``field.process`` must return a ``(text, lengths)`` pair
    :param model: Trained module called as ``model(text, lengths)``
    :return: Index of the largest logit (int)
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    text, len_text = field.process([field.preprocess(test)])
    with torch.no_grad():  # inference only: no autograd graph needed
        logits = model(text.to(device), len_text)
    return logits.argmax().item()

In [None]:
def predict_cnn(model, field, text):
    """
    Predict the class of a single raw text with a text-only model
    :param model: Trained module called as ``model(text_tensor)``
    :param field: Field used for training; provides ``preprocess`` and ``process``
    :param text: Raw text to classify
    :return: Index of the largest logit (int)
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    batch = field.process([field.preprocess(text)])
    # as_tensor avoids re-copying (and the warning torch.tensor emits) when `process`
    # already returns a tensor
    x = torch.as_tensor(batch).to(device)
    with torch.no_grad():  # inference only: no autograd graph needed
        logits = model(x)
    y_pred = torch.max(logits, dim=-1)[1]
    return y_pred.item()

In [None]:
# Iterate over the preprocessed frame itself: it comes from an inner merge, so it is not
# guaranteed to have as many rows as `test`.
cnn_preds = [
    [row_id, predict_cnn(ModelCNN, TEXTcnn, combined)]
    for row_id, combined in zip(test_preprocessed.id, test_preprocessed.AllCombined)
]

In [None]:
# Collect the [id, predicted label] pairs into the submission frame.
submission_cnn = pd.DataFrame(data=cnn_preds,columns=['id','label'])

In [None]:
# Sanity checks: number of predictions and which labels were actually predicted.
print(f"CNN Prediction length: {submission_cnn.shape[0]}")
print(f"Unique Labels Predicted: {list(submission_cnn.label.unique())}")
submission_cnn.head()

In [None]:
# Write the CNN submission file without the index column.
submission_cnn.to_csv("submission_cnn.csv",index=None)

# LSTM Model for the text classification

In [None]:
# Fields definition
# Text field: lower-cased, tokenized with spacy_tokenize, then extended with n-grams by
# generate_n_grams. include_lengths=True makes each batch a (tensor, lengths) pair, which
# is what train_lstm / evaluate_lstm unpack.
TEXT_lstm = data.Field(
    sequential=True,
    lower=True,
    use_vocab=True,
    preprocessing=generate_n_grams,
    tokenize=spacy_tokenize,
    batch_first=True,
    include_lengths=True
)
# Label field: no vocabulary is built, so labels are used as they appear in the file.
LABEL_lstm = data.LabelField(sequential=False,use_vocab=False)

# One entry per CSV column, in file order; columns mapped to None are ignored
fields_lstm = [
    ("id",None),
    ("label",LABEL_lstm),
    ("title",None),
    ("AllCombined",TEXT_lstm)
]

# Load the training CSV as a TabularDataset; the header row is skipped because the
# column-to-field mapping comes from fields_lstm
textDataset = torchtext.data.TabularDataset(
    path="train_df.csv", 
    format="CSV",
    fields=fields_lstm,
    skip_header=True
)

In [None]:
# Dataset splitting
# torchtext's `split_ratio` is the share kept for TRAINING (the rest goes to the second
# split), so 0.8 gives the intended 80/20 train/validation split.
# random.seed() returns None, so the generator is seeded first and its state passed explicitly.
random.seed(SEED)
X_train_lstm, X_val_lstm = textDataset.split(
    split_ratio=0.8,
    random_state=random.getstate()
)

# Build the vocabulary on the training split only and attach the pre-trained FastText
# vectors; words without a pre-trained vector are initialised with torch.Tensor.normal_
TEXT_lstm.build_vocab(
    X_train_lstm,
    vectors=fast_text_vec,
    unk_init=torch.Tensor.normal_
)

# Find the padding index and the vocab size
padding_idx = TEXT_lstm.vocab.stoi["<pad>"]
vocab_size = len(TEXT_lstm.vocab)

In [None]:
# Create the bucket iterators: batches of 32 for both splits, sorted inside each batch
# by the token count of AllCombined, and placed on the GPU.
X_train_iter_lstm, X_val_iter_lstm = data.BucketIterator.splits(
    (X_train_lstm,X_val_lstm),
    batch_sizes=(32,32),
    sort_within_batch = True,
    sort_key=lambda x: len(x.AllCombined),
    device=torch.device('cuda')
)

In [None]:
# 4-layer bidirectional LSTM; embedding_dim=300 matches the zero vectors of size 300
# written into the embedding table in the next cell.
ModelLSTM = LSTMClassifier(
    vocab_size=vocab_size,
    num_layers=4,
    embedding_dim=300,
    hidden_dim=250,
    num_label=5,
    padding_idx=padding_idx,
    dropout=0.4,
    bidirectional=True,
    device=torch.device('cuda')
)

In [None]:
# Copy the pre-trained word vectors into the embedding table, then zero the rows of the
# unknown and padding tokens.
pretrained_embeddings = TEXT_lstm.vocab.vectors
ModelLSTM.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT_lstm.vocab.stoi[TEXT_lstm.unk_token]
ModelLSTM.embedding.weight.data[UNK_IDX] = torch.zeros(300)
ModelLSTM.embedding.weight.data[padding_idx] = torch.zeros(300)

In [None]:
# CONFIGURATION
# Move the model to the GPU and create the optimizer over all of its parameters.
ModelLSTM.cuda()
LR = 1e-4
optimizer = Adam(ModelLSTM.parameters(), lr=LR)

# Linear decay over one step per batch per epoch, no warm-up. EPOCHS is the global
# defined in the CNN hyper-parameter cell.
# NOTE(review): the LSTM training cell calls train_lstm with scheduler=None, so this
# scheduler is never stepped there.
total_steps = len(X_train_iter_lstm) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
# Cross-entropy over the class logits, moved to the GPU.
criterion = nn.CrossEntropyLoss()
criterion.cuda()

In [None]:
best_valid_acc, best_epoch = 0.0, 0

# model 2 100 x 300, 2 layers, unidirectional --> 80.9
PATH = "ModelLSTM.pt"  # 82.9 %
# Resume from the best checkpoint of a previous run, if there is one
if os.path.exists(PATH):
    print("Loading model from last checkpoint...")
    state = torch.load(PATH)
    ModelLSTM.load_state_dict(state['best_state_dict'])
    best_valid_acc, best_epoch = state['best_valid_acc'], state['epoch']
    has_checkpoint = True

for epoch in range(EPOCHS):
    # Epoch numbering continues from the checkpoint that was loaded
    real_epoch = best_epoch + epoch

    train_loss, train_acc = train_lstm(ModelLSTM, X_train_iter_lstm, optimizer, criterion, scheduler=None)
    valid_loss, valid_acc = evaluate_lstm(ModelLSTM, X_val_iter_lstm, criterion)

    # No improvement: just report the epoch and move on
    if not valid_acc > best_valid_acc:
        print(f'Epoch {real_epoch: <{5}} | Train loss {train_loss:8.3f}| Train acc {train_acc:8.3f} | Valid loss {valid_loss:8.3f} | Valid acc {valid_acc:8.3f} |')
        continue

    # New best: remember it (deep copy so later steps cannot alter the saved weights)
    best_valid_acc = valid_acc
    best_state_dict = copy.deepcopy(ModelLSTM.state_dict())
    print(f'Epoch {real_epoch: <{5}} | Train loss {train_loss:8.3f}| Train acc {train_acc:8.3f} | Valid loss {valid_loss:8.3f} | Valid acc {valid_acc:8.3f} | + ')

    # Checkpoint: best weights plus the hyper-parameters needed to rebuild the model
    checkpoint = {
        'epoch': real_epoch,
        'best_valid_acc': best_valid_acc,
        'best_state_dict': best_state_dict,
        'vocab_size': ModelLSTM.vocab_size,
        'embedding_dim': ModelLSTM.embedding_dim,
        'num_layers': ModelLSTM.num_layers,
        'bidirectional': ModelLSTM.bidirectional,
        'hidden_dim': ModelLSTM.hidden_dim,
    }
    torch.save(checkpoint, PATH)

print(f"The best Model Accuracy: {best_valid_acc:.3f}")
print("The best Model has been saved")

## LSTM Predictions

In [None]:
# Restore the best LSTM weights written by the training loop.
PATH = f"ModelLSTM.pt" #82.9 %
state = torch.load(PATH)
ModelLSTM.load_state_dict(state['best_state_dict'])
best_valid_acc = state['best_valid_acc']
print(best_valid_acc)

In [None]:
def predict_lstm(test,field, model):
    """
    Predict the class of a single raw text with a (text, lengths) model
    :param test: Raw text to classify
    :param field: Field used for training; ``field.process`` must return a ``(text, lengths)`` pair
    :param model: Trained module called as ``model(text, lengths)``
    :return: Index of the largest logit (int)
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    text, len_text = field.process([field.preprocess(test)])
    with torch.no_grad():  # inference only: no autograd graph needed
        logits = model(text.to(device), len_text)
    return logits.argmax().item()

In [None]:
# Iterate over the preprocessed frame itself: it comes from an inner merge, so it is not
# guaranteed to have as many rows as `test`.
lstm_preds = [
    [row_id, predict_lstm(combined, TEXT_lstm, ModelLSTM)]
    for row_id, combined in zip(test_preprocessed.id, test_preprocessed.AllCombined)
]

In [None]:
# Collect the [id, predicted label] pairs and write the LSTM submission file
# (named like the CNN one: the previous name had a typo and no .csv extension).
submission_lstm = pd.DataFrame(data=lstm_preds,columns=['id','label'])
submission_lstm.to_csv('submission_lstm.csv',index=None)
submission_lstm.head()

# Transformer BERT model

## Dataset preprocessing

In [None]:
class TextClfDataset:
    def __init__(self, obs_id, text, label, tokenizer, max_len=512):
        """
        Map-style dataset that encodes texts for a BERT classifier
        :param obs_id: Indexable collection of observation ids, aligned with ``text``
        :param text: Indexable collection of texts to encode
        :param label: Indexable collection of labels, or None for unlabelled data
        :param tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer)
        :param max_len: Length every encoded sequence is padded to
        """
        self.id = obs_id
        self.text = text
        self.label = label
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """
        Encode one observation
        :param item: Position of the observation
        :return: Dict of tensors with the keys ``input_ids``, ``mask``, ``token_type_ids``,
            ``obs_id`` and, when labels were given, ``label``
        """
        encoded = self.tokenizer.encode_plus(
            str(self.text[item]),
            add_special_tokens=True,
            max_length=self.max_len
        )

        # Right-pad the three sequences with zeros up to max_len
        pad = [0] * (self.max_len - len(encoded["input_ids"]))
        sample = {
            "input_ids": torch.tensor(encoded["input_ids"] + pad),
            "mask": torch.tensor(encoded["attention_mask"] + pad),
            "token_type_ids": torch.tensor(encoded["token_type_ids"] + pad),
            "obs_id": torch.tensor(self.id[item]),
        }
        if self.label is not None:
            sample["label"] = torch.tensor(self.label[item])
        return sample

In [None]:
class BertClassifier(nn.Module):
    def __init__(self, bert_path, dropout, n_class):
        """
        BERT encoder followed by dropout and a linear classification layer
        :param bert_path: Name or path handed to ``BertModel.from_pretrained``
        :param dropout: Dropout probability applied before the linear layer
        :param n_class: Number of classes
        """
        super(BertClassifier, self).__init__()
        self.bert_path = bert_path
        self.n_class = n_class
        self.bert = BertModel.from_pretrained(self.bert_path)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, self.n_class)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """
        Forward pass
        :param input_ids: Token ids
        :param attention_mask: Attention mask aligned with ``input_ids``
        :param token_type_ids: Segment ids aligned with ``input_ids``
        :return: Logits, one row per input sequence and ``n_class`` columns
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # Take the second output (the one the original tuple unpacking used) by index:
        # this works whether the encoder returns a plain tuple or a ModelOutput, whereas
        # unpacking a ModelOutput iterates over its keys.
        out = outputs[1]
        out = self.dropout(out)
        out = self.fc(out)
        return out

In [None]:
def train_bert(data_loader, model, optimizer, device, criterion, scheduler=None):
    """
    Run one training epoch of a model that computes its own loss
    :param data_loader: Sized iterable of dict batches with the keys ``input_ids``, ``mask``,
        ``token_type_ids`` and ``label``
    :param model: Module called with ``input_ids``, ``attention_mask``, ``token_type_ids`` and
        ``labels``; its output is indexed as ``[0]`` = loss, ``[1]`` = logits
    :param optimizer: Optimizer stepped once per batch
    :param device: Device the batch tensors are moved to
    :param criterion: Unused here (the loss comes from the model); kept for the call signature
    :param scheduler: Optional learning-rate scheduler stepped once per batch
    :return: (mean loss per batch, accuracy over all seen examples)
    """
    loss_sum = 0.0
    n_correct = 0.0
    n_seen = 0.0
    model.train()
    for d in data_loader:
        # Move the batch to the target device
        ids = d["input_ids"].to(device)
        mask = d["mask"].to(device)
        token_type_ids = d["token_type_ids"].to(device)
        label = d["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=ids, attention_mask=mask,
                        token_type_ids=token_type_ids, labels=label)
        loss, logits = outputs[0], outputs[1]
        _, predictions = torch.max(logits, dim=-1)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        loss_sum += loss.item()
        n_correct += torch.eq(predictions, label).sum().item()
        n_seen += label.size(0)

    avg_train_loss = loss_sum / len(data_loader)  # mean over the batches
    avg_train_acc = n_correct / n_seen  # share of correct predictions over the whole set
    return avg_train_loss, avg_train_acc

In [None]:
def eval_bert(data_loader, model, device, criterion):
    """
    Evaluate a model that computes its own loss (eval mode, no gradient)
    :param data_loader: Sized iterable of dict batches with the keys ``input_ids``, ``mask``,
        ``token_type_ids`` and ``label``
    :param model: Module called with ``input_ids``, ``attention_mask``, ``token_type_ids`` and
        ``labels``; its output is indexed as ``[0]`` = loss, ``[1]`` = logits
    :param device: Device the batch tensors are moved to
    :param criterion: Unused here (the loss comes from the model); kept for the call signature
    :return: (mean loss per batch, accuracy over all seen examples)
    """
    loss_sum = 0.0
    n_correct = 0.0
    n_seen = 0.0
    model.eval()
    with torch.no_grad():
        for d in data_loader:
            ids = d["input_ids"].to(device)
            mask = d["mask"].to(device)
            token_type_ids = d["token_type_ids"].to(device)
            label = d["label"].to(device)

            outputs = model(input_ids=ids, attention_mask=mask,
                            token_type_ids=token_type_ids, labels=label)
            loss, logits = outputs[0], outputs[1]
            _, predictions = torch.max(logits, dim=-1)

            loss_sum += loss.item()
            n_correct += torch.eq(predictions, label).sum().item()
            n_seen += label.size(0)

    avg_valid_loss = loss_sum / len(data_loader)
    avg_valid_acc = n_correct / n_seen
    return avg_valid_loss, avg_valid_acc

In [None]:
def run():
    """
    Train the classification head of ``bert-base-uncased`` on ``train_df.csv``.

    The data is split 80/20 into train and validation sets, every parameter whose name
    starts with ``bert`` is frozen, and the weights with the best validation accuracy are
    saved to ``ModelBERT.pt``. If that file already exists, training resumes from it.
    :return: None
    """
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 10
    EPOCHS = 5

    # `id` must be loaded too: the datasets below read it from both splits
    df = pd.read_csv("train_df.csv", usecols=['id', 'label', 'title', 'AllCombined'])
    df_train, df_valid = train_test_split(df, test_size=0.2, random_state=42)

    # reset_index returns a new frame, so the result has to be kept
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    train_dataset = TextClfDataset(
        obs_id=df_train.id.values,
        text=df_train.AllCombined.values,
        label=df_train.label.values,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    train_data_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=TRAIN_BATCH_SIZE,
    )

    valid_dataset = TextClfDataset(
        obs_id=df_valid.id.values,  # validation ids, aligned with the validation texts
        text=df_valid.AllCombined.values,
        label=df_valid.label.values,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    valid_data_loader = DataLoader(
        valid_dataset,
        sampler=SequentialSampler(valid_dataset),
        batch_size=TRAIN_BATCH_SIZE
    )

    # Use the GPU when there is one, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ModelBERT = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=5)
    ModelBERT.to(device)

    print(
        f"Nbr of parameters before freezing bert layers: "
        f"{sum(p.numel() for p in ModelBERT.parameters() if p.requires_grad)}"
    )
    # We freeze all the bert layers (encoder and embeddings layers)
    for name, param in ModelBERT.named_parameters():
        if name.startswith('bert'):
            param.requires_grad = False
    print(
        f"Nbr of parameters After freezing bert layers: "
        f"{sum(p.numel() for p in ModelBERT.parameters() if p.requires_grad)}"
    )

    # The model computes its own loss; the criterion is only handed over because
    # train_bert / eval_bert take it as an argument
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    lr = 2e-3
    optimizer = AdamW(ModelBERT.parameters(), eps = 1e-8, lr=lr)
    # Linear decay over one step per batch per epoch, no warm-up
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    best_valid_acc = 0.0
    best_epoch = 0

    PATH = "ModelBERT.pt" #50.8 %
    # Resume from the best checkpoint of a previous run, if there is one
    if os.path.exists(PATH):
        print("Loading model from last checkpoint...")
        state = torch.load(PATH, map_location=device)
        ModelBERT.load_state_dict(state['best_state_dict'])
        best_valid_acc = state['best_valid_acc']
        best_epoch = state['epoch']
    print(f"Best Validations Accuracy so far: {best_valid_acc:.3f} at Epoch {best_epoch}\n")

    for epoch in range(EPOCHS):
        # Epoch numbering continues from the checkpoint that was loaded
        real_epoch = best_epoch + epoch
        # Measure how long the training epoch takes.
        t0 = time.time()

        train_loss, train_acc = train_bert(train_data_loader, ModelBERT, optimizer, device, criterion, scheduler)
        valid_loss, valid_acc = eval_bert(valid_data_loader, ModelBERT, device, criterion)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            # Deep copy so later training steps cannot alter the saved weights
            best_state_dict = copy.deepcopy(ModelBERT.state_dict())
            print(f'Epoch {real_epoch: <{5}} | Elapsed Time {training_time: <{10}} | Train loss {train_loss:8.3f}| Train acc {train_acc:8.3f} | Valid loss {valid_loss:8.3f} | Valid acc {valid_acc:8.3f} | + ')

            # Let's create the checkpoint data to save
            checkpoint = {
                'epoch': real_epoch,
                'best_valid_acc': best_valid_acc,
                'best_state_dict': best_state_dict,
            }
            torch.save(checkpoint, PATH)
        else:
            print(f'Epoch {real_epoch: <{5}} | Elapsed Time {training_time: <{10}} | Train loss {train_loss:8.3f}| Train acc {train_acc:8.3f} | Valid loss {valid_loss:8.3f} | Valid acc {valid_acc:8.3f} |')

    # End of training
    print(f"The best Model Accuracy: {best_valid_acc:3.3f}")
    print("The best Model has been saved")

In [None]:
# Launch the BERT training; the best weights are written to ModelBERT.pt.
run()

## BERT Predictions

In [None]:
# Rebuild the 5-class BERT classifier and restore the best weights saved by run().
ModelBERT = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=5)
PATH = f"ModelBERT.pt" #82.9 %
state = torch.load(PATH)
ModelBERT.load_state_dict(state['best_state_dict'])
best_valid_acc = state['best_valid_acc']
print(best_valid_acc)

In [None]:
# Preview the test frame that is about to be scored.
test_preprocessed.head()

In [None]:
def predict_bert(data_loader, model, device):
    """
    Predict the class of every observation served by a data loader
    :param data_loader: Iterable of dict batches with the keys ``input_ids``, ``mask``,
        ``token_type_ids`` and ``obs_id``
    :param model: Module called with ``input_ids``, ``attention_mask`` and ``token_type_ids``;
        its output is indexed as ``[0]`` = logits
    :param device: Device the batch tensors are moved to
    :return: (predictions, ids), two lists holding one inner list per batch
    """
    batch_preds = []
    batch_ids = []
    model.eval()
    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            print(f"Step {bi}...")
            ids = d["input_ids"].to(device)
            mask = d["mask"].to(device)
            obs_ids = d["obs_id"].to(device)
            token_type_ids = d["token_type_ids"].to(device)

            # Keep the ids next to the predictions so both can be paired up afterwards
            batch_ids.append(obs_ids.to('cpu').numpy().tolist())

            logits = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids
            )[0]

            _, preds = torch.max(logits, dim=1)
            batch_preds.append(preds.detach().cpu().numpy().tolist())

    return (batch_preds, batch_ids)

In [None]:
def remove_nestings(ls):
    """
    Flatten a list nested exactly two levels deep
    :param ls: Iterable of iterables
    :return: New flat list with the inner elements in their original order
    """
    return [leaf for group in ls for leaf in group]

In [None]:
def predict_test(df_test, model):
    """
    Score a test frame with a BERT sequence classifier
    :param df_test: DataFrame with the columns ``id`` and ``AllCombined``
    :param model: Trained classifier, called by ``predict_bert``
    :return: (predictions, ids), two flat lists aligned with each other and with the
        row order of ``df_test``
    """
    MAX_LEN = 512
    TEST_BATCH_SIZE = 300

    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    test_dataset = TextClfDataset(
        obs_id = df_test.id.values,
        text=df_test.AllCombined.values,
        label=None,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    # Sequential order: shuffling brings nothing at inference time and would make the
    # output row order differ from one run to the next
    test_data_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=TEST_BATCH_SIZE,
    )

    # Use the GPU when there is one, otherwise fall back to the CPU
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(DEVICE)

    t0 = time.time()

    print(f"Total Step: {len(test_data_loader)}")

    y_pred, pred_ids = predict_bert(test_data_loader, model, DEVICE)
    prediction_time = format_time(time.time() - t0)
    # predict_bert returns one inner list per batch: flatten both results
    y_pred = remove_nestings(y_pred)
    pred_ids = remove_nestings(pred_ids)
    print(f"Elapsed {prediction_time: <{6}} | Test Data Size {df_test.shape} | Predictions List Size {len(y_pred)} | IDs List Size {len(pred_ids)}")

    return (y_pred, pred_ids)

In [None]:
# Score the test set; `out` is the (predictions, ids) pair returned by predict_test.
out = predict_test(test_preprocessed,ModelBERT)

In [None]:
# Pair every id (out[1]) with its predicted label (out[0]) as [id, label] rows.
y_pred_final = [[out[1][i], out[0][i]] for i in range(len(out[0]))]

In [None]:
# Collect the [id, predicted label] pairs and write the BERT submission file
# (with a .csv extension, like the CNN submission).
submission_bert = pd.DataFrame(data=y_pred_final,columns=['id','label'])
submission_bert.to_csv('submission_bert.csv',index=None)
submission_bert.head()