In [None]:
import os
# Specify which GPU to use, because there is no option for that in Trainer.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from torch import nn
from transformers import BertModel
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification

In [None]:
# Load the tokenizer that belongs to the 'SZTAKI-HLT/hubert-base-cc' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'SZTAKI-HLT/hubert-base-cc',
)

In [None]:
# Load the train and dev splits. Both files are ';'-separated and use their
# first column as the index; na_filter=False turns off pandas' missing-value
# detection for these reads.
path_train = "train_short_impossible.csv"
train_df = pd.read_csv(path_train, sep=";", index_col=0, na_filter=False)

path_dev = "dev_short_impossible.csv"
dev_df = pd.read_csv(path_dev, sep=";", index_col=0, na_filter=False)

In [None]:
# Convert every 'is_impossible' value of the training split with int().
train_df['is_impossible'] = train_df['is_impossible'].map(int)

In [None]:
# Convert every 'is_impossible' value of the dev split with int().
dev_df['is_impossible'] = dev_df['is_impossible'].map(int)

In [None]:
# Tokenization settings read as globals by the tokenization helpers in this file.
max_length = 384  # forwarded to the tokenizer as `max_length`
doc_stride = 128  # forwarded to the tokenizer as `stride`
# True when the tokenizer pads on the right; decides whether the question or
# the context is handed to the tokenizer first.
pad_on_right = (tokenizer.padding_side == "right")
def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs, padded to ``max_length``.

    Args:
        examples: Mapping with "question" and "context" entries, each a
            sequence of strings. The "question" entry is overwritten in place
            with the left-stripped questions.

    Returns:
        The object returned by the module-level ``tokenizer`` for the batch.
    """
    # Leading whitespace in a question is useless and eats into the token
    # budget, which can make truncating the context fail -- strip it first.
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The padding side fixes the order of the pair; in both layouts the
    # truncation strategy targets the context, never the question.
    if pad_on_right:
        first_key, second_key, strategy = "question", "context", "only_second"
    else:
        first_key, second_key, strategy = "context", "question", "only_first"

    # NOTE(review): `stride` is passed but `return_overflowing_tokens` is not,
    # so presumably only one window per example is returned -- confirm against
    # the tokenizer documentation.
    return tokenizer(
        examples[first_key],
        examples[second_key],
        truncation=strategy,
        max_length=max_length,
        stride=doc_stride,
        padding="max_length",
    )

In [None]:
def tokenize_text(text):
    """Tokenize a single question/context pair.

    Args:
        text: Mapping with a "question" and a "context" entry.

    Returns:
        The object returned by the module-level ``tokenizer``, called with
        ``padding="max_length"`` and ``return_tensors="pt"``.
    """
    # The padding side fixes the order of the pair; the truncation strategy
    # always targets the context.
    if pad_on_right:
        first, second = text["question"], text["context"]
        strategy = "only_second"
    else:
        first, second = text["context"], text["question"]
        strategy = "only_first"

    return tokenizer(
        first,
        second,
        truncation=strategy,
        max_length=max_length,
        stride=doc_stride,
        padding="max_length",
        return_tensors="pt",
    )


class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized question/context pairs and their labels.

    Every row is tokenized once, at construction time. Indexing returns an
    ``(encoding, label)`` pair, where ``encoding`` is what the tokenizer
    returned for that row and ``label`` is the row's ``is_impossible`` value
    wrapped in ``np.array``.

    NOTE: this class has the same name as ``datasets.Dataset`` imported at the
    top of the file and shadows that import from here on.
    """

    def __init__(self, df):
        """Tokenize all rows of ``df``.

        Args:
            df: DataFrame with "question", "context" and "is_impossible"
                columns. The frame is left unmodified.
        """
        self.labels = df['is_impossible'].tolist()
        # Strip leading whitespace from the questions on a new frame, so the
        # caller's DataFrame is not mutated as a side effect.
        stripped = df.assign(question=[q.lstrip() for q in df["question"]])
        self.texts = self.tokenize_texts(stripped, tokenizer, max_length, doc_stride, pad_on_right)

    def tokenize_texts(self, train_df, tokenizer, max_length, doc_stride, pad_on_right=False):
        """Tokenize each row of ``train_df`` into fixed-length PyTorch tensors.

        Args:
            train_df: DataFrame with "question" and "context" columns.
            tokenizer: Callable tokenizer applied to each pair.
            max_length: Forwarded to the tokenizer as ``max_length``.
            doc_stride: Forwarded to the tokenizer as ``stride``.
            pad_on_right: If true the question is passed first and the context
                is the truncated member; otherwise the order is reversed.

        Returns:
            A list with one tokenizer output per row, in row order.
        """
        if pad_on_right:
            first_col, second_col, truncation = "question", "context", "only_second"
        else:
            first_col, second_col, truncation = "context", "question", "only_first"

        # Iterate the two columns directly; this avoids building a Series per
        # row the way DataFrame.iterrows does.
        return [
            tokenizer(
                first,
                second,
                truncation=truncation,
                max_length=max_length,
                stride=doc_stride,
                padding="max_length",
                return_tensors="pt",
            )
            for first, second in zip(train_df[first_col], train_df[second_col])
        ]

    def classes(self):
        """Return the list of labels, one per example."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(encoding, label)`` pair for ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalized logits. No activation is applied to
    them: ``nn.CrossEntropyLoss`` expects unnormalized logits, and a ReLU on
    the output would clamp every negative logit to zero.
    """

    def __init__(self, dropout=0.5, model_name='SZTAKI-HLT/hubert-base-cc', num_classes=2):
        """Build the encoder and the classification head.

        Args:
            dropout: Dropout probability applied to the pooled encoder output.
            model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return the class logits for a batch.

        Args:
            input_id: Token ids, passed to the encoder as ``input_ids``.
            mask: Attention mask, passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head applied to the dropped-out pooled
            encoder output.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.linear(self.dropout(pooled_output))

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs, batch_size=16):
    """Fine-tune ``model`` and print loss and accuracy after every epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one row
            of class logits per example.
        train_data: Training examples, wrapped with the ``Dataset`` class
            defined in this file.
        val_data: Validation examples, wrapped the same way.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training data.
        batch_size: Batch size used by both data loaders.

    Returns:
        None. The model is updated in place and is left in eval mode after the
        last validation pass.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        # Enable dropout for the optimisation pass.
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).long()
            # Drop the singleton dimension (dim 1) from both tensors so the
            # mask has the same layout as the token ids.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # The criterion returns the batch mean; weight it by the batch
            # size so that dividing by the sample count below yields the
            # per-sample average.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # The whitespace at the start of the continuation lines is part of the
        # printed string; it is kept exactly as it was.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

# Hyper-parameters of the fine-tuning run.
LR = 2e-5
EPOCHS = 5

# Build the classifier, then train on the train split and validate on the dev split.
model = BertClassifier()
train(model, train_df, dev_df, learning_rate=LR, epochs=EPOCHS)