## Kaggle Tweet Sentiment Extraction

This notebook is built on the awesome tutorial made by Abhishek [here](https://www.youtube.com/watch?v=XaQ0CBlQ4cY). Thanks man!!

## Colab_setup

In [None]:
from pathlib import Path
import os
from google.colab import drive

In [None]:
def create_path(path):
    """Make sure a directory exists and return it as a ``Path``.

    Args:
        path: Directory to create, given as a ``str`` or ``pathlib.Path``.
            Missing parent directories are created as well.

    Returns:
        pathlib.Path: The requested location.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # Coerce first so that plain strings work too (``str`` has no ``mkdir``).
    path = Path(path)
    # ``exist_ok=True`` turns this into a no-op for an existing directory, so a
    # separate ``isdir`` check beforehand is not needed.
    path.mkdir(parents=True, exist_ok=True)
    return path

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (prompts for an authorization code).
drive.mount('/content/drive')
root_dir = Path('/content/drive/My Drive')
# Project folder on the mounted drive; created if it does not exist yet.
base_path = create_path(root_dir/'Kaggle_Twitter_Bert')
# Bare expression so the cell displays the resulting path.
base_path

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


PosixPath('/content/drive/My Drive/Kaggle_Twitter_Bert')

In [None]:
colab_path = Path('/content')

In [None]:
data_path = create_path(base_path/'dataset')

In [None]:
model_path = create_path(base_path/'models')

In [None]:
bert_path = (create_path(colab_path/'input/bert_uncased'))

In [None]:
# TODO: merge the path-setup cells above into a single cell

## Download Data

In [None]:
url = "'https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/16295/1099992/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1587716090&Signature=nKyFPV0iiqPX3alWIj8DDbm2kpW%2Bb2jWgFU06dHIIFFC5%2FIif380Zs32%2F7FM4pMD3xNgGGsYcnZVhxbyn1T7FQfsUZ%2B5SeEIHn%2BEeOVAOrz1GVJs9fzVQ%2B%2FTHGfhvgJaOzejXcwONLyUAH2%2FkRnRvrq%2Bx6ghUwDLHeKnjmQpVYGztyFlX3eGNHBmKVvjmtGVhnZp28jx7lQ0qoVOSE2%2BoGYCsByw%2BK9Rqh8SNQW8NEt6TGQ%2B0HOCJB8tWwio96qZ%2Byyq3tVqaZLD8%2BdFi14MotnsnpvuiE84AmZTdtFa%2FnH4s6aV%2BCtBS%2FPYYrmM9nUmrTcZeGMEblMm17C93NzSGQ%3D%3D&response-content-disposition=attachment%3B+filename%3Dtweet-sentiment-extraction.zip'"

In [None]:
# uncomment this if running for first time

# os.chdir(data_path)
# !wget -q {str(url)} -O temp.zip && unzip -q temp.zip && rm 'temp.zip'
# os.chdir(colab_path)

## Download Vocab

In [None]:
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"

In [None]:
# Switch into bert_path so wget writes vocab.txt there, then switch back to
# colab_path afterwards.
os.chdir(bert_path)
!wget -q {str(url)} -O 'vocab.txt'
os.chdir(colab_path)

## Config

In [None]:
# Install the two Hugging Face packages imported in the next cell (quiet mode).
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases
# than the ones this notebook was written against — consider pinning them.
!pip -q install transformers
!pip -q install tokenizers

[K     |████████████████████████████████| 573kB 4.9MB/s 
[K     |████████████████████████████████| 3.7MB 22.1MB/s 
[K     |████████████████████████████████| 1.0MB 50.8MB/s 
[K     |████████████████████████████████| 890kB 48.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
import tokenizers

class Config:
    """Central place for hyper-parameters, file locations and the tokenizer.

    All values are set as instance attributes in ``__init__``; the paths are
    derived from the module-level ``model_path``, ``data_path`` and
    ``bert_path`` and stored as plain strings.
    """

    def __init__(self):
        # --- sequence length and file locations ---
        self.MAX_LEN = 512
        self.SAVE_MODEL_PATH = str(model_path / 'finetuned-bert.pth')
        self.DATA_PATH = str(data_path / 'train_fold.csv')
        self.BERT_PATH = str(bert_path / 'finetuned-bert.pth')
        self.VOCAB_PATH = str(bert_path / 'vocab.txt')

        # --- training hyper-parameters ---
        self.TRAIN_BATCH_SIZE = 8
        self.VALID_BATCH_SIZE = 4
        self.NUM_EPOCHS = 10
        self.MODEL_NAME = 'bert-base-uncased'

        # WordPiece tokenizer from the `tokenizers` package, built from the
        # local vocab file. (An earlier variant used
        # transformers.BertTokenizer.from_pretrained(self.MODEL_NAME,
        # do_lower_case=True, force_download=True) instead.)
        vocab_file = self.VOCAB_PATH
        self.TOKENIZER = tokenizers.BertWordPieceTokenizer(vocab_file, lowercase=True)

config = Config()

## Dataset

In [None]:
import torch
# Handling this problem as a question answering problem means that we take in 
# question <sentiment> and we expect an answer <extracted_text> from the original tweet
def process_data(tweet, extracted_text, sentiment, tokenizer, max_len):
    """Convert one (tweet, selected text, sentiment) triple into model inputs.

    The task is framed as question answering: the sentiment word plays the
    role of the question and the selected text is the answer span inside the
    tweet. The input sequence is laid out as
    ``[CLS] <sentiment> [SEP] <tweet tokens> [SEP]`` and zero-padded up to
    ``max_len``.

    Args:
        tweet: Tweet text.
        extracted_text: The part of ``tweet`` that carries the sentiment.
        sentiment: ``'neutral'``, ``'positive'`` or ``'negative'``.
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets`` with the [CLS]/[SEP] entries at the first and last
            position.
        max_len: Length the sequences are padded to. Sequences that are
            already longer are returned unpadded and untruncated.

    Returns:
        dict: ``input_ids``, ``token_ids`` (segment ids), ``mask``,
        ``target_start``, ``target_end`` and ``tweet_offsets`` as long
        tensors, plus the raw ``tweet``, ``extracted_text`` and ``sentiment``.
        The targets are token positions inside ``input_ids``. When the
        selected text cannot be located in the tweet, the targets span all
        tweet tokens.

    Raises:
        KeyError: If ``sentiment`` is not one of the three known labels.
    """
    # --- 1. mark the answer at character level ---------------------------
    # Done before tokenisation so that partially selected words still map to
    # the tokens they overlap with.
    char_mask = [0] * len(tweet)
    idx_start = tweet.find(extracted_text) if extracted_text else -1
    if idx_start >= 0:
        char_mask[idx_start:idx_start + len(extracted_text)] = [1] * len(extracted_text)

    # --- 2. tokenise the tweet -------------------------------------------
    encoded = tokenizer.encode(tweet)
    # The encoding starts with [CLS] and ends with [SEP]; reuse their ids
    # rather than encoding the literal marker strings separately.
    cls_id, sep_id = encoded.ids[0], encoded.ids[-1]
    tweet_ids = list(encoded.ids[1:-1])
    tweet_offsets = list(encoded.offsets[1:-1])

    # --- 3. tokens that overlap the marked characters --------------------
    # The offsets are character positions, so they must index the character
    # mask (not the token id list).
    targets = [
        i for i, (start_off, end_off) in enumerate(tweet_offsets)
        if any(char_mask[start_off:end_off])
    ]
    if not targets:
        # Selected text not found (or tweet produced no tokens): fall back to
        # the full tweet span instead of failing on an empty list.
        targets = [0, max(len(tweet_ids) - 1, 0)]
    target_start, target_end = targets[0], targets[-1]

    # --- 4. the "question": a single token for the sentiment -------------
    sentiment_words = {
        'neutral': 'Neutral',
        'positive': 'Positive',
        'negative': 'Negative',
    }
    # Use the tokenizer that was passed in, and encode only the word needed.
    sentiment_id = tokenizer.encode(sentiment_words[sentiment]).ids[1]

    # --- 5. assemble [CLS] question [SEP] tweet [SEP] ---------------------
    qa_inputs = [cls_id, sentiment_id, sep_id] + tweet_ids + [sep_id]
    # Segment ids: 0 for the question part, 1 for the tweet and final [SEP].
    token_ids = [0, 0, 0] + [1] * (len(tweet_ids) + 1)
    mask = [1] * len(qa_inputs)
    tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
    # Shift the targets past the three prefix tokens.
    target_start += 3
    target_end += 3

    # --- 6. zero-pad up to max_len ----------------------------------------
    padding_sz = max_len - len(qa_inputs)
    if padding_sz > 0:
        qa_inputs = qa_inputs + [0] * padding_sz
        token_ids = token_ids + [0] * padding_sz
        mask = mask + [0] * padding_sz
        tweet_offsets = tweet_offsets + [(0, 0)] * padding_sz

    return {
        'input_ids': torch.tensor(qa_inputs).long(),
        'token_ids': torch.tensor(token_ids).long(),
        'mask': torch.tensor(mask).long(),
        'target_start': torch.tensor(target_start).long(),
        'target_end': torch.tensor(target_end).long(),
        'tweet_offsets': torch.tensor(tweet_offsets).long(),
        'tweet': tweet,
        'extracted_text': extracted_text,
        'sentiment': sentiment,
    }


class TwitterDataset:
    """Indexable dataset that returns the ``process_data`` features of one tweet.

    Holds parallel sequences of tweets, selected texts and sentiments plus the
    tokenizer and padding length that are forwarded to ``process_data``.
    """

    def __init__(self, tweet, extracted_text, sentiment, tokenizer, max_len):
        self.tweet = tweet
        self.extracted_text = extracted_text
        self.sentiment = sentiment
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of tweets held by the dataset."""
        return len(self.tweet)

    @staticmethod
    def _squash_whitespace(value):
        """Cast to ``str`` and collapse every whitespace run into one space."""
        return ' '.join(str(value).split())

    def __getitem__(self, idx):
        """Return the feature dict for the sample at position ``idx``."""
        return process_data(
            self._squash_whitespace(self.tweet[idx]),
            self._squash_whitespace(self.extracted_text[idx]),
            self.sentiment[idx],
            self.tokenizer,
            self.max_len,
        )

## Model

In [None]:
import torch.nn as nn
import transformers

# make a pythorch model
class Bert(nn.Module):
    """BERT encoder with a linear head that scores every token as span start / end.

    Args:
        xtra_config: Model configuration passed to
            ``transformers.BertModel.from_pretrained``. It must have
            ``output_hidden_states`` enabled, because ``forward`` reads the
            per-layer hidden states.
    """

    def __init__(self, xtra_config):
        super().__init__()
        # Pretrained encoder; the checkpoint name comes from the global config.
        self.bert = transformers.BertModel.from_pretrained(config.MODEL_NAME, config=xtra_config)
        self.drop = nn.Dropout(0.3)

        # The head sees two hidden states concatenated along the feature axis,
        # hence twice the encoder width. Read the width from the config when
        # available (768 was previously hard-coded).
        hidden_size = getattr(xtra_config, 'hidden_size', 768)
        self.head = nn.Linear(hidden_size * 2, 2)

        torch.nn.init.normal_(self.head.weight, std=0.02)

    def forward(self, stoi, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, one score per input position.

        Args:
            stoi: Input token ids.
            mask: Attention mask.
            token_type_ids: Segment ids.

        Returns:
            tuple: Two tensors obtained by splitting the head output along its
            last axis and squeezing that axis away.
        """
        outputs = self.bert(input_ids=stoi,
                            attention_mask=mask,
                            token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: the third entry holds the hidden
        # states of all layers. Positional indexing works both for the plain
        # tuple the model returned here originally and for output objects that
        # support integer indexing — NOTE(review): confirm against the
        # installed transformers version.
        hidden_states = outputs[2]

        # Use the last two layers rather than only the last one, as advised in
        # https://bert-as-service.readthedocs.io/en/latest/section/faq.html#why-not-the-last-hidden-layer-why-second-to-last
        h0, h1 = hidden_states[-2], hidden_states[-1]

        # Concatenate on the feature axis, regularise, then project to 2 scores.
        features = torch.cat((h0, h1), dim=-1)
        features = self.drop(features)
        logits = self.head(features)

        # Chunks of size 1 along the last axis: first is start, second is end.
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

## Train_utils

In [None]:
from tqdm import notebook
import numpy as np

def loss_func(pred_target_start, pred_target_end, actual_start, actual_end):
    """Sum of the cross-entropy losses of the start and the end predictions.

    Args:
        pred_target_start: Logits for the start position.
        pred_target_end: Logits for the end position.
        actual_start: Ground-truth start indices.
        actual_end: Ground-truth end indices.

    Returns:
        Scalar tensor: start loss plus end loss.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(pred_target_start, actual_start) + criterion(pred_target_end, actual_end)

def train_fn(dataloader, model, optimizer, scheduler, device):
    """Run one training epoch over ``dataloader``.

    Each batch goes through a forward pass, the summed start/end loss, a
    backward pass and one optimizer plus scheduler step. The running average
    loss is shown on the progress bar. A running Jaccard average is
    accumulated as well, but it is neither displayed nor returned.

    Args:
        dataloader: Yields batches shaped like the dicts from ``process_data``.
        model: Called with ``stoi``, ``mask`` and ``token_type_ids``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the tensors are moved to.
    """
    model.train()

    # Running averages over the epoch.
    jaccard_all = AverageMeter()
    loss_all = AverageMeter()
    progress = notebook.tqdm(dataloader, total=len(dataloader))

    for batch in progress:
        # Raw strings stay as they are ...
        tweet = batch['tweet']
        extracted_text = batch['extracted_text']
        sentiment = batch['sentiment']

        # ... tensors are moved to the device.
        input_ids = batch['input_ids'].to(device).long()
        token_ids = batch['token_ids'].to(device).long()
        mask = batch['mask'].to(device).long()
        target_start = batch['target_start'].to(device).long()
        target_end = batch['target_end'].to(device).long()
        tweet_offsets = batch['tweet_offsets'].to(device).long()

        model.zero_grad()

        pred_start, pred_end = model(stoi=input_ids, mask=mask, token_type_ids=token_ids)
        loss = loss_func(pred_start, pred_end, target_start, target_end)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Most probable start / end position for every sample of the batch.
        best_start = torch.softmax(pred_start, dim=1).cpu().detach().numpy().argmax(axis=1)
        best_end = torch.softmax(pred_end, dim=1).cpu().detach().numpy().argmax(axis=1)

        jaccards = [
            calculate_jaccard(tweet[k], extracted_text[k], sentiment[k],
                              tweet_offsets[k], best_start[k], best_end[k])[0]
            for k in range(len(tweet))
        ]

        # Weight both meters by the batch size.
        batch_size = input_ids.size(0)
        jaccard_all.update(np.mean(jaccards), batch_size)
        loss_all.update(loss.item(), batch_size)

        progress.set_postfix(loss=loss_all.avg)

In [None]:
def eval_fn(dataloader, model, device):
    """Evaluate ``model`` on ``dataloader`` and return the average Jaccard score.

    Runs without gradient tracking. The running average loss is shown on the
    progress bar; the final Jaccard average is printed and returned.

    Args:
        dataloader: Yields batches shaped like the dicts from ``process_data``.
        model: Called with ``stoi``, ``mask`` and ``token_type_ids``.
        device: Device the tensors are moved to.

    Returns:
        The batch-size-weighted mean of the per-batch Jaccard averages.
    """
    model.eval()

    # Running averages over the whole pass.
    jaccard_all = AverageMeter()
    loss_all = AverageMeter()

    with torch.no_grad():
        progress = notebook.tqdm(dataloader, total=len(dataloader))
        for batch in progress:
            # Raw strings stay as they are ...
            tweet = batch['tweet']
            extracted_text = batch['extracted_text']
            sentiment = batch['sentiment']

            # ... tensors are moved to the device.
            input_ids = batch['input_ids'].to(device).long()
            token_ids = batch['token_ids'].to(device).long()
            mask = batch['mask'].to(device).long()
            target_start = batch['target_start'].to(device).long()
            target_end = batch['target_end'].to(device).long()
            tweet_offsets = batch['tweet_offsets'].to(device).long()

            pred_start, pred_end = model(stoi=input_ids, mask=mask, token_type_ids=token_ids)
            loss = loss_func(pred_start, pred_end, target_start, target_end)

            # Most probable start / end position for every sample of the batch.
            best_start = torch.softmax(pred_start, dim=1).cpu().detach().numpy().argmax(axis=1)
            best_end = torch.softmax(pred_end, dim=1).cpu().detach().numpy().argmax(axis=1)

            jaccards = [
                calculate_jaccard(tweet[k], extracted_text[k], sentiment[k],
                                  tweet_offsets[k], best_start[k], best_end[k])[0]
                for k in range(len(tweet))
            ]

            # Weight both meters by the batch size.
            batch_size = input_ids.size(0)
            jaccard_all.update(np.mean(jaccards), batch_size)
            loss_all.update(loss.item(), batch_size)

            progress.set_postfix(loss=loss_all.avg)

    print(f'Jaccard = {jaccard_all.avg}')
    return jaccard_all.avg

## Utils

In [None]:
class AverageMeter:
    """Keeps the most recent value together with a running weighted mean.

    Attributes are ``val`` (last value), ``sum`` (weighted sum), ``count``
    (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

def jaccard_score(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: Value in ``[0, 1]``. Two strings without any words are treated
        as identical and score ``1.0`` (this case used to divide by zero).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both sets are empty: avoid ZeroDivisionError.
        return 1.0
    return float(len(c)) / union_size


In [None]:
def calculate_jaccard(orig_tweet, extracted_text, sentiment, offsets, pred_start, pred_end):
    """Rebuild the predicted text from token positions and score it.

    Args:
        orig_tweet: The tweet the offsets refer to.
        extracted_text: Reference text to compare against.
        sentiment: Sentiment label of the tweet.
        offsets: Per-position ``(start, end)`` character offsets into
            ``orig_tweet``.
        pred_start: Predicted start position (index into ``offsets``).
        pred_end: Predicted end position (index into ``offsets``).

    Returns:
        tuple: ``(jaccard, pred_extract)`` — the score from ``jaccard_score``
        and the reconstructed prediction.
    """
    # Neutral tweets usually select the whole tweet, and a tweet with fewer
    # than two words leaves nothing to choose from. Either condition alone is
    # enough to return the tweet unchanged.
    if sentiment == 'neutral' or len(orig_tweet.split()) < 2:
        pred_extract = orig_tweet
    else:
        # An end before the start is not a valid span; collapse it to a
        # single token at the start position.
        if pred_start > pred_end:
            pred_end = pred_start

        pieces = []
        for i in range(pred_start, pred_end + 1):
            pieces.append(orig_tweet[offsets[i][0]:offsets[i][1]])

            # A gap between this token and the next one means there was
            # whitespace in the tweet: append a space (this used to overwrite
            # the whole prediction with a single space).
            if i + 1 < len(offsets) and offsets[i][1] < offsets[i + 1][0]:
                pieces.append(' ')
        pred_extract = ''.join(pieces)

    jaccard = jaccard_score(pred_extract, extracted_text)

    return jaccard, pred_extract

In [None]:
class EarlyStopping:
    """Stop training once the monitored score stops improving.

    Call the instance once per epoch with the epoch score. A checkpoint is
    written whenever the score improves by at least ``delta``; after
    ``patience`` consecutive calls without such an improvement
    ``early_stop`` becomes ``True``.

    Args:
        patience: Number of non-improving calls tolerated before stopping.
        mode: ``"max"`` if a larger score is better, ``"min"`` if smaller is
            better.
        delta: Minimum improvement that counts as progress.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # Worst possible starting value for the chosen direction. ``np.inf``
        # is used because the ``np.Inf`` alias no longer exists in NumPy 2.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register ``epoch_score`` and save ``model`` to ``model_path`` on improvement."""
        # Internally "larger is better": flip the sign for mode="min".
        score = float(epoch_score)
        if self.mode == "min":
            score = -1.0 * score

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif not np.isfinite(score) or score < self.best_score + self.delta:
            # NaN / inf scores never count as an improvement.
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write ``model.state_dict()`` to ``model_path`` unless the score is NaN or infinite."""
        # ``np.isfinite`` catches every NaN / inf value; membership in a list
        # of constants missed NaN objects other than ``np.nan`` itself.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

## Stratify

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Load the training data, dropping rows with missing values.
df = pd.read_csv(data_path/'train.csv').dropna().reset_index(drop=True)
df['kfold'] = -1

# Shuffle once with a fixed seed, then reset the index. StratifiedKFold.split
# yields *positional* indices while `df.loc` below selects by *label*; without
# the reset the fold numbers would be written to different rows than the ones
# the stratification was computed on.
df = df.sample(frac=1., random_state=42).reset_index(drop=True)

# use 5 folds, stratified on the sentiment label
kf = StratifiedKFold(n_splits=5)

for fold, (train, valid) in enumerate(kf.split(X=df, y=df.sentiment.values)):
    print(len(train), len(valid), fold)
    df.loc[valid, 'kfold'] = fold

# index=False keeps the row index out of the file, so reading it back does not
# produce an extra unnamed column.
df.to_csv(data_path/'train_fold.csv', index=False)

21984 5496 0
21984 5496 1
21984 5496 2
21984 5496 3
21984 5496 4


## Train

In [None]:
# Load the configuration that belongs to config.MODEL_NAME and enable the
# hidden-state output: Bert.forward reads the per-layer hidden states.
model_config = transformers.BertConfig.from_pretrained(config.MODEL_NAME)
model_config.output_hidden_states = True

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch.optim as optim
import gc

def run(fold):
    """Train on every fold except ``fold`` and validate on ``fold``.

    Reads the fold assignment from ``config.DATA_PATH``, fine-tunes a fresh
    ``Bert`` model and, driven by ``EarlyStopping`` on the validation Jaccard
    score, writes checkpoints to ``model_path/finetunedmodel_<fold>.pth``.

    Args:
        fold: Number of the fold held out for validation.
    """
    dfx = pd.read_csv(config.DATA_PATH)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_data = TwitterDataset(df_train.text.values, df_train.selected_text.values,
                                df_train.sentiment.values, config.TOKENIZER, config.MAX_LEN)

    valid_data = TwitterDataset(df_valid.text.values, df_valid.selected_text.values,
                                df_valid.sentiment.values, config.TOKENIZER, config.MAX_LEN)

    train_dataloader = DataLoader(train_data, batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True, num_workers=0)

    # Validation does not need shuffling.
    valid_dataloader = DataLoader(valid_data, batch_size=config.VALID_BATCH_SIZE,
                                  shuffle=False, num_workers=0)

    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    gc.collect()
    model = Bert(xtra_config=model_config).to(device)

    param_optimizer = list(model.named_parameters())

    # Parameters whose *name contains* one of these fragments get no weight
    # decay. Parameter names are dotted paths, so this has to be a substring
    # test; comparing whole names for equality never matched anything.
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']

    def _skips_decay(name):
        return any(fragment in name for fragment in no_decay)

    optimizer_params = [
        {'params': [p for n, p in param_optimizer if not _skips_decay(n)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if _skips_decay(n)],
         'weight_decay': 0.0},
    ]

    # One scheduler step is taken per batch, so count batches (this also
    # covers a final, smaller batch).
    num_train_steps = len(train_dataloader) * config.NUM_EPOCHS
    optimizer = AdamW(optimizer_params, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_training_steps=num_train_steps,
        # no warmup
        num_warmup_steps=0,
    )

    es = EarlyStopping(patience=2, mode='max')
    print(f'\n Training starting for fold {fold}')

    for epoch in range(config.NUM_EPOCHS):
        gc.collect()
        train_fn(train_dataloader, model, optimizer, scheduler, device)
        gc.collect()
        jaccard = eval_fn(valid_dataloader, model, device)
        es(jaccard, model, model_path=model_path/f'finetunedmodel_{fold}.pth')
        if es.early_stop:
            print('Early Stopping')
            break

In [None]:
gc.collect()

709

In [None]:
# Train fold 0. The inference section loads the checkpoints of folds 0-4, so
# run(1) ... run(4) have to be executed as well before running it.
run(0)


 Training starting for fold 0


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1374), HTML(value='')))


Jaccard = 0.007563465352897634
Validation score improved (-inf --> 0.007563465352897634). Saving model!


HBox(children=(IntProgress(value=0, max=2748), HTML(value='')))

## Inference

In [None]:
df_test = pd.read_csv(data_path/'test.csv')
# The test set has no selected text. Fill the column with the tweet itself as
# a placeholder so the training dataset class can be reused. The column must
# be named `selected_text` — that is the name read when the dataset is built.
df_test['selected_text'] = df_test.text.values

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def load_fold_model(fold):
    """Build a ``Bert`` model, load the checkpoint of ``fold`` and set eval mode.

    Args:
        fold: Fold number; the weights are read from
            ``model_path/finetunedmodel_<fold>.pth``.

    Returns:
        The model on ``device``, in evaluation mode.
    """
    fold_model = Bert(xtra_config=model_config).to(device)
    # map_location lets checkpoints saved on a GPU load on whatever `device` is.
    state = torch.load(model_path/f'finetunedmodel_{fold}.pth', map_location=device)
    fold_model.load_state_dict(state)
    # put in eval mode for inference
    fold_model.eval()
    return fold_model


# One model per training fold; the names model0 ... model4 are used by the
# prediction loop.
model0, model1, model2, model3, model4 = (load_fold_model(fold) for fold in range(5))

In [None]:
# The test set has no selected text, so the tweet itself is passed as a
# placeholder target; it is only needed to satisfy the dataset interface.
test_data = TwitterDataset(df_test.text.values, df_test.text.values,
                           df_test.sentiment.values, config.TOKENIZER, config.MAX_LEN)

# shuffle=False: predictions are collected in loader order and later written
# into the submission row by row, so the order of the test file must be kept.
test_dataloader = DataLoader(test_data, batch_size=config.TRAIN_BATCH_SIZE,
                             shuffle=False, num_workers=0)

In [None]:
# Predicted text for every test tweet, in loader order.
prediction = []

# The five fold models are ensembled by averaging their logits.
fold_models = (model0, model1, model2, model3, model4)

with torch.no_grad():
    tk0 = notebook.tqdm(test_dataloader, total=len(test_dataloader))

    for data in tk0:
        tweet = data['tweet']
        extracted_text = data['extracted_text']
        sentiment = data['sentiment']
        # Offsets are only used to slice the tweet strings.
        tweet_offsets = data['tweet_offsets'].long()

        # put the model inputs on the device
        input_ids = data['input_ids'].to(device).long()
        token_ids = data['token_ids'].to(device).long()
        mask = data['mask'].to(device).long()

        # one (start_logits, end_logits) pair per fold model
        fold_outputs = [
            fold_model(stoi=input_ids, mask=mask, token_type_ids=token_ids)
            for fold_model in fold_models
        ]

        # Ensemble by taking the mean over the models. The logits have to be
        # stacked (or summed) first: a Python tuple cannot be divided.
        avg_pred_start = torch.stack([start for start, _ in fold_outputs]).mean(dim=0)
        avg_pred_end = torch.stack([end for _, end in fold_outputs]).mean(dim=0)

        # convert the averaged logits to probabilities per position
        pred_start_probs = torch.softmax(avg_pred_start, dim=1).cpu().detach().numpy()
        pred_end_probs = torch.softmax(avg_pred_end, dim=1).cpu().detach().numpy()

        for idx, tweet_ in enumerate(tweet):
            pred_start_prob = np.argmax(pred_start_probs[idx, :])
            pred_end_prob = np.argmax(pred_end_probs[idx, :])

            _, pred_extract = calculate_jaccard(tweet_, extracted_text[idx], sentiment[idx],
                                                tweet_offsets[idx], pred_start_prob, pred_end_prob)

            prediction.append(pred_extract)

In [None]:
def post_process(selected):
    """Lower-case ``selected`` and drop repeated words, keeping their order.

    Args:
        selected: Predicted text.

    Returns:
        str: The distinct lower-cased words joined by single spaces, in order
        of first appearance. (A ``set`` was used before, which returned the
        words in arbitrary order.)
    """
    # dict keys are unique and keep insertion order.
    return " ".join(dict.fromkeys(selected.lower().split()))

In [None]:
sample = pd.read_csv(data_path/"sample_submission.csv")
# `prediction` is the list filled by the inference loop (the name used here
# before, `final_output`, is not defined anywhere in the notebook).
assert len(prediction) == len(sample), "expected one prediction per submission row"
sample.loc[:, 'selected_text'] = prediction
sample.selected_text = sample.selected_text.map(post_process)
sample.to_csv("submission.csv", index=False)