In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm

In [None]:
#find the starting and ending word positions of the selected (masked) text
def get_mask(text, subtext):
    """Locate ``subtext`` inside ``text`` as a half-open range of word indices.

    Words are the whitespace-separated pieces produced by ``str.split()``, so
    ``text.split()[start:end]`` yields every word that ``subtext`` touches.

    Args:
        text: The full string to search in.
        subtext: The selection to locate; expected to be a substring of ``text``.

    Returns:
        A dict ``{"start": int, "end": int}``. When ``subtext`` does not occur
        in ``text`` the range covers the whole text (``0`` to the word count).
    """
    start = text.find(subtext)
    if start < 0:
        # str.find signals a miss with -1; using that as a slice bound would
        # silently produce a meaningless span, so fall back to the full text.
        return {"start": 0, "end": len(text.split())}
    end = start + len(subtext)
    first_word = len(text[:start].split())
    # If the selection begins in the middle of a word, the prefix ends with a
    # fragment of that word, which the count above includes. Step back one so
    # the word containing the first selected character is part of the span
    # (the end side already includes a partially selected word).
    if 0 < start < len(text) and not text[start - 1].isspace() and not text[start].isspace():
        first_word -= 1
    return {"start": first_word, "end": len(text[:end].split())}
#see DistilBert GAN for details
class TweetaSet(Dataset):
    """Map-style training dataset built from a tweet CSV.

    Rows containing any missing value are dropped, then ``frac`` of the
    remaining rows is drawn with ``DataFrame.sample`` (random order).
    Each row is unpacked positionally as
    ``textID, text, selected_text, sentiment``, so the CSV must have exactly
    those four columns in that order.
    """

    def __init__(self, filename, tokenizer, max_len, frac=1.0):
        """
        Args:
            filename: Path (or file-like object) handed to ``pd.read_csv``.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Length every encoded tweet is padded/truncated to.
            frac: Fraction of the cleaned rows to keep.
        """
        frame = pd.read_csv(filename)
        frame = frame.dropna()
        self.df = frame.sample(frac=frac)
        # Fixed integer code fed to the model for each sentiment label.
        self.sentiment_to_idx = dict(positive=1313, negative=2430, neutral=7974)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows kept after cleaning and sampling."""
        return len(self.df)

    def __getitem__(self, item):
        """Encode row ``item`` and attach its target word span.

        Returns:
            dict with the raw tweet, flattened ``input_ids`` and
            ``attention_mask`` tensors, the ``output_mask`` dict produced by
            ``get_mask`` and the integer sentiment code.
        """
        _text_id, tweet, selection, label = self.df.iloc[item]
        tokens = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        span = get_mask(tweet, selection)

        sample = {'tweet': tweet}
        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        sample['input_ids'] = tokens['input_ids'].flatten()
        sample['attention_mask'] = tokens['attention_mask'].flatten()
        sample['output_mask'] = span
        sample['sentiment'] = self.sentiment_to_idx[label]
        return sample

In [None]:
#See Report for Details
class BERTRNNFineTuner(nn.Module):
    """DistilBERT encoder followed by a 3-layer LSTM and a 2-logit head.

    The forward pass produces, for every position of the padded input, two
    logits (consumed downstream as "start" and "end" scores).

    The LSTM initial state is stored as the buffers ``h0`` / ``c0`` so that
    ``model.to(...)``, ``model.double()`` and ``state_dict()`` include it.
    It remains reachable (and assignable) through the ``hidden`` property as a
    ``(h0, c0)`` tuple.
    """

    def __init__(self, max_len, hidden):
        """
        Args:
            max_len: Padded sequence length of the inputs.
            hidden: ``(h0, c0)`` tensors used as the LSTM initial state. They
                are overwritten in place with standard-normal noise. With the
                LSTM configured as below they must be shaped
                ``(3, max_len, 128)``.
        """
        super(BERTRNNFineTuner,self).__init__()
        self.bert      = DistilBertModel.from_pretrained('/kaggle/input/distillbert-base-cased/hf-distillbert-base-cased', output_attentions=True)
        self.recurrent = nn.LSTM(self.bert.config.hidden_size,128,3)
        self.dense     = nn.Linear(128,128)
        h0, c0 = hidden
        nn.init.normal_(h0)
        nn.init.normal_(c0)
        # Buffers (unlike plain attributes) follow the module across devices
        # and dtypes and are saved with the weights.
        self.register_buffer("h0", h0)
        self.register_buffer("c0", c0)
        self.relu      = nn.ReLU()
        self.lin       = nn.Linear(129,2)
        self.max_len = max_len
        for name, param in self.recurrent.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)

    @property
    def hidden(self):
        """LSTM initial state as a ``(h0, c0)`` tuple."""
        return (self.h0, self.c0)

    @hidden.setter
    def hidden(self, value):
        # Assigning to existing buffer names keeps them registered.
        self.h0, self.c0 = value

    def forward(self, input_ids, attn_mask, sentiment):
        """
        Args:
            input_ids: Token ids, ``(batch, max_len)``.
            attn_mask: Attention mask, same shape as ``input_ids``.
            sentiment: One numeric sentiment code per example, ``(batch,)``.

        Returns:
            Tensor of shape ``(batch, max_len, 2)``.
        """
        x = self.bert(input_ids=input_ids, attention_mask=attn_mask)[0]
        # NOTE(review): nn.LSTM is created with its default batch_first=False,
        # so it treats dim 0 of `x` as the time axis. The rest of this method
        # treats dim 1 as the position axis (see the repeat over max_len
        # below), which means the recurrence runs across the batch and the
        # initial state needs max_len as its middle dimension. This looks
        # unintended; fixing it changes the required `hidden` shape, so confirm
        # with the caller before switching to batch_first=True.
        x, _ = self.recurrent(x, self.hidden)
        x = self.dense(x)
        # Broadcast the per-example sentiment code to every position and cast
        # it to the activation dtype so the concatenation is well-typed.
        y = sentiment.view(-1,1,1).repeat(1,self.max_len,1).to(x.dtype)
        x = torch.cat((x,y), dim=2)
        x = self.relu(x)
        return self.lin(x)

In [None]:
#Weighted sum of the cross-entropy losses for the start and end predictions.
#Author's rationale for the defaults: start always approaches 0, so end is
#the bigger target and gets the larger weight.
class BiCELoss:
    """Weighted pair of cross-entropy losses over start/end position logits."""

    def __init__(self, start_weight=0.8, end_weight=1.2):
        """
        Args:
            start_weight: Multiplier for the start-position loss.
            end_weight: Multiplier for the end-position loss.
        """
        self.loss_fn = nn.CrossEntropyLoss()
        self.start_weight = start_weight
        self.end_weight = end_weight

    def __call__(self, logits, start_act, end_act):
        """
        Args:
            logits: Tensor whose last axis has exactly two entries
                (start score, end score); indexed ``[batch, position, 2]``.
            start_act: Target start position per example, ``(batch,)``.
            end_act: Target end position per example, ``(batch,)``.

        Returns:
            Scalar tensor ``start_weight * CE(start) + end_weight * CE(end)``,
            where each cross-entropy treats the position axis as the classes.
        """
        # Split the last axis into two (batch, position) score matrices.
        start_logits, end_logits = logits.unbind(dim=2)
        start_loss = self.loss_fn(start_logits, start_act)
        end_loss = self.loss_fn(end_logits, end_act)
        return self.start_weight*start_loss + self.end_weight*end_loss

In [None]:
#for testing accuracy on test data
#calculate binary intersection over union
class JaccardAccuracyMeasure:
    """Jaccard similarity (|A ∩ B| / |A ∪ B|) of the elements of two iterables."""

    def __call__(self, str1, str2):
        """Return the Jaccard score of ``set(str1)`` and ``set(str2)``.

        Both arguments are converted with ``set(...)``, so a list is compared
        by its items and a plain string by its characters. Two empty inputs
        score ``0.0``.
        """
        left, right = set(str1), set(str2)
        union_size = len(left | right)
        if union_size == 0:
            # Nothing to compare: defined as zero instead of dividing by zero.
            return 0.0
        return len(left & right) / float(union_size)

In [None]:
import random
#Standard training loop for pytorch.
def train(epochs, batch_size=100, max_len=35, frac=1.0, device="cuda" if torch.cuda.is_available() else "cpu", lr=1e-2):
    """Fine-tune ``BERTRNNFineTuner`` on the tweet-sentiment-extraction data.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Examples per optimisation step.
        max_len: Padded token length; also the number of position classes.
        frac: Fraction of the training rows to use.
        device: Device string the model and batches are moved to.
        lr: Adam learning rate.

    Returns:
        ``(tokenizer, model)`` with the model switched to eval mode.
    """
    tok = DistilBertTokenizer.from_pretrained('/kaggle/input/distillbert-base-cased/hf-distillbert-base-cased')
    ds = TweetaSet("/kaggle/input/tweet-sentiment-extraction/train.csv", tok, max_len, frac)
    dl = DataLoader(ds, batch_size=batch_size, pin_memory=(device=="cuda"))
    initial_state = (
        torch.zeros((3,max_len,128)).to(device),
        torch.zeros((3,max_len,128)).to(device),
    )
    model = BERTRNNFineTuner(max_len, initial_state).to(device)
    loss_fn = BiCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    acc = JaccardAccuracyMeasure()
    for _ in range(epochs):
        # Report accuracy on one randomly chosen step out of every 50.
        report_at = random.randrange(50)
        for i,batch in enumerate(tqdm(dl)):
            optimizer.zero_grad()
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["sentiment"].to(device),
            )
            start_true = batch["output_mask"]["start"]
            end_true = batch["output_mask"]["end"]
            # Targets are word indices, while the loss only has max_len
            # position classes; clamp so an unusually long tweet cannot raise
            # an out-of-range target error. In-range targets are unaffected.
            loss = loss_fn(
                logits,
                start_true.clamp(max=max_len - 1).to(device),
                end_true.clamp(max=max_len - 1).to(device),
            )
            loss.backward()
            optimizer.step()
            if i%100 == 99:
                tqdm.write("Loss: {:0.5f}".format(loss.item()))
            if i%50 == report_at:
                # argmax over the position axis gives (batch, 2); unbind keeps
                # the batch axis intact even for a batch of one.
                start_pred, end_pred = logits.detach().argmax(dim=1).unbind(dim=1)
                tweets = batch["tweet"]
                # Slicing already clips indices beyond the word count.
                pred_spans = [t.split()[s:e] for t,s,e in zip(tweets, start_pred.tolist(), end_pred.tolist())]
                true_spans = [t.split()[s:e] for t,s,e in zip(tweets, start_true.tolist(), end_true.tolist())]
                scores = [acc(truth, guess) for truth,guess in zip(true_spans, pred_spans)]
                tqdm.write("Accuracy: {:0.5f}".format(sum(scores)/float(len(scores))))
    model.eval()
    return tok,model


            
    

In [None]:
# Run the full training (30 epochs, remaining settings at their defaults) and
# keep the tokenizer and trained model for the inference cells below.
tokenizer, model = train(epochs=30)

In [None]:
#Get Test Data
class TweetaTestSet(Dataset):
    """Map-style dataset over the unlabeled test CSV.

    Rows containing any missing value are dropped, then ``frac`` of the
    remaining rows is drawn with ``DataFrame.sample`` (random order).
    Each row is unpacked positionally as ``textID, text, sentiment``, so the
    CSV must have exactly those three columns in that order.
    """

    def __init__(self, filename, tokenizer, max_len, frac=1.0):
        """
        Args:
            filename: Path (or file-like object) handed to ``pd.read_csv``.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Length every encoded tweet is padded/truncated to.
            frac: Fraction of the cleaned rows to keep.
        """
        frame = pd.read_csv(filename)
        frame = frame.dropna()
        self.df = frame.sample(frac=frac)
        # Fixed integer code fed to the model for each sentiment label.
        self.sentiment_to_idx = dict(positive=1313, negative=2430, neutral=7974)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows kept after cleaning and sampling."""
        return len(self.df)

    def __getitem__(self, item):
        """Encode row ``item``.

        Returns:
            dict with the row id (key ``'id'``), the raw tweet, flattened
            ``input_ids`` / ``attention_mask`` tensors and the integer
            sentiment code.
        """
        text_id, tweet, label = self.df.iloc[item]
        tokens = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        sample = {'id': text_id, 'tweet': tweet}
        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        sample['input_ids'] = tokens['input_ids'].flatten()
        sample['attention_mask'] = tokens['attention_mask'].flatten()
        sample['sentiment'] = self.sentiment_to_idx[label]
        return sample

In [None]:
#Generate kaggle submission
dl = DataLoader(TweetaTestSet("/kaggle/input/tweet-sentiment-extraction/test.csv", tokenizer, 35), batch_size=10)
model = model.to('cpu')
# Module.to() only relocates registered parameters and buffers. The model keeps
# its LSTM initial state in `model.hidden`, so make sure that is on the CPU as
# well (a no-op if it already is).
model.hidden = tuple(state.to('cpu') for state in model.hidden)
model.eval()

# Collect plain records and build the frame once; growing a DataFrame inside
# the loop is quadratic and DataFrame.append no longer exists in pandas 2.x.
submission_rows = []
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for batch in dl:
        logits = model(batch["input_ids"], batch["attention_mask"], batch["sentiment"])
        # argmax over the position axis gives (batch, 2); unbind keeps the
        # batch axis intact even when the final batch holds a single row.
        start, end = logits.argmax(dim=1).unbind(dim=1)
        # TweetaTestSet exposes the row identifier under the key "id".
        for text_id, tweet, s, e in zip(batch["id"], batch["tweet"], start.tolist(), end.tolist()):
            # Slicing clips indices beyond the word count; join the selected
            # words back into the string expected in the submission file.
            submission_rows.append({"textID": text_id, "selected_text": " ".join(tweet.split()[s:e])})

df = pd.DataFrame(submission_rows, columns=["textID", "selected_text"])
df.to_csv("/kaggle/working/submission.csv", index=False)