In [None]:
import os
# Point Hugging Face transformers at a pre-downloaded model cache, because the
# competition does not allow internet access. This is set here, before the
# `transformers` import in the next cell.
os.environ['TRANSFORMERS_CACHE'] = "/kaggle/input/hf-cache/hf"

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [None]:
#create a binary mask by finding the substring within the string
def get_mask(text, subtext):
    """Build a word-level binary mask marking where ``subtext`` occurs in ``text``.

    Both strings are split on whitespace. The first position at which every
    word of ``subtext`` matches consecutive words of ``text`` is marked with
    ones; every other position is zero.

    Args:
        text: The full string.
        subtext: The substring to locate inside ``text``.

    Returns:
        A list of ints with exactly one entry per word of ``text``. If
        ``subtext`` is empty, longer than ``text``, or does not occur on word
        boundaries, the mask is all zeros.
    """
    words = text.split()
    target = subtext.split()
    n_words, n_target = len(words), len(target)

    # Nothing to find (or it cannot possibly fit): return a zero-only mask
    # instead of raising or producing a mask longer than the text.
    if n_target == 0 or n_target > n_words:
        return [0] * n_words

    # Only consider start positions where the whole target still fits, so a
    # partial match at the tail of the text is never reported as a hit.
    for start in range(n_words - n_target + 1):
        if words[start:start + n_target] == target:
            # Only the first occurrence is marked; fill in the rest and return.
            return [0] * start + [1] * n_target + [0] * (n_words - start - n_target)

    # The substring was not found on word boundaries.
    return [0] * n_words
    
#Pytorch DataSet
class TweetaSet(Dataset):
    """PyTorch dataset over the tweet CSV.

    Each item bundles the tokenizer encoding of a tweet, a word-level target
    mask for its selected text, and a numeric sentiment id.

    Args:
        filename: Path of the CSV to read; rows are unpacked as
            (textID, text, selected_text, sentiment) in ``__getitem__``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus``.
        max_len: Length that ``input_ids``, ``attention_mask`` and
            ``output_mask`` are padded/truncated to.
        frac: Fraction of the rows to sample (passed to ``DataFrame.sample``).
    """
    def __init__(self, filename, tokenizer, max_len, frac=1.0):
        #Pandas Dataframe, if only using part of data sample it as such
        self.df = pd.read_csv(filename).dropna().sample(frac=frac)
        #Convert sentiment words into numerical values. The labels are sorted
        #so the mapping does not depend on the random row order produced by
        #sample(); otherwise ids could differ between runs (e.g. between
        #training and re-loading saved weights).
        self.sentiment_to_idx = {
            label: idx for idx, label in enumerate(sorted(self.df.sentiment.unique()))
        }
        #keep the huggingface tokenizer
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows kept after dropna/sample."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the feature dict for the row at position ``item``."""
        textID, text, selected_text, sentiment = self.df.iloc[item]
        #create encoding and mask features
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        #truncate as well as pad so every output_mask has exactly max_len
        #entries; otherwise tweets with more than max_len words break batching
        mask = get_mask(text, selected_text)[:self.max_len]
        mask = mask + [0] * (self.max_len - len(mask))
        return {
            'tweet': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'output_mask': torch.tensor(mask, dtype=torch.int),
            'sentiment':self.sentiment_to_idx[sentiment]
        }
        

In [None]:
#See report for architecture explanations
class MaskDiscriminator(nn.Module):
    """Scores a (tweet, mask, sentiment) triple with a value in (0, 1).

    Args:
        bert: Encoder module; must expose ``config.hidden_size`` and return its
            per-token hidden states as the first element of its output.
        mask_size: Sequence length of the inputs and of the mask.
        dropout_p: Dropout probability applied to the per-token scores.
    """
    def __init__(self, bert, mask_size, dropout_p=0.3):
        super(MaskDiscriminator, self).__init__()
        self.bert = bert
        self.norm_1 = nn.BatchNorm1d(mask_size)
        self.mask = nn.Linear(1+self.bert.config.hidden_size, 1)
        self.drop = nn.Dropout(p=dropout_p)
        self.rect = nn.ReLU()
        self.norm_2 = nn.BatchNorm1d(mask_size*2)
        self.mask_2  = nn.Linear(mask_size*2, 1)
        self.out = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, mask, sentiment):
        """Return one sigmoid score per sample, shaped (batch, 1).

        ``mask`` and ``sentiment`` may be of any numeric/bool dtype; both are
        cast to the dtype of the encoder output before concatenation.
        """
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        # broadcast the per-sample sentiment id to one extra feature per token
        x_1 = sentiment.to(x.dtype).unsqueeze(1).repeat(1, x.size(1)).unsqueeze(2)
        x = self.norm_1(x)
        x = torch.cat((x_1, x), dim=2)
        # squeeze only the trailing feature axis; a bare squeeze() would also
        # drop the batch axis when the batch holds a single sample
        x = self.mask(x).squeeze(-1)
        x = self.rect(x)
        x = self.drop(x)
        # per-token scores followed by the candidate mask: (batch, 2 * mask_size)
        x = torch.cat((x, mask.to(x.dtype)), dim=1)
        x = self.norm_2(x)
        x = self.mask_2(x)
        x = self.out(x)
        return x
class MaskGenerator(nn.Module):
    """Predicts a per-position selection probability for each input sequence.

    Args:
        bert: Encoder module; must expose ``config.hidden_size`` and return its
            per-token hidden states as the first element of its output.
        mask_size: Sequence length of the inputs and of the produced mask.
        batch_size: Unused; kept (now optional) for backward compatibility
            with existing callers.
        dropout_p: Unused; kept for backward compatibility.
    """
    def __init__(self, bert, mask_size, batch_size=None, dropout_p=0.3):
        super(MaskGenerator, self).__init__()
        self.bert = bert
        self.dense_1 = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.norm_1 = nn.BatchNorm1d(mask_size)
        self.relu = nn.ReLU()
        self.activation = nn.Sigmoid()
        # NOTE(review): nn.LSTM defaults to batch_first=False, so the
        # (batch, seq, feature) tensor passed in forward() is read with the
        # batch axis as time and the sequence axis as the LSTM batch. The
        # hidden state below is sized to match that; confirm it is intended.
        self.rnn = nn.LSTM(1+self.bert.config.hidden_size, 128, 1)
        self.norm_2  = nn.BatchNorm1d(mask_size)
        self.sparse = nn.Linear(128, 1)
        self.dense_2 = nn.Linear(mask_size, mask_size)
        # (h, c) carried over between forward calls. Created without a fixed
        # device; forward() moves it to wherever the activations live.
        self.hidden = (
            torch.zeros((1, mask_size, 128)),
            torch.zeros((1, mask_size, 128)),
        )

    def forward(self, input_ids, attention_mask, sentiment):
        """Return sigmoid mask probabilities shaped (batch, mask_size)."""
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        x = self.norm_1(x)
        x = self.dense_1(x)
        x = self.relu(x)
        # broadcast the per-sample sentiment id to one extra feature per token
        x_1 = sentiment.to(x.dtype).unsqueeze(1).repeat(1, x.size(1)).unsqueeze(2)
        x = torch.cat((x_1, x), dim=2)
        # follow the activations' device/dtype instead of assuming CUDA, so the
        # model also works when it is kept on the CPU on a CUDA-capable host
        hidden = tuple(state.to(device=x.device, dtype=x.dtype) for state in self.hidden)
        x, hidden = self.rnn(x, hidden)
        # keep the state values but cut them from the graph of this batch
        self.hidden = tuple(state.detach() for state in hidden)
        # squeeze only the trailing feature axis; a bare squeeze() would also
        # drop the batch axis when the batch holds a single sample
        x = self.sparse(x).squeeze(-1)
        x = self.norm_2(x)
        x = self.dense_2(x)
        x = self.activation(x)
        return x
    
#share one bert model amongst both
def generate_models(mask_size, batch_size):
    """Create the discriminator and generator around one shared DistilBERT.

    Returns:
        A tuple ``(discriminator, generator, config)`` where both models are
        wrapped in ``nn.DataParallel`` and ``config`` is the configuration
        object of the shared encoder.
    """
    shared_encoder = DistilBertModel.from_pretrained('distilbert-base-cased', local_files_only=True)
    discriminator = nn.DataParallel(MaskDiscriminator(shared_encoder, mask_size))
    generator = nn.DataParallel(MaskGenerator(shared_encoder, mask_size, batch_size))
    return discriminator, generator, shared_encoder.config


In [None]:
#calculate the intersection and union of two masks.
#As a loss function, one vector will be a probability
# array, but that still gets the desired result while
# eliminating the discrete-step problem
class JaccardLoss(object):
    """Soft Jaccard-style loss between a predicted and a target mask."""
    def __call__(self, pred, actual):
        """Return ``1 - sqrt(mean(I**2 / U**2))`` over the batch.

        ``I`` and ``U`` are the per-row sums (over the last axis) of the
        element-wise intersection and union; 1e-6 is added to numerator and
        denominator of each row's ratio.
        """
        overlap = actual * pred
        combined = (pred + actual) - overlap
        # per-row totals, squared
        overlap_sq = overlap.sum(dim=-1) ** 2
        combined_sq = combined.sum(dim=-1) ** 2
        ratio = (overlap_sq + 1e-6) / (combined_sq + 1e-6)
        return 1 - ratio.mean() ** 0.5
        

In [None]:
from tqdm.auto import tqdm
def train(epochs, dataset, batch_size=16, max_size=40, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Train the mask generator and return it in eval mode.

    Training has two phases of ``epochs`` epochs each:

    1. Pre-training: the generator alone is fitted to the target masks with
       Adadelta.
    2. Adversarial phase: the generator is trained on the target masks plus
       the discriminator's verdict on masks it produces for random inputs,
       while the discriminator learns to tell true (tweet, mask) pairs from
       generated ones.

    Args:
        epochs: Number of epochs for each of the two phases.
        dataset: Dataset whose items provide ``input_ids``, ``attention_mask``,
            ``output_mask`` and ``sentiment``.
        batch_size: Batch size for the DataLoader.
        max_size: Sequence/mask length the models are built for.
        device: Device (or device string) to train on.

    Returns:
        The trained generator (wrapped in ``nn.DataParallel``), in eval mode.
    """
    # NOTE: autograd anomaly detection is deliberately not enabled here; it is
    # a debugging aid that stays on globally and slows every backward pass.
    device = torch.device(device)
    #Models
    discriminator, generator, config = generate_models(max_size, batch_size)
    discriminator = discriminator.to(device)
    generator = generator.to(device)
    #Optimizers
    generator_optimizer = AdamW(generator.parameters(), lr=0.01)
    discriminator_optimizer = AdamW(discriminator.parameters(), lr=0.001)

    #Loss
    classify_loss = JaccardLoss()
    masking_loss = JaccardLoss()

    #Data
    data = DataLoader(dataset, batch_size=batch_size, num_workers=4)

    #pre-train generator with Adadelta and high learning rate
    pretrain_optim = optim.Adadelta(generator.parameters(), lr=10.0)
    print("Pre-training generator")
    for _ in range(epochs):
        running_loss = 0.0
        ctr = 1
        with tqdm(data) as _data:
            for batch in _data:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                sentiment = batch['sentiment'].to(device)
                output_mask = batch['output_mask'].to(device)

                pretrain_optim.zero_grad()
                mask = generator(input_ids, attention_mask, sentiment)
                loss = masking_loss(mask, output_mask)
                loss.backward()
                pretrain_optim.step()
                # accumulate a plain float: adding the tensor itself would keep
                # every batch's autograd graph alive for the whole epoch
                loss_value = loss.item()
                running_loss += loss_value
                _data.set_description("Loss: {:0.5f},".format(loss_value))
                torch.cuda.empty_cache()
                if ctr % 10 == 9:
                    tqdm.write("Running Pretrain loss: " + str(running_loss / ctr))
                ctr += 1
    print("Diversifying")
    #Diversify with Adversarial network
    for _ in range(epochs):
        running_generator_loss = 0.0
        running_descrim_loss = 0.0
        ctr = 1
        with tqdm(data) as _data:
            for batch in _data:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                sentiment = batch['sentiment'].to(device)
                output_mask = batch['output_mask'].to(device)

                # ---- generator step ----
                generator_optimizer.zero_grad()

                gen_mask_using_true = generator(input_ids, attention_mask, sentiment)

                # random token ids in [0, vocab_size - 1), random attention
                # mask and random sentiment ids (three classes assumed)
                noise = torch.randint(0, config.vocab_size - 1, input_ids.size(), device=device)
                gen_attn_mask = torch.randint(0, 2, attention_mask.size(), device=device)
                gen_sentiments = torch.randint(0, 3, sentiment.size(), device=device)
                generated_mask = generator(input_ids=noise, attention_mask=gen_attn_mask, sentiment=gen_sentiments)
                # Feed the soft mask (as the discriminator step below does):
                # thresholding with `> 0.5` is not differentiable and would cut
                # the adversarial gradient off from the generator head.
                generator_discriminator_out = discriminator(noise, gen_attn_mask, generated_mask, gen_sentiments)
                gen_masking_loss = masking_loss(gen_mask_using_true, output_mask)
                gen_class_loss = classify_loss(generator_discriminator_out, torch.ones_like(generator_discriminator_out))
                generator_loss = (4*gen_masking_loss + gen_class_loss)/3
                _data.set_description("Loss: {:0.5f}, {:0.5f}, {:0.5f}".format(
                    gen_masking_loss.item(), gen_class_loss.item(), generator_loss.item()))
                generator_loss.backward()
                running_generator_loss += generator_loss.item()
                generator_optimizer.step()

                # ---- discriminator step ----
                discriminator_optimizer.zero_grad()
                true_discriminator_out = discriminator(input_ids, attention_mask, output_mask, sentiment)
                true_discriminator_loss = classify_loss(true_discriminator_out, torch.ones_like(true_discriminator_out))

                # detach so the discriminator update does not reach the generator head
                generator_discriminator_out = discriminator(noise, gen_attn_mask, generated_mask.detach(), gen_sentiments)
                generator_discriminator_loss = classify_loss(generator_discriminator_out, torch.zeros_like(generator_discriminator_out))
                discriminator_loss = (2*true_discriminator_loss + generator_discriminator_loss) / 3
                discriminator_loss.backward()
                running_descrim_loss += discriminator_loss.item()
                discriminator_optimizer.step()
                torch.cuda.empty_cache()
                if ctr % 10 == 9:
                    tqdm.write("Running generator loss: " + str(running_generator_loss / (2*ctr)))
                    tqdm.write("Running descriminator loss: " + str(running_descrim_loss / ctr))
                ctr += 1

    generator.eval()
    return generator

In [None]:
#create a class to use the trained model
import numpy as np
class Masker(object):
    """Wraps a trained (or re-loaded) mask generator for inference on raw tweets.

    Args:
        training_data_path: CSV used to build the sentiment mapping and, when
            no weights are given, to train the generator.
        batch_size: Training batch size.
        max_len: Sequence length the tokenizer pads/truncates to.
        epochs: Epochs per training phase (ignored when weights are loaded).
        pretrained_weights: Optional path of a state dict written by ``save``;
            when given, no training takes place.
        frac: Fraction of the training CSV to use.
    """
    def __init__(self, training_data_path, batch_size=16, max_len=100, epochs=10, pretrained_weights=None, frac=1.0):
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased', local_files_only=True)
        ds = TweetaSet(training_data_path, self.tokenizer, max_len, frac=frac)
        # NOTE(review): this mapping is rebuilt from the CSV on every
        # construction; re-loaded weights are only meaningful if it matches the
        # mapping that was in effect when they were trained.
        self.sentiment_to_ix = ds.sentiment_to_idx
        if pretrained_weights is None:
            self.generator = train(epochs, ds, max_size=max_len, batch_size=batch_size)
        else:
            bert = DistilBertModel.from_pretrained('distilbert-base-cased', local_files_only=True)
            # batch_size must be passed explicitly: MaskGenerator takes it as
            # its third positional argument
            self.generator = nn.DataParallel(MaskGenerator(bert, max_len, batch_size))
            self.generator.load_state_dict(torch.load(pretrained_weights, map_location=torch.device('cpu')))
            # inference only: put BatchNorm layers into eval mode
            self.generator.eval()
        self.max_len = max_len

    def __call__(self, unmasked_strings, sentiments):
        """Return, for each input string, the words the generator selects.

        Args:
            unmasked_strings: Iterable of raw texts.
            sentiments: Iterable of sentiment labels (keys of the mapping
                built from the training data), one per text.

        Returns:
            A list with one string per input: the whitespace-separated words
            whose mask value is at least 0.5, joined by single spaces.
        """
        unmasked_strings = list(unmasked_strings)
        if not unmasked_strings:
            # torch.cat cannot concatenate an empty list
            return []
        encodings = [self.tokenizer.encode_plus(
            unmasked_string,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        ) for unmasked_string in unmasked_strings]
        input_ids = torch.cat([enc['input_ids'] for enc in encodings])
        attention_mask = torch.cat([enc['attention_mask'] for enc in encodings])
        sentiment_ids = torch.Tensor([self.sentiment_to_ix[sentiment] for sentiment in sentiments])
        # no autograd graph is needed for prediction
        with torch.no_grad():
            masks = self.generator(input_ids, attention_mask, sentiment_ids)
        masks = masks.cpu().tolist()
        # mask positions are paired with whitespace-separated words, in order
        return [
            " ".join(word for score, word in zip(mask, text.split()) if score >= 0.5)
            for mask, text in zip(masks, unmasked_strings)
        ]

    def save(self, path):
        """Write the generator's state dict to ``path``."""
        torch.save(self.generator.state_dict(), path)

In [None]:
# Build the Masker. No pretrained_weights are passed, so this trains the
# generator on train.csv (epochs=5, batch_size=100, max_len=35).
mask = Masker("/kaggle/input/tweet-sentiment-extraction/train.csv", batch_size=100, epochs=5, max_len=35)
# Optionally save the trained generator's state dict for re-use later
# (it can be passed back in via `pretrained_weights`).
# mask.save("/kaggle/working/generator_state.dict")
mask

In [None]:
# Generate the submission: run the masker over the test set in chunks of 100 rows
test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
guess = []
for i in range(0, len(test), 100):
    # the last chunk may be shorter than 100 rows
    r = slice(i, min(i + 100, len(test)))
    texts = test["text"][r]
    labels = test["sentiment"][r]
    guess.extend(mask(texts, labels))
test['selected_text'] = guess
test

In [None]:
# Write only the id and the predicted text, without the index column
test.loc[:, ["textID", "selected_text"]].to_csv(
    "/kaggle/working/submission.csv",
    index=False,
)