In [None]:
%%capture
# Colab environment setup: install Python dependencies and fetch the BERT vocabulary file.
# %%capture hides all output of this cell, so a failed install or download is silent here.
!pip install transformers
# NOTE(review): no visible cell imports pytorch_transformers — this install looks unused; confirm before removing.
!pip install pytorch-transformers
!pip install kaggle
# Saves bert-base-uncased-vocab.txt into the working directory.
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt

In [None]:
# Mount Google Drive under /content/drive (Colab only).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# List the project folder on Drive to confirm its contents.
!ls "/content/drive/MyDrive/Deep_Learning/NLP_Vol3/Part_2/"

kaggle.json  Part_1.ipynb  Part_2.ipynb


In [None]:
!mkdir ~/.kaggle

In [None]:
!cp "/content/drive/MyDrive/Deep_Learning/NLP_Vol3/Part_2/kaggle.json" "/root/.kaggle"

In [None]:
# Confirm the token file is in place.
!ls /root/.kaggle

kaggle.json


In [None]:
%%capture
# Download the tweet-sentiment-extraction competition archive (output suppressed by %%capture).
!kaggle competitions download -c tweet-sentiment-extraction

In [None]:
import tokenizers
import torch 
import transformers

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import os
import zipfile
import gc
import string
from tqdm import tqdm

from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
# Extract the competition archive into the working directory.
# The context manager closes the archive even when extraction raises.
with zipfile.ZipFile("/content/tweet-sentiment-extraction.zip", mode = "r") as zip_file:
    zip_file.extractall("./")

In [None]:
# Load the training split and preview the first two rows.
df_train = pd.read_csv("/content/train.csv")
df_train.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [None]:
# Load the test split and preview the first two rows.
df_test = pd.read_csv("/content/test.csv")
df_test.head(2)

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive


In [None]:
# Load the sample submission file and preview the first two rows.
sample = pd.read_csv("/content/sample_submission.csv")
sample.head(2)

Unnamed: 0,textID,selected_text
0,f87dea47db,
1,96d74cb729,


In [None]:
def jaccard(str1,str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A ∩ B| / |A ∪ B| over the resulting word sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0, 1]. When neither string contains any word the two
        (empty) sets are identical, so 1.0 is returned instead of dividing
        by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid ZeroDivisionError.
        return 1.0
    return float(len(c)) / union_size


class AverageMeter(object):
    """Running weighted-average tracker.

    Attributes:
        val: the most recently recorded value.
        sum: weighted sum of all recorded values.
        count: total weight recorded so far.
        avg: sum / count after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and refresh the running average."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

In [None]:
# Hyperparameters and paths used by the dataset, model, and training cells.
MAX_LEN = 128  # sequence length each example is padded to
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 10
BERT_PATH = "bert-base-uncased"  # checkpoint name handed to BertModel.from_pretrained
MODEL_PATH = "model.bin"  # file the best state dict is written to during training
TRAINING_FILE = "/content/train.csv"

# WordPiece tokenizer built from the local vocab file; input is lower-cased.
TOKENIZER = tokenizers.BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

In [None]:
## Sanity check: both tokenizers have the same vocabulary size.

#len(TOKENIZER.get_vocab()),len(transformers.BertTokenizer.from_pretrained("bert-base-uncased").get_vocab())

In [None]:
class TweetDataset:
    """Map-style dataset producing BERT inputs and start/end span targets for a tweet.

    Args:
        tweet: indexable sequence of tweet texts.
        sentiment: indexable sequence of sentiment labels
            ("positive", "negative", anything else is treated as neutral).
        selected_text: indexable sequence with the target substring of each tweet.
        info: when True, __getitem__ prints its intermediate values for debugging.

    The sequence length (MAX_LEN) and the tokenizer (TOKENIZER) are taken from
    the module-level configuration.
    """

    def __init__(self,tweet, sentiment, selected_text, info):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.max_len = MAX_LEN
        self.tokenizer = TOKENIZER
        self.info = info

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self,item):
        """Build one example.

        Returns:
            dict with fixed-length (max_len) long tensors "ids", "mask",
            "token_type_ids", "targets", "targets_start", "targets_end";
            a scalar tensor "padding_len"; a one-hot "sentiment" tensor
            ([neutral, negative, positive]); and the raw strings
            "tweet_tokens", "orig_tweet", "orig_sentiment", "orig_selected".
        """
        # Collapse runs of whitespace so character offsets line up in both strings.
        tweet = " ".join(str(self.tweet[item]).split())
        selected_text = " ".join(str(self.selected_text[item]).split())

        # Character-level match: find the first occurrence of selected_text in the tweet.
        # idx0 / idx1 are the inclusive character bounds, or -1 when there is no match.
        # An empty selected_text is treated as "no match" instead of raising IndexError.
        idx0 = -1
        idx1 = -1
        if selected_text:
            idx0 = tweet.find(selected_text)
            if idx0 != -1:
                idx1 = idx0 + len(selected_text) - 1

        # One flag per tweet character: 1 for non-space characters inside the selected span.
        # e.g. [0,0,0,0,1,1,1,0,1,1,1,1,...]
        char_targets = [0] * len(tweet)
        if idx0 != -1 and idx1 != -1:
            for j in range(idx0,idx1+1):
                if tweet[j] != " ":
                    char_targets[j] = 1

        tok_tweet = self.tokenizer.encode(tweet)
        tok_tweet_tokens = tok_tweet.tokens # ["[CLS]", "hi", "how", ..., "[SEP]"]
        tok_tweet_ids = tok_tweet.ids # [101, 12, 32, ..., 102]
        tok_tweet_offsets = tok_tweet.offsets # ((0,0), (0,2), (3,6), ..., (0,0))

        # Truncate over-long inputs to max_len while keeping the trailing [SEP];
        # otherwise padding_len would go negative and examples could not be batched.
        if len(tok_tweet_ids) > self.max_len:
            keep = self.max_len - 1
            tok_tweet_tokens = list(tok_tweet_tokens[:keep]) + list(tok_tweet_tokens[-1:])
            tok_tweet_ids = list(tok_tweet_ids[:keep]) + list(tok_tweet_ids[-1:])
            tok_tweet_offsets = list(tok_tweet_offsets[:keep]) + list(tok_tweet_offsets[-1:])

        if self.info:
            print("Tweet : ", self.tweet[item])
            print("Selected_Tweet : ", self.selected_text[item])
            print()
            print("Tokens : ",tok_tweet_tokens)
            print("Tokens Len : ", len(tok_tweet_tokens))
            print("IDS : ", tok_tweet_ids)
            print("IDS Len : ", (len(tok_tweet_ids)))
            print("OffSets : ",tok_tweet_offsets)
            print("OffSets Len : ",len(tok_tweet_offsets))
            print()

        # Token-level match: a token is a target when any of its characters is flagged.
        # The first and last tokens ([CLS] / [SEP]) are never targets.
        targets = [0] * (len(tok_tweet_tokens)-2)
        for j, (offset1,offset2) in enumerate(tok_tweet_offsets[1:-1]):
            if any(char_targets[offset1:offset2]):
                targets[j] = 1

        targets = [0] + targets + [0] # cls , sep

        if self.info:
            print("Char Targets : ", char_targets)
            print("Targets : ",targets)
            print("Len Char : {} <---> Target : {}".format(len(char_targets),len(targets)) )
            print()

        # One-hot markers for the first and last target token.
        targets_start = [0] * len(targets)
        targets_end = [0] * len(targets)

        non_zero = np.nonzero(targets)[0]
        if len(non_zero) > 0:
            targets_start[non_zero[0]] = 1
            targets_end[non_zero[-1]] = 1

        if self.info:
            print("Targets Start : ",targets_start )
            print("Targets End   : ",targets_end )

        mask = [1] * len(tok_tweet_ids)
        token_type_ids = [0] * len(tok_tweet_ids)

        # Right-pad every sequence with zeros up to max_len (padding_len >= 0 after truncation).
        padding_len = self.max_len - len(tok_tweet_ids)
        padding = [0] * padding_len

        ids = list(tok_tweet_ids) + padding
        mask = mask + padding
        token_type_ids = token_type_ids + padding
        targets = targets + padding
        targets_start = targets_start + padding
        targets_end = targets_end + padding

        # One-hot sentiment: [neutral, negative, positive]; unknown labels fall back to neutral.
        sentiment = [1,0,0]

        if self.sentiment[item] == "positive":
            sentiment = [0,0,1]
        if self.sentiment[item] == "negative":
            sentiment = [0,1,0]

        return    {
                     "ids" : torch.tensor(ids, dtype = torch.long),
                     "mask" : torch.tensor(mask, dtype = torch.long),
                     "token_type_ids" : torch.tensor(token_type_ids, dtype = torch.long),
                     "targets" : torch.tensor(targets, dtype = torch.long),
                     "targets_start" : torch.tensor(targets_start, dtype = torch.long),
                     "targets_end" : torch.tensor(targets_end, dtype = torch.long),
                     "padding_len" : torch.tensor(padding_len, dtype = torch.long),
                     "tweet_tokens" : " ".join(tok_tweet_tokens),
                     "orig_tweet" : self.tweet[item],
                     "sentiment" : torch.tensor(sentiment, dtype = torch.long),
                     "orig_sentiment" : self.sentiment[item],
                     "orig_selected" : self.selected_text[item]
                    }

# Smoke test: build the dataset from the training file and print one processed example.
if __name__ == "__main__":
    # Rows containing missing values are dropped before building the dataset.
    df = pd.read_csv(TRAINING_FILE).dropna().reset_index(drop = True)
    dset = TweetDataset(
        tweet = df.text.values,
        sentiment = df.sentiment.values,
        selected_text = df.selected_text.values,
        info = True  # print the intermediate tokenization / target steps
    )
    result = dset[0]
    print("\n",result)

Tweet :   I`d have responded, if I were going
Selected_Tweet :  I`d have responded, if I were going

Tokens :  ['[CLS]', 'i', '`', 'd', 'have', 'responded', ',', 'if', 'i', 'were', 'going', '[SEP]']
Tokens Len :  12
IDS :  [101, 1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020, 2183, 102]
IDS Len :  12
OffSets :  [(0, 0), (0, 1), (1, 2), (2, 3), (4, 8), (9, 18), (18, 19), (20, 22), (23, 24), (25, 29), (30, 35), (0, 0)]
OffSets Len :  12

Char Targets :  [1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
Targets :  [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
Len Char : 35 <---> Target : 12

Targets Start :  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Targets End   :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

 {'ids': tensor([ 101, 1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020, 2183,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [None]:
class BERTBaseUncased(torch.nn.Module):
    """BERT encoder followed by a linear head that emits two logits per token."""

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        # 768 hidden features -> 2 scores per token (start, end).
        self.l0 = torch.nn.Linear(768,2)

    def forward(self, ids, mask, token_type_ids):
        """Return (start_logits, end_logits), each with the last (size-2) axis removed."""
        encoder_out = self.bert(
            ids,
            attention_mask = mask,
            token_type_ids = token_type_ids
        )

        # (batch_size, max_len, 768) -> (batch_size, max_len, 2)
        token_logits = self.l0(encoder_out["last_hidden_state"])

        # Channel 0 holds the start scores, channel 1 the end scores.
        start_logits = token_logits[..., 0]
        end_logits = token_logits[..., 1]
        return start_logits,end_logits

In [None]:
# Sanity check: encode the first training tweet and run it through a freshly built model
# to inspect the two outputs.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer.encode_plus(
    df_train.text[0],
    max_length = MAX_LEN,
    padding = "max_length",
    return_tensors = "pt",
    truncation = True,
    add_special_tokens = True
)

model = BERTBaseUncased()
# o1 / o2 are the start and end logits returned by forward().
o1,o2 = model(
    ids = encoding["input_ids"],
    mask = encoding["attention_mask"],
    token_type_ids = encoding["token_type_ids"]
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Show the shape of each output and of their concatenation along dim 0.
o1.shape,o2.shape,torch.cat((o1,o2),dim = 0).shape

(torch.Size([1, 128]), torch.Size([1, 128]), torch.Size([2, 128]))

In [None]:
# Demonstrate that np.vstack joins two (5, 128) arrays along the first axis.
stack = np.vstack([np.ones((5,128)),np.ones((5,128))])
stack.shape

(10, 128)

In [None]:
def loss_fn(o1,o2,t1,t2):
    """Sum of two binary cross-entropy-with-logits losses.

    Args:
        o1, o2: logits for the first and second output.
        t1, t2: float targets matching o1 and o2 respectively.

    Returns:
        Scalar tensor: BCEWithLogits(o1, t1) + BCEWithLogits(o2, t2).
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    first_loss = criterion(o1,t1)
    second_loss = criterion(o2,t2)
    return first_loss + second_loss

def train_fn(data_loader, model, optimizer, device, schedular):
    """Run one training epoch.

    Args:
        data_loader: iterable of batches (dicts with "ids", "token_type_ids",
            "mask", "targets_start", "targets_end" tensors).
        model: module returning (start_logits, end_logits).
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        schedular: learning-rate scheduler stepped once per batch.
    """
    model.train()
    losses = AverageMeter()
    tk0 = tqdm(data_loader, total = len(data_loader))

    for d in tk0:
        ids = d["ids"].to(device, dtype = torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype = torch.long)
        mask = d["mask"].to(device, dtype = torch.long)
        # BCE-with-logits needs float targets.
        targets_start = d["targets_start"].to(device, dtype = torch.float)
        targets_end = d["targets_end"].to(device, dtype = torch.float)

        optimizer.zero_grad()
        outputs1,outputs2 = model(
            ids = ids,
            mask = mask,
            token_type_ids = token_type_ids
        )

        loss = loss_fn(outputs1, outputs2, targets_start, targets_end)

        loss.backward()
        optimizer.step()
        schedular.step()

        # Store a plain Python float: accumulating the loss tensor itself would keep
        # every batch's autograd history (and device memory) alive for the whole epoch.
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss = losses.avg)


def eval_fn(data_loader, model , device):
    """Evaluate the model and return the mean word-level Jaccard score.

    Args:
        data_loader: iterable of batches (dicts with "ids", "token_type_ids",
            "mask", "tweet_tokens", "padding_len", "orig_sentiment",
            "orig_selected", "orig_tweet").
        model: module returning (start_logits, end_logits).
        device: device the input tensors are moved to.

    Returns:
        Mean Jaccard score between each target string and the decoded prediction.
    """
    model.eval()
    fin_output_start = []
    fin_output_end = []
    fin_padding_lens = []
    fin_tweet_tokens = []
    fin_orig_sentiment = []
    fin_orig_selected = []
    fin_orig_tweet = []

    # Inference only: no_grad avoids building autograd graphs for every batch.
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype = torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype = torch.long)
            mask = d["mask"].to(device, dtype = torch.long)

            output1, output2 = model(
                ids = ids,
                mask = mask,
                token_type_ids = token_type_ids
            )

            fin_output_start.append(torch.sigmoid(output1).cpu().numpy())
            fin_output_end.append(torch.sigmoid(output2).cpu().numpy())
            fin_padding_lens.extend(d["padding_len"].cpu().numpy().tolist())

            fin_tweet_tokens.extend(d["tweet_tokens"])
            fin_orig_sentiment.extend(d["orig_sentiment"])
            fin_orig_selected.extend(d["orig_selected"])
            fin_orig_tweet.extend(d["orig_tweet"])

    fin_output_start = np.vstack(fin_output_start)
    fin_output_end = np.vstack(fin_output_end)

    # Probability cut-off above which a token counts as a start / end candidate.
    threshold = .2
    jaccards = []
    for j in range(len(fin_tweet_tokens)):
        target_string = fin_orig_selected[j]
        tweet_tokens = fin_tweet_tokens[j]
        padding_len = fin_padding_lens[j]
        original_tweet = fin_orig_tweet[j]
        sentiment = fin_orig_sentiment[j]

        # Drop the padded tail so only real token positions are considered.
        if padding_len > 0:
            mask_start = fin_output_start[j,:][:-padding_len] >= threshold
            mask_end = fin_output_end[j,:][:-padding_len] >= threshold
        else:
            mask_start = fin_output_start[j,:] >= threshold
            mask_end = fin_output_end[j,:] >= threshold

        # Candidates are taken in order: the first position above the threshold wins.
        idx_start = np.nonzero(mask_start)[0]
        idx_end = np.nonzero(mask_end)[0]

        if len(idx_start) > 0:
            idx_start = idx_start[0]
            # Only end candidates at or after the start form a valid span;
            # without one, fall back to a single-token span.
            idx_end = idx_end[idx_end >= idx_start]
            idx_end = idx_end[0] if len(idx_end) > 0 else idx_start
        else:
            idx_start = 0
            idx_end = 0

        # Tokens inside the predicted span, without the special tokens.
        output_tokens = tweet_tokens.split()[idx_start: idx_end + 1]
        output_tokens = [x for x in output_tokens if x not in ("[CLS]", "[SEP]")]

        # Re-join tokens: "##" continuation pieces and single punctuation marks
        # are glued to the preceding text, everything else is space-separated.
        final_output = ""
        for ot in output_tokens:
            if ot.startswith("##"):
                final_output = final_output + ot[2:]
            elif len(ot) == 1 and ot in string.punctuation:
                final_output = final_output + ot
            else:
                final_output = final_output + " " + ot

        final_output = final_output.strip()

        # Heuristic: for neutral or very short tweets predict the whole tweet.
        if sentiment == "neutral" or len(original_tweet.split()) < 4:
            final_output = original_tweet

        jac = jaccard(target_string.strip(), final_output.strip())
        jaccards.append(jac)

    mean_jac = np.mean(jaccards)
    return mean_jac

In [None]:
def run():
    """Train the span-extraction model and checkpoint the best epoch.

    Reads TRAINING_FILE, drops rows with missing values, holds out 10% as a
    sentiment-stratified validation split, trains for EPOCHS epochs, and writes
    the state dict to MODEL_PATH whenever the validation Jaccard improves.
    """
    dfx = pd.read_csv(TRAINING_FILE).dropna().reset_index(drop = True)

    df_train, df_valid = train_test_split(
        dfx,
        test_size = 0.1,
        random_state = 42,
        stratify = dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)

    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values,
        info = False
    )

    # Reshuffle every epoch so batches differ between epochs.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = TRAIN_BATCH_SIZE,
        shuffle = True,
        num_workers = 4
    )

    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values,
        info = False
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = VALID_BATCH_SIZE,
        num_workers = 4
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = BERTBaseUncased()
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias","LayerNorm.bias","LayerNorm.weight"]
    optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

    # One scheduler step per batch: use the real batch count so the linear decay
    # reaches zero exactly at the last optimizer step.
    num_train_steps = len(train_data_loader) * EPOCHS
    # torch.optim.AdamW replaces the deprecated transformers.AdamW.
    optimizer = torch.optim.AdamW(optimizer_parameters, lr = 3e-5)
    schedular = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0,
        num_training_steps = num_train_steps
    )

    # Saved state-dict keys carry the "module." prefix added by DataParallel.
    model = torch.nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader,model,optimizer,device,schedular)
        valid_jaccard = np.round(eval_fn(valid_data_loader, model, device),4)
        print(f"Jaccard Score : {valid_jaccard}")
        if valid_jaccard > best_jaccard:
            torch.save(model.state_dict(), MODEL_PATH)
            best_jaccard = valid_jaccard

# Start training when this code is executed as the main module (always true in a notebook kernel).
if __name__ == "__main__":
    run()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 773/773 [05:01<00:00,  2.56it/s, loss=0.042]


Jaccard Score : 0.4871


100%|██████████| 773/773 [04:54<00:00,  2.63it/s, loss=0.0261]


Jaccard Score : 0.5055


100%|██████████| 773/773 [04:53<00:00,  2.63it/s, loss=0.0234]


Jaccard Score : 0.5168


100%|██████████| 773/773 [04:55<00:00,  2.61it/s, loss=0.0206]


Jaccard Score : 0.5214


100%|██████████| 773/773 [04:54<00:00,  2.63it/s, loss=0.0177]


Jaccard Score : 0.5257


  1%|          | 8/773 [00:13<21:47,  1.71s/it, loss=0.0178]


KeyboardInterrupt: ignored

# Test Data

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model is wrapped in DataParallel before loading so its parameter names carry the
# "module." prefix — presumably matching how the checkpoint was saved; confirm against the file.
MODEL = torch.nn.DataParallel(BERTBaseUncased())
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
MODEL.load_state_dict(
    torch.load("/content/drive/MyDrive/extract_model.bin", map_location = device)
)
MODEL.to(device)
MODEL.eval()

In [None]:
# Build the inference dataset and loader from the test split.
df_test = pd.read_csv("/content/test.csv")
# TweetDataset requires a selected_text sequence; the full tweet text is used as a stand-in here.
df_test.loc[:,"selected_text"] = df_test.text.values

test_dataset = TweetDataset(
    tweet = df_test.text.values,
    sentiment = df_test.sentiment.values,
    selected_text = df_test.selected_text.values,
    info = False
)

# shuffle = False keeps batches in the same order as the rows of df_test.
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle = False,
    batch_size = VALID_BATCH_SIZE,
    num_workers = 1
)

In [None]:
# Print the first batch produced by the test loader.
print(next(iter(test_data_loader)))