In [None]:
%%capture
!pip install transformers
!pip install pytorch-transformers
!pip install kaggle

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored there (such as the Kaggle API token copied in a later cell) are reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# List the project folder on Drive to confirm that kaggle.json is present.
!ls "/content/drive/MyDrive/Deep_Learning/NLP_Vol3/Part_2/"

kaggle.json  Part_1.ipynb  Part_2.ipynb


In [None]:
!mkdir ~/.kaggle

In [None]:
!cp "/content/drive/MyDrive/Deep_Learning/NLP_Vol3/Part_2/kaggle.json" "/root/.kaggle"

In [None]:
# Verify that the token was copied into place.
!ls /root/.kaggle

kaggle.json


In [None]:
import torch
import numpy as np
import pandas as pd
import transformers
import os
import zipfile
import scipy

from sklearn.model_selection import train_test_split

import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

from warnings import filterwarnings 
filterwarnings("ignore")

In [None]:
# Create a Kaggle API client and authenticate it.
# NOTE(review): presumably this reads the kaggle.json copied to /root/.kaggle — confirm.
# `api` is not referenced again in the visible cells; the download uses the CLI.
api = KaggleApi()
api.authenticate()

In [None]:
# Download the google-quest-challenge competition archive with the Kaggle CLI.
!kaggle competitions download -c google-quest-challenge

Downloading google-quest-challenge.zip to /content
100% 4.85M/4.85M [00:00<00:00, 14.3MB/s]
100% 4.85M/4.85M [00:00<00:00, 14.3MB/s]


In [None]:
# Unpack the competition archive into the current working directory. The
# context manager closes the archive even if extraction raises.
with zipfile.ZipFile("/content/google-quest-challenge.zip", mode = "r") as zip_file:
    zip_file.extractall("./")

In [None]:
# Load the pretrained "bert-base-uncased" model once. The result is not
# assigned; as the cell's last expression, its repr is what the notebook shows.
# NOTE(review): presumably kept to pre-download/cache the weights — confirm.
transformers.BertModel.from_pretrained("bert-base-uncased")

In [None]:
import gc

# Release cached GPU memory held by PyTorch and run a garbage-collection pass
# before training. The trailing ';' keeps the collected-object count from being
# displayed as the cell output.
torch.cuda.empty_cache()
gc.collect();

# This cell used to end with `!sudo kill -9 <pid>`. `<pid>` was a placeholder
# that was never replaced with a real process id, so bash rejected the line
# with a syntax error and nothing was killed. To stop a stray process, look up
# its id first and run the kill command by hand with that id.

/bin/bash: -c: line 0: syntax error near unexpected token `newline'
/bin/bash: -c: line 0: `sudo kill -9 <pid>'


In [None]:
# Limit log output from the transformers library to error-level messages.
transformers.logging.set_verbosity_error()

In [None]:
class BERTBaseUnCased(torch.nn.Module):
    """BERT encoder followed by dropout and a linear head producing one logit per target.

    Args:
        bert_path: Model name or path handed to
            ``transformers.BertModel.from_pretrained``.
        num_targets: Number of output logits. Defaults to 30, the value that
            was previously hard-coded.
        dropout: Dropout probability applied to the pooled output. Defaults
            to 0.3, the value that was previously hard-coded.
    """

    def __init__(self, bert_path, num_targets = 30, dropout = 0.3):
        super(BERTBaseUnCased,self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = torch.nn.Dropout(p = dropout)
        # Take the encoder width from the loaded checkpoint's config instead of
        # hard-coding 768, so checkpoints with another hidden size also work.
        self.output = torch.nn.Linear(self.bert.config.hidden_size, num_targets)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits computed from the encoder's pooled output.

        Args:
            ids: Token ids passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.

        Returns:
            The output of the linear head applied to the dropped-out
            ``pooler_output``. No sigmoid is applied here.
        """
        result = self.bert(ids, attention_mask = mask , token_type_ids = token_type_ids)
        pooled = result["pooler_output"]
        return self.output(self.bert_drop(pooled))

# --------------------------------------------------------- #

class BertDatasetTraining(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (question title + body, answer) pairs.

    Args:
        qtitle: Indexable collection of question titles.
        qbody: Indexable collection of question bodies.
        answer: Indexable collection of answers; its length is the dataset length.
        targets: 2-D array indexed as ``targets[item, :]`` to get one row of labels.
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
        max_len: Sequence length every example is truncated/padded to.
    """

    def __init__(self,qtitle, qbody, answer, targets ,tokenizer , max_len):
        self.qtitle = qtitle
        self.qbody = qbody
        self.answer = answer

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = targets

    def __len__(self):
        return len(self.answer)

    def __getitem__(self,item):
        """Return the flattened ids, mask, token type ids and float targets for one example."""
        question_title = str(self.qtitle[item])
        question_body = str(self.qbody[item])
        answer = str(self.answer[item])

        # Sequence layout: [CLS] [Q-TITLE] [Q-BODY] [SEP] [ANSWER] [SEP]
        title_body = question_title + " " + question_body

        inputs = self.tokenizer.encode_plus(
            title_body,
            answer,
            add_special_tokens = True,
            max_length = self.max_len,
            truncation = True,
            padding = "max_length",
            return_tensors = "pt"
            )

        # With return_tensors="pt" the tokenizer already hands back tensors.
        # torch.as_tensor reuses them; torch.tensor(<tensor>) would
        # copy-construct, which PyTorch warns about. flatten() yields the 1-D
        # sequence so the DataLoader can stack examples into a batch.
        return {
            "ids" : torch.as_tensor(inputs["input_ids"], dtype = torch.long).flatten(),
            "mask" : torch.as_tensor(inputs["attention_mask"], dtype = torch.long).flatten(),
            "token_type_ids" : torch.as_tensor(inputs["token_type_ids"], dtype = torch.long).flatten(),
            "targets" : torch.tensor(self.targets[item, : ], dtype = torch.float)
        }


def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and their targets.

    The sigmoid is applied inside the loss, so `outputs` must be unnormalised
    logits.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

def train_loop_fn(data_loader, model, optimizer, device, scheduler = None):
    """Run one pass over `data_loader`, updating `model` after every batch.

    Args:
        data_loader: Iterable of dicts with the keys "ids", "mask",
            "token_type_ids" and "targets".
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional scheduler, stepped once per batch after the optimizer.
    """
    model.train()

    for step, batch in enumerate(data_loader):
        # Pull the fields out of the batch, then move them to the target device.
        input_ids, attention_mask, segment_ids, labels = (
            batch["ids"],
            batch["mask"],
            batch["token_type_ids"],
            batch["targets"],
        )
        input_ids = input_ids.to(device, dtype = torch.long)
        attention_mask = attention_mask.to(device, dtype = torch.long)
        segment_ids = segment_ids.to(device, dtype = torch.long)
        labels = labels.to(device, dtype = torch.float)

        # Standard update: clear old gradients, forward, backward, step.
        optimizer.zero_grad()
        logits = model(ids = input_ids, mask = attention_mask, token_type_ids = segment_ids)
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Report the loss on every tenth batch.
        if step % 10 == 0:
            print(f"Bi : {step}, Loss : {batch_loss}")


def eval_loop_fn(data_loader, model, device):
    """Run `model` over `data_loader` in inference mode and collect the results.

    Args:
        data_loader: Iterable of dicts with the keys "ids", "mask",
            "token_type_ids" and "targets".
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        device: Device the model inputs are moved to.

    Returns:
        Tuple ``(outputs, targets)`` of NumPy arrays, each the row-wise stack of
        the per-batch arrays. The outputs are whatever the model returns (raw
        logits for the model defined in this notebook).

    Raises:
        ValueError: From ``np.vstack`` when `data_loader` yields no batches.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []

    # Evaluation needs no gradients: disabling autograd avoids building a graph
    # for every batch, which saves memory and time.
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device , dtype = torch.long)
            mask = d["mask"].to(device , dtype = torch.long)
            token_type_ids = d["token_type_ids"].to(device , dtype = torch.long)
            # Targets are only collected, never fed to the model, so they are
            # converted to float without a round trip through `device`.
            targets = d["targets"].to(dtype = torch.float)

            outputs = model(ids = ids, mask = mask , token_type_ids = token_type_ids)

            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    # Stack the per-batch arrays row-wise into one array each.
    return np.vstack(fin_outputs), np.vstack(fin_targets)

def _mean_spearman(outputs, targets):
    """Mean column-wise Spearman rank correlation between predictions and targets.

    Args:
        outputs: 2-D array of predictions, indexed as ``outputs[:, column]``.
        targets: 2-D array of ground-truth values with the same layout.

    Returns:
        The mean of the per-column correlations as a float. A column whose
        correlation is undefined (NaN, e.g. constant input) contributes 0.
    """
    # Local import: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule has been loaded.
    from scipy.stats import spearmanr

    coefs = []
    for col in range(targets.shape[1]):
        coef, _ = spearmanr(targets[:, col], outputs[:, col])
        coefs.append(np.nan_to_num(coef))
    return float(np.mean(coefs))


def run():
    """Fine-tune BERT on the competition data and report validation Spearman per epoch.

    Reads /content/train.csv and /content/sample_submission.csv, holds out 10%
    of the rows for validation, trains for EPOCHS epochs and prints the mean
    column-wise Spearman correlation on the validation split after each epoch.
    """
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 4
    EPOCHS = 20

    dfx = pd.read_csv("/content/train.csv",).fillna("none")
    # Uncomment to train on the first half of the rows only (quicker experiments).
    #dfx = dfx.iloc[:dfx.shape[0] // 2, :]
    df_train, df_valid = train_test_split(dfx, random_state = 42, test_size = 0.1)
    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)

    # The target columns are every column of the sample submission except the id.
    sample = pd.read_csv("/content/sample_submission.csv")
    target_cols = list(sample.drop("qa_id",axis = 1).columns)
    train_targets = df_train[target_cols].values
    valid_targets = df_valid[target_cols].values

    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

    train_dataset = BertDatasetTraining(
        qtitle = df_train.question_title.values,
        qbody = df_train.question_body.values,
        answer = df_train.answer.values,
        targets = train_targets,
        tokenizer = tokenizer,
        max_len = MAX_LEN
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = TRAIN_BATCH_SIZE,
        shuffle = True,
    )

    valid_dataset = BertDatasetTraining(
        qtitle = df_valid.question_title.values,
        qbody = df_valid.question_body.values,
        answer = df_valid.answer.values,
        targets = valid_targets,
        tokenizer = tokenizer,
        max_len = MAX_LEN
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = TRAIN_BATCH_SIZE,
        shuffle = False,
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    lr = 3e-5
    # The scheduler is stepped once per batch, so the schedule must span the
    # real number of batches per epoch (the loader rounds up for a final
    # partial batch) times the number of epochs.
    num_train_steps = len(train_data_loader) * EPOCHS

    model = BERTBaseUnCased("bert-base-uncased").to(device)
    # transformers.AdamW is deprecated in favour of the PyTorch implementation.
    # eps and weight_decay are set explicitly to the defaults the transformers
    # optimizer used, so the hyperparameters stay the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr = lr, eps = 1e-6, weight_decay = 0.0)

    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0,
        num_training_steps = num_train_steps
    )

    for epoch in range(EPOCHS):
        train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_loop_fn(valid_data_loader,model,device)

        # Both arrays have one row per validation example and one column per target.
        print(outputs.shape,targets.shape)

        # Score the model outputs against the targets. Correlating the targets
        # with themselves (as the code once did) always yields 1.0.
        spear = _mean_spearman(outputs, targets)

        print(f"Epoch : {epoch}, Spearman : {spear}")

# Start training when this code is executed as the main program.
if __name__ == "__main__":
    run()

Bi : 0, Loss : 0.7127634882926941
Bi : 10, Loss : 0.6542773842811584
Bi : 20, Loss : 0.5719514489173889
Bi : 30, Loss : 0.520977795124054
Bi : 40, Loss : 0.49720150232315063
Bi : 50, Loss : 0.4925438463687897
Bi : 60, Loss : 0.48367059230804443
Bi : 70, Loss : 0.4504952132701874
Bi : 80, Loss : 0.42435577511787415
Bi : 90, Loss : 0.48770588636398315
Bi : 100, Loss : 0.40003207325935364
Bi : 110, Loss : 0.4252758324146271
Bi : 120, Loss : 0.40870994329452515
Bi : 130, Loss : 0.44256091117858887
Bi : 140, Loss : 0.42773088812828064
Bi : 150, Loss : 0.46819859743118286
Bi : 160, Loss : 0.42457354068756104
Bi : 170, Loss : 0.42385485768318176
Bi : 180, Loss : 0.36850854754447937
Bi : 190, Loss : 0.41479921340942383
Bi : 200, Loss : 0.40132632851600647
Bi : 210, Loss : 0.43570399284362793
Bi : 220, Loss : 0.4547490179538727
Bi : 230, Loss : 0.39091554284095764
Bi : 240, Loss : 0.4304036498069763
Bi : 250, Loss : 0.3620443046092987
Bi : 260, Loss : 0.45431798696517944
Bi : 270, Loss : 0.4189

KeyboardInterrupt: ignored

In [None]:
# Random stand-ins for the targets (t) and the outputs (o), each of shape
# (304, 30), for experimenting with the Spearman computation in the next cell.
t,o = np.random.randn(304,30),np.random.randn(304,30)
t.shape,o.shape

((304, 30), (304, 30))

In [None]:
# Scratch check of the per-column Spearman computation on the first tenth of
# the columns, printing the intermediate values for inspection.
spear = []
for jj in range(t.shape[1] // 10):
    p1 = list(t[:,jj])
    # Compare the targets with the outputs `o`. Taking both lists from `t`
    # correlates a column with itself, which is always exactly 1.0.
    p2 = list(o[:,jj])
    print(np.array(p1).shape,np.array(p2).shape)
    # Compute the correlation once and reuse the result for all three uses.
    result = scipy.stats.spearmanr(p1,p2)
    print(result)
    print(np.nan_to_num(result))
    coef, _ = np.nan_to_num(result)
    spear.append(coef)
    print()

spear = np.mean(spear)

(304,) (304,)
SpearmanrResult(correlation=1.0, pvalue=0.0)
[1. 0.]

(304,) (304,)
SpearmanrResult(correlation=1.0, pvalue=0.0)
[1. 0.]

(304,) (304,)
SpearmanrResult(correlation=1.0, pvalue=0.0)
[1. 0.]

