# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import time
import random
import string
from collections import Counter

from nltk.corpus import stopwords
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, set_seed

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset

import warnings
warnings.filterwarnings("ignore")



def seed_everything(seed = 30):
    """Seed every random number generator this notebook relies on.

    Seeds transformers (``set_seed``), Python's ``random``, NumPy and PyTorch
    (CPU plus every CUDA device), exports the seed as ``PYTHONHASHSEED`` and
    puts cuDNN into its deterministic configuration.

    :param seed: integer seed applied to all generators.
    """
    set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-select kernels at run time, which can
    # differ between runs; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

# Seeds all RNGs with seed_everything's default (30). cfg.seed (16) is a
# separate value that is only passed to train_test_split further down.
seed_everything()


class cfg:
    """Run configuration shared by the cells below."""

    # Pretrained checkpoint name used for both the tokenizer and the backbone.
    model = 'bert-base-uncased'

    # Length every tokenized text is padded / truncated to.
    max_len = 512

    # Optimisation settings.
    lr = 5e-5
    bs = 8
    EPOCHS = 10

    # random_state handed to train_test_split.
    seed = 16


# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data Loading and Preprocessing

In [2]:
# Tab-separated train / test tables. `data` carries the `label` column that
# the split and the training tensors are built from further down.
data = pd.read_csv('/kaggle/input/stumbleupon/train.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/stumbleupon/test.tsv',  sep='\t')

# Sample submission file; loaded here but not referenced again in the cells shown.
sub = pd.read_csv('/kaggle/input/stumbleupon/sampleSubmission.csv')

In [3]:
def lower_case(data):
    """Lower-case the ``boilerplate`` column of ``data``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    lowered = data['boilerplate'].str.lower()
    data['boilerplate'] = lowered
    return data

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return ``text`` with every character in ``PUNCT_TO_REMOVE`` deleted."""
    # A translation table that maps a character to None deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``STOPWORDS``.

    ``text`` is passed through ``str()`` first; the surviving tokens are
    re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token not in STOPWORDS:
            kept.append(token)
    return " ".join(kept)


# Token counts over the training boilerplate column.
# NOTE(review): this runs before preprocess_boilerplate() is applied, so the
# counts are taken on the raw text (original casing, punctuation attached),
# while remove_freqwords() is later applied to cleaned text. Confirm that
# this is the intended vocabulary.
cnt = Counter(
    word
    for text in data["boilerplate"].values
    for word in text.split()
)

# The 40 most frequent tokens.
FREQWORDS = {word for word, _ in cnt.most_common(40)}
def remove_freqwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``FREQWORDS``."""
    tokens = str(text).split()
    return " ".join(token for token in tokens if token not in FREQWORDS)

def preprocess_boilerplate(df):
    """Clean the ``boilerplate`` text column of ``df``.

    Steps, in order: strip the literal ``"title":``, ``"url":`` and
    ``"body":`` markers, lower-case, remove punctuation, remove stopwords and
    remove the frequent words.

    :param df: frame with a ``boilerplate`` column; the column is overwritten.
    :returns: the frame with the cleaned column.
    """
    # Assign the result back instead of calling
    # df['boilerplate'].replace(..., inplace=True): that form mutates a column
    # selection, which does not propagate to df under pandas Copy-on-Write.
    for json_key in (r'"title":', r'"url":', r'"body":'):
        df['boilerplate'] = df['boilerplate'].replace(to_replace=json_key, value="", regex=True)

    df = lower_case(df)

    df["boilerplate"] = (
        df["boilerplate"]
        .apply(remove_punctuation)
        .apply(remove_stopwords)
        .apply(remove_freqwords)
    )

    return df


# Clean the text column of both frames. The names are rebound to the cleaned
# frames, so the raw text is no longer available after this cell.
data = preprocess_boilerplate(data)
test = preprocess_boilerplate(test)

In [4]:
def numerical_data_preprocessing(data):
    """Coerce three columns to floats and fill what could not be parsed.

    Each listed column is converted with ``pd.to_numeric`` (unparseable
    entries become NaN, floats are downcast) and the NaNs are replaced by a
    fixed per-column value. The frame is modified in place and returned.
    """
    # column -> replacement for missing / unparseable entries
    fill_values = {
        'alchemy_category_score': 0.603,  # mean of training data
        'is_news': 0.0,
        'news_front_page': 0.5,
    }

    for column, fallback in fill_values.items():
        parsed = pd.to_numeric(data[column], errors='coerce', downcast='float')
        data[column] = parsed.fillna(fallback)

    return data

# Apply the same numeric coercion and fill values to both frames.
data = numerical_data_preprocessing(data)
test = numerical_data_preprocessing(test)

In [5]:
# Scaling the data

# Positional selection: everything from the fifth column up to, but not
# including, the last column of `data`.
num_cols = data.columns[4:-1].tolist()

# Fit on `data` only, then apply the same transform to both frames.
# NOTE(review): the scaler is fit before the train/validation split below,
# so validation rows contribute to the scaling statistics.
scaler = RobustScaler().fit(data[num_cols])

data[num_cols] = scaler.transform(data[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [6]:
# Splitting data for training and validation

train, valid = train_test_split(
    data,
    test_size=0.2,
    random_state=cfg.seed,
    stratify=data['label'],
)
# reset_index() without drop=True keeps the previous index as an 'index' column.
train = train.reset_index()
valid = valid.reset_index()

# Organizing Dataset

In [7]:
tokenizer = BertTokenizer.from_pretrained(cfg.model)

def tokenizing(data):
    """Tokenize the ``boilerplate`` column of ``data`` with the module tokenizer.

    Texts are padded / truncated to ``cfg.max_len`` and returned as PyTorch
    tensors.

    :returns: ``(input_ids, attention_mask)`` as produced by the tokenizer.
    """
    texts = list(data.boilerplate.values)
    encode_options = dict(
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=cfg.max_len,
        return_tensors='pt',
        truncation=True,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)

    return encoded['input_ids'], encoded['attention_mask']


def make_Tensor(data, test = False):
    """Turn a frame into the tensors consumed by the model.

    :param data: frame holding ``boilerplate``, the ``num_cols`` columns and,
        unless ``test`` is true, a ``label`` column.
    :param test: when true, no label tensor is built.
    :returns: ``(ids, att, num)`` for test data, otherwise
        ``(ids, att, num, y)``.
    """
    ids, att = tokenizing(data)
    numeric = data[num_cols].values

    if not test:
        labels = data['label'].values
        return ids, att, torch.Tensor(numeric), torch.Tensor(labels)

    return ids, att, torch.Tensor(numeric)
    

    
# Token ids, attention masks, numeric features and labels for each split.
trn_ids, trn_att, trn_num, trn_y = make_Tensor(train)
vld_ids, vld_att, vld_num, vld_y = make_Tensor(valid)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [8]:
class FastTensorDataLoader:
    """
    Lightweight batch iterator over a group of equally long tensors.

    Batches are produced by slicing the stored tensors directly rather than
    by collating individual samples the way TensorDataset + DataLoader does.
    Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
    """
    def __init__(self, *tensors, batch_size=32, shuffle=False):
        """
        Build the loader.
        :param *tensors: tensors to iterate over; all must agree on dim 0.
        :param batch_size: number of rows per batch (the last may be shorter).
        :param shuffle: if True, every new iteration first replaces the
            stored tensors with randomly permuted copies.
        """
        assert all(t.shape[0] == tensors[0].shape[0] for t in tensors)
        self.tensors = tensors
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.dataset_len = self.tensors[0].shape[0]

        # Number of batches, counting a trailing partial batch.
        full_batches, leftover = divmod(self.dataset_len, self.batch_size)
        self.n_batches = full_batches + 1 if leftover > 0 else full_batches

    def __iter__(self):
        """Reset the cursor (re-permuting first when shuffling) and return self."""
        if self.shuffle:
            order = torch.randperm(self.dataset_len)
            self.tensors = [t[order] for t in self.tensors]
        self.i = 0
        return self

    def __next__(self):
        """Return the next batch as a tuple with one slice per stored tensor."""
        start = self.i
        if start >= self.dataset_len:
            raise StopIteration
        stop = start + self.batch_size
        batch = tuple(t[start:stop] for t in self.tensors)
        self.i = stop
        return batch

    def __len__(self):
        """Number of batches per full pass."""
        return self.n_batches
    
    
###########################################################################


# Training batches are re-shuffled on every pass; validation keeps the row
# order of `valid` so predictions can be compared with valid['label'] directly.
trn_loader = FastTensorDataLoader(trn_ids, trn_att, trn_num, trn_y, batch_size = cfg.bs, shuffle = True)
vld_loader = FastTensorDataLoader(vld_ids, vld_att, vld_num, vld_y, batch_size = cfg.bs, shuffle = False)

# Model

In [9]:
class MODEL(nn.Module):
    """BERT classifier combined with a small branch over the numeric features.

    The backbone produces one logit per row (``num_labels=1``). The numeric
    features go through a linear layer + ReLU. Both are concatenated and
    mapped to a single sigmoid output.

    :param num_features: width of the numeric input; must equal the number of
        columns fed in as ``num`` (22 by default, as before).
    :param num_hidden: output width of the numeric branch (4 by default).
    """

    def __init__(self, num_features = 22, num_hidden = 4):
        super().__init__()
        self.backbone = BertForSequenceClassification.from_pretrained(cfg.model, num_labels=1)
        self.NumL = nn.Linear(in_features=num_features, out_features=num_hidden)
        # +1 for the single logit coming out of the backbone.
        self.out = nn.Linear(in_features=num_hidden + 1, out_features=1)

    def forward(self, input_ids, attention_masks, num):
        """Return one sigmoid-activated value per row, as a trailing dim of size 1."""
        x = self.backbone(input_ids=input_ids, attention_mask=attention_masks)
        y = torch.relu(self.NumL(num))

        # Concatenate the text logit and the numeric branch along the feature dim.
        z = torch.cat((x['logits'], y), 1)
        z = self.out(z)

        return torch.sigmoid(z)

# Training

In [10]:
###################################################################################################
def train_func(model, data_loader, criterion, optimizer):
    """Run one training pass over ``data_loader``.

    :param model: module called as ``model(ids, att, num)``.
    :param data_loader: iterable of ``(ids, att, num, y)`` batches that also
        supports ``len()``.
    :param criterion: loss called as ``criterion(predictions, y)``.
    :param optimizer: optimizer stepping the model parameters.
    :returns: sum of the batch losses divided by ``len(data_loader)``.
    """
    train_loss = 0.0

    model.train()
    for ids, att, num, y in data_loader:

        ids = ids.to(device)
        att = att.to(device)
        num = num.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        # Drop only the trailing dimension. A bare squeeze() would also remove
        # the batch dimension when the batch holds a single row, and the loss
        # would then reject the shape mismatch against y.
        y_preds = model(ids, att, num).squeeze(-1)

        loss = criterion(y_preds, y)
        loss.backward()

        optimizer.step()
        train_loss += loss.item()

    return train_loss / len(data_loader)
###################################################################################################

###################################################################################################    
def valid_func(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    :param model: module called as ``model(ids, att, num)``.
    :param data_loader: iterable of ``(ids, att, num, y)`` batches that also
        supports ``len()``.
    :param criterion: loss called as ``criterion(predictions, y)``.
    :returns: ``(mean_loss, predictions)``, where ``mean_loss`` is the summed
        batch loss divided by ``len(data_loader)`` and ``predictions`` is a
        1-D NumPy array of all batch outputs in loader order.
    """
    valid_preds = []
    valid_loss = 0.0

    model.eval()
    with torch.no_grad():
        for ids, att, num, y in data_loader:

            ids = ids.to(device)
            att = att.to(device)
            num = num.to(device)
            y = y.to(device)

            # Drop only the trailing dimension so a single-row batch stays
            # 1-D. A bare squeeze() would yield a 0-d tensor that neither the
            # loss nor np.concatenate accepts.
            y_preds = model(ids, att, num).squeeze(-1)

            loss = criterion(y_preds, y)
            valid_loss += loss.item()

            valid_preds.append(y_preds.to('cpu').detach().numpy())

    return valid_loss / len(data_loader), np.concatenate(valid_preds)
###################################################################################################

In [11]:
def start_training():
    """Train a fresh ``MODEL`` for ``cfg.EPOCHS`` epochs.

    Each epoch runs one training pass and one validation pass, saves the
    model weights to ``m_<epoch>.pth`` and prints a one-line summary.

    :returns: ``(train_losses, valid_losses, valid_prediction)``, i.e. the
        per-epoch loss lists and the validation predictions of the last epoch.
    """
    # NOTE(review): valid_prediction is only bound inside the loop, so
    # cfg.EPOCHS == 0 would fail at the return statement.
    model = MODEL().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr = cfg.lr)
    criterion = nn.BCELoss().to(device)

    train_losses, valid_losses = [], []

    for epoch in range(cfg.EPOCHS):
        epoch_started = time.time()

        train_loss = train_func(model, trn_loader, criterion, optimizer)
        valid_loss, valid_prediction = valid_func(model, vld_loader, criterion)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        # One checkpoint per epoch.
        torch.save(model.state_dict(), f'm_{epoch}.pth')

        elapsed = int(time.time() - epoch_started)
        print('Epoch {:2d} | loss: {:.4f}  | val_Loss: {:.4f} | {:d}s'.
          format(epoch, train_loss, valid_loss, elapsed))

    return train_losses, valid_losses, valid_prediction

In [12]:
# Full training run; valid_prediction holds the last epoch's validation outputs.
train_losses, valid_losses, valid_prediction = start_training()

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch  0 | loss: 0.7078  | val_Loss: 0.6963 | 364s
Epoch  1 | loss: 0.6978  | val_Loss: 0.6839 | 363s
Epoch  2 | loss: 0.6558  | val_Loss: 0.6821 | 363s
Epoch  3 | loss: 0.6132  | val_Loss: 0.6706 | 363s
Epoch  4 | loss: 0.5911  | val_Loss: 0.6504 | 363s
Epoch  5 | loss: 0.5110  | val_Loss: 0.6354 | 363s
Epoch  6 | loss: 0.5701  | val_Loss: 0.6468 | 363s
Epoch  7 | loss: 0.5563  | val_Loss: 0.6059 | 363s
Epoch  8 | loss: 0.4455  | val_Loss: 0.6035 | 363s
Epoch  9 | loss: 0.5577  | val_Loss: 0.5919 | 363s


In [13]:
#Epoch  4 | loss: 0.4837  | val_Loss: 0.5030 | 365s

In [14]:
# ROC AUC of the last epoch's validation predictions. Row order matches
# valid['label'] because vld_loader is built from `valid` with shuffle=False.
from sklearn.metrics import roc_auc_score
roc_auc_score(valid['label'], valid_prediction)

0.8046166373883766