# Quora Question Pair

In [14]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()

    print('There are %d GPU(s) available.' % n_gpu)

    print('We will use the GPU:', [torch.cuda.get_device_name(i) for i in range(n_gpu)])

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: ['GeForce GTX 1050 Ti']


## all required package

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#import bert tokenizer
from transformers import  BertTokenizer
#import bert classification for finetuning
from transformers import BertForSequenceClassification
# import adamw optimizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

import time
import datetime
import random
from tqdm import tqdm

In [16]:
train_data = pd.read_csv("./data/train.csv", index_col="id",nrows=1000)
train_data.head(6)

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1


In [79]:
test_data = pd.read_csv("./data/test.csv", index_col="test_id",nrows=1000)
test_data.head()

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,What but is the best way to send money from Ch...,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [22]:
# train_validation data split
X_train, X_val, y_train, y_val = train_test_split(train_data[["question1", "question2"]], 
                                                    train_data["is_duplicate"], test_size=0.2, random_state=405633)
X_train.head()

Unnamed: 0_level_0,question1,question2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
330,"How do I see ""sent invitations"" on Linkedin if...",How can you personalize a LinkedIn invitation?...
169,How do you make life suit you and stop life fr...,Why are emotionally abusive people in my life?...
419,Why can't I stop watching porn?,Why should / shouldn't I watch porn?
849,How do I lose weight without doing any sport?,How do I lose weight without doing exercise or...
651,Which payment gateway in Saudi that using to c...,"Which are all the stress free, relatively easy..."


## convert data to Bert input

In [23]:
# load bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#calculate the maximum sentence length
max_len  = 0
for _, row in train_data.iterrows():
    max_len = max(max_len, len(tokenizer(row['question1'],row['question2'])["input_ids"]))

print("max token length of the input:", max_len)
    
# set the maximum token length
max_length = pow(2,int(np.log2(max_len)-1))
print("max token length for BERT:", max_length)

max token length of the input: 92
max token length for BERT: 32


In [40]:
# func to convert data to bert input
def convert_to_dataset_torch(data: pd.DataFrame, labels = pd.Series(data=None)) -> TensorDataset:
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(row["question1"], row["question2"], max_length=max_length, pad_to_max_length=True, 
                      return_attention_mask=True, return_tensors='pt', truncation=True)
        # Add the encoded sentences to the list.
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])
    
    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    if labels.empty:
        return TensorDataset(input_ids, attention_masks, token_type_ids)
    else:
        labels = torch.tensor(labels.values)
        return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

In [41]:
train = convert_to_dataset_torch(data=X_train, labels=y_train)
validation = convert_to_dataset_torch(data=X_val, labels= y_val)

100%|██████████████████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 1069.24it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1205.36it/s]


## train

In [42]:
# set batch size for DataLoader(options from paper:16 or 32)
batch_size = 8

# Create the DataLoaders for training and validation sets
train_dataloader = DataLoader(
            train,  
            sampler = RandomSampler(train), # Select batches randomly
            batch_size = batch_size 
        )

# For validation
validation_dataloader = DataLoader(
            validation, 
            sampler = SequentialSampler(validation), # Pull out batches sequentially.
            batch_size = batch_size 
        )

In [43]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2, # The number of output labels--2  
    output_attentions=False, # Whether returns attentions weights.
    output_hidden_states=False, # Whether returns all hidden-states.
)
model.cuda()
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [44]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate
                  eps = 1e-8 # args.adam_epsilon
                )

In [45]:
# Number of training epochs
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [46]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [47]:
def fit_batch(dataloader, model, optimizer, epoch):
    total_train_loss = 0
    
    for batch in tqdm(dataloader, desc=f"Training epoch:{epoch+1}", unit="batch"):
        # Unpack batch from dataloader.
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)
        
        # clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        
        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(input_ids, 
                        token_type_ids=token_type_ids, 
                        attention_mask=attention_masks, 
                        labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # normlization of the gradients to 1.0 to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()
        
    return total_train_loss

In [48]:
def eval_batch(dataloader, model, metric=accuracy_score):
    total_eval_accuracy = 0
    total_eval_loss = 0
    predictions , predicted_labels = [], []
    
    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        # Unpack batch from dataloader.
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(input_ids, 
                            token_type_ids=token_type_ids, 
                            attention_mask=attention_masks,
                            labels=labels)
            loss = outputs[0]
            logits = outputs[1]
        total_eval_loss += loss.item()
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of validation sentences, and
        # accumulate it over all batches.
        y_pred = np.argmax(logits, axis=1).flatten()
        total_eval_accuracy += metric(label_ids, y_pred)
        
        predictions.extend(logits.tolist())
        predicted_labels.extend(y_pred.tolist())
    
    return total_eval_accuracy, total_eval_loss, predictions ,predicted_labels

In [49]:
def train(train_dataloader, validation_dataloader, model, optimizer, epochs):
    # We'll store a number of quantities such as training and validation loss, 
    # validation accuracy, and timings.
    training_stats = []
    
    # Measure the total training time for the whole run.
    total_t0 = time.time()
    
    for epoch in range(0, epochs):
        # Measure how long the training epoch takes.
        t0 = time.time()
        
        # Reset the total loss for this epoch.
        total_train_loss = 0
        
        # Put the model into training mode. 
        model.train()
        
        total_train_loss = fit_batch(train_dataloader, model, optimizer, epoch)
        
        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)
        
        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        
        t0 = time.time()
        
        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()
        

        total_eval_accuracy, total_eval_loss, _, _ = eval_batch(validation_dataloader, model)
        
        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("\n")
        print(f"score: {avg_val_accuracy}")
    
        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)
    
        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)
    
        print(f"Validation Loss: {avg_val_loss}")
        print("\n")
    
        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. score.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )
        

    print("")
    print("Training complete!")

    print(f"Total training took {format_time(time.time()-total_t0)}")
    return training_stats

In [65]:
def predict(dataloader, model):
    prediction = list()
    
    for batch in tqdm(dataloader, desc="predicting", unit="batch"):
        # Unpack batch from dataloader.
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(input_ids, 
                            token_type_ids=token_type_ids, 
                            attention_mask=attention_masks)
        logits = outputs[0]
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        
        prediction.append(logits)
        
    pred_logits = np.concatenate(prediction, axis=0)
    pred_label = np.argmax(pred_logits, axis=1).flatten()
    print("done")
    return (pred_label,pred_logits)

In [51]:
# Set the seed value all over the place to make this reproducible.
seed_val = 405633

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed_val)

training_stats = train(train_dataloader, validation_dataloader, model, optimizer, epochs)

Training epoch:1: 100%|███████████████████████████████████████████████████████████| 100/100 [00:23<00:00,  4.32batch/s]
Evaluating: 100%|███████████████████████████████████████████████████████████████████| 25/25 [00:01<00:00, 23.91batch/s]
Training epoch:2:   1%|▌                                                            | 1/100 [00:00<00:18,  5.45batch/s]



score: 0.69
Validation Loss: 0.547390798330307




Training epoch:2: 100%|███████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.46batch/s]
Evaluating: 100%|███████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 27.34batch/s]
Training epoch:3:   1%|▌                                                            | 1/100 [00:00<00:17,  5.60batch/s]



score: 0.745
Validation Loss: 0.5127146875858307




Training epoch:3: 100%|███████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.52batch/s]
Evaluating: 100%|███████████████████████████████████████████████████████████████████| 25/25 [00:01<00:00, 23.08batch/s]



score: 0.745
Validation Loss: 0.6192037010192871



Training complete!
Total training took 0:01:11





In [53]:
df_stats = pd.DataFrame(training_stats).set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. score.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.623769,0.547391,0.69,0:00:23,0:00:01
1,0.44035,0.512715,0.745,0:00:22,0:00:01
2,0.307956,0.619204,0.745,0:00:22,0:00:01


# Predition for test set

In [80]:
# Create the DataLoader for test data.
prediction_data = convert_to_dataset_torch(test_data)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1155.99it/s]


In [81]:
y_pred,logits = predict(prediction_dataloader,model)

predicting: 100%|█████████████████████████████████████████████████████████████████| 125/125 [00:04<00:00, 28.77batch/s]

done





In [82]:
prob = torch.nn.functional.softmax(torch.tensor(logits))
prob[:10]

  """Entry point for launching an IPython kernel.


tensor([[0.9711, 0.0289],
        [0.8511, 0.1489],
        [0.1098, 0.8902],
        [0.9599, 0.0401],
        [0.0837, 0.9163],
        [0.9650, 0.0350],
        [0.0459, 0.9541],
        [0.7414, 0.2586],
        [0.1790, 0.8210],
        [0.9490, 0.0510]])

In [76]:
y_pred

array([1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1], dtype=int64)