In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 14.1MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 58.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 65.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=628228d6771f

In [None]:
# install gdown to download the shared files in google drive
!pip install gdown



In [None]:
# download quora-question-pairs dataset 
!gdown --id 1nAEIkp3tGBSIetFxojp2hRigp34L6eyW

Downloading...
From: https://drive.google.com/uc?id=1nAEIkp3tGBSIetFxojp2hRigp34L6eyW
To: /content/quora-question-pairs.zip
324MB [00:05, 64.4MB/s]


In [None]:
!unzip quora-question-pairs.zip
!unzip train.csv.zip

Archive:  /content/quora-question-pairs.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv                
  inflating: test.csv.zip            
  inflating: train.csv.zip           


# Quora Question Pairs

In [None]:
import torch

# Pick the compute device once. `device` and `n_gpu` are module-level names
# that later cells read, so both must be defined on every branch.
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()

    print('There are %d GPU(s) available.' % n_gpu)

    print('We will use the GPU:', [torch.cuda.get_device_name(i) for i in range(n_gpu)])

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    # Without this, later `n_gpu > 1` checks raise NameError on CPU-only machines.
    n_gpu = 0

There are 1 GPU(s) available.
We will use the GPU: ['Tesla V100-SXM2-16GB']


## All required packages

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

#import bert tokenizer
from transformers import  BertTokenizer
#import bert classification for finetuning
from transformers import BertForSequenceClassification
# import adamw optimizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

import time
import datetime
import random
from tqdm import tqdm

In [None]:
# The full training file is fairly large, so only its first 50,000 rows are
# loaded to keep the training time manageable.
train_data = pd.read_csv(
    "./train.csv",
    nrows=50000,
    index_col="id",
)
train_data.head(6)

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1


In [None]:
# Only the first 1,000 rows of the test file are loaded.
test_data = pd.read_csv(
    "./test.csv",
    nrows=1000,
    index_col="test_id",
)
test_data.head()

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,What but is the best way to send money from Ch...,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [None]:
# Hold out 20% of the loaded question pairs for validation; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_data[["question1", "question2"]],
    train_data["is_duplicate"],
    random_state=2020,
    test_size=0.2,
)
X_train.head()

Unnamed: 0_level_0,question1,question2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
136,Does it matter whether humans are selfish or e...,Does it matter whether humanity is evil or not?
5292,What is the treatment for Prostate Enlargement?,What are treatments for prostate stones?
1293,What was Marc Srour like as a teenager?,What is it like to be Marc Srour?
25308,What reasons do people have to not join Facebook?,How can you invite people by email to join a F...
12103,Why are some people afraid of clowns?,Why are people so scared of clowns?


## Convert data to BERT input

In [None]:
# load bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Find the length, in token ids, of the longest encoded (question1, question2) pair.
max_len = 0
for question1, question2 in zip(train_data['question1'], train_data['question2']):
    max_len = max(max_len, len(tokenizer(question1, question2)["input_ids"]))

print("max token length of the input:", max_len)

# Set the maximum token length: the largest power of two that does not exceed
# max_len, capped at the tokenizer's own limit so it can never ask for more
# positions than the model supports.
# NOTE: because this rounds *down*, pairs longer than max_length are truncated
# when they are encoded for the model.
max_length = min(pow(2, int(np.log2(max_len))), tokenizer.model_max_length)
print("max token length for BERT:", max_length)

max token length of the input: 329
max token length for BERT: 256


In [None]:
# func to convert data to bert input
def convert_to_dataset_torch(data: pd.DataFrame, labels=None) -> TensorDataset:
    """Encode question pairs into a TensorDataset of BERT inputs.

    Each row's "question1" and "question2" are encoded together as one
    sequence pair, padded or truncated to the module-level `max_length`,
    using the module-level `tokenizer`.

    Args:
        data: frame with "question1" and "question2" columns.
        labels: optional pandas Series of targets, in the same row order as
            `data`. When omitted (or empty) the dataset carries no labels.

    Returns:
        TensorDataset of (input_ids, attention_masks, token_type_ids), with
        labels appended as a fourth tensor when they are provided.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(
            row["question1"],
            row["question2"],
            max_length=max_length,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Add the encoded sentences to the list.
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Each encoding was returned as its own tensor; stack them along dim 0.
    input_ids = torch.cat(input_ids, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # `None` is the default instead of an empty Series built at definition
    # time; an explicitly passed empty Series is still treated as "no labels".
    if labels is None or labels.empty:
        return TensorDataset(input_ids, attention_masks, token_type_ids)

    labels = torch.tensor(labels.values)
    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

  


In [None]:
# Encode both splits together with their labels.
# NOTE: the name `train` is re-bound by the `train()` function defined further
# down, so the DataLoader built from this dataset has to be created before
# that cell is run.
train = convert_to_dataset_torch(X_train, y_train)
validation = convert_to_dataset_torch(X_val, y_val)

100%|██████████| 40000/40000 [00:41<00:00, 964.21it/s]
100%|██████████| 10000/10000 [00:10<00:00, 979.41it/s]


## train

In [None]:
# Mini-batch size used by the DataLoaders (options from paper: 16 or 32).
batch_size = 32

# Training set: batches are drawn in random order.
train_dataloader = DataLoader(train, batch_size=batch_size, sampler=RandomSampler(train))

# Validation set: batches are read sequentially.
validation_dataloader = DataLoader(
    validation, batch_size=batch_size, sampler=SequentialSampler(validation)
)

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2, # The number of output labels--2
    output_attentions=False, # Whether returns attentions weights.
    output_hidden_states=False, # Whether returns all hidden-states.
)
# Move the model to the device selected earlier instead of calling `.cuda()`,
# which raises on a machine without a GPU.
model.to(device)
# With more than one GPU, wrap the model so each batch is split across them.
# The count is queried directly (it is 0 without CUDA), so this check does not
# depend on `n_gpu` having been defined.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# AdamW over all model parameters.
#   lr  -> args.learning_rate
#   eps -> args.adam_epsilon
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
# How many passes to make over the training set.
epochs = 3

# Total number of training steps: (number of epochs) x (number of batches per epoch).
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule spanning all training steps, with no warmup steps
# (the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The duration is rounded to the nearest whole second first; the string is
    the `str()` of the corresponding `datetime.timedelta`.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def fit_batch(dataloader, model, optimizer, epoch):
    """Run one training pass over `dataloader` and return the summed batch loss.

    Relies on the module-level `device` (where batches are moved) and
    `scheduler` (stepped once per batch). The caller is expected to have put
    the model in training mode.

    Args:
        dataloader: yields (input_ids, attention_masks, token_type_ids, labels).
        model: called with the inputs and labels; its first output is the loss.
        optimizer: optimizer stepped after every batch.
        epoch: 0-based epoch index, used only for the progress-bar label.

    Returns:
        Sum of the per-batch loss values (a Python float), not their average.
    """
    total_train_loss = 0

    for batch in tqdm(dataloader, desc=f"Training epoch:{epoch+1}", unit="batch"):
        # Unpack batch from dataloader.
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)

        # clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_masks,
                        labels=labels)
        # When the model is wrapped in DataParallel the loss comes back as one
        # value per replica; average it so .item() and .backward() work.
        # For an already-scalar loss, mean() leaves the value unchanged.
        loss = outputs[0].mean()
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    return total_train_loss

In [None]:
def eval_batch(dataloader, model, metric=accuracy_score):
    """Evaluate `model` on every batch of a labelled `dataloader`.

    Relies on the module-level `device`. The caller is expected to have put
    the model in evaluation mode.

    Args:
        dataloader: yields (input_ids, attention_masks, token_type_ids, labels).
        model: called with the inputs and labels; its first two outputs are
            the loss and the logits.
        metric: callable `metric(true_labels, predicted_labels)` applied to
            each batch separately.

    Returns:
        Tuple `(total_eval_accuracy, total_eval_loss, predictions, predicted_labels)`:
        the sum of the per-batch metric values, the sum of the per-batch
        losses, the logits of every example as nested lists, and the argmax
        label of every example.

    NOTE: the metric is summed per batch, so dividing the total by the number
    of batches gives every batch equal weight even when the last batch is
    smaller than the others.
    """
    total_eval_accuracy = 0
    total_eval_loss = 0
    predictions , predicted_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        # Unpack batch from dataloader.
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_masks,
                            labels=labels)
            # When the model is wrapped in DataParallel the loss comes back as
            # one value per replica; average it so .item() works. For an
            # already-scalar loss, mean() leaves the value unchanged.
            loss = outputs[0].mean()
            logits = outputs[1]
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Score this batch's predicted labels and accumulate over all batches.
        y_pred = np.argmax(logits, axis=1).flatten()
        total_eval_accuracy += metric(label_ids, y_pred)

        predictions.extend(logits.tolist())
        predicted_labels.extend(y_pred.tolist())

    return total_eval_accuracy, total_eval_loss, predictions ,predicted_labels

In [None]:
def train(train_dataloader, validation_dataloader, model, optimizer, epochs):
    """Train `model` for `epochs` epochs, validating after each one.

    Each epoch runs `fit_batch` on the training loader with the model in
    training mode, then `eval_batch` on the validation loader with the model
    in evaluation mode, printing the validation score and loss.

    Returns:
        A list with one dict per epoch holding the epoch index, the average
        training loss, the average validation loss and score, and the
        formatted training and validation times.
    """
    history = []

    # Wall-clock start of the whole run.
    run_start = time.time()

    for epoch in range(epochs):
        # ---- training pass ----
        phase_start = time.time()

        # Training mode (dropout layers active).
        model.train()
        summed_train_loss = fit_batch(train_dataloader, model, optimizer, epoch)

        # Average the summed loss over the number of training batches.
        avg_train_loss = summed_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - phase_start)

        # ---- validation pass ----
        phase_start = time.time()

        # Evaluation mode--the dropout layers behave differently here.
        model.eval()
        summed_score, summed_val_loss, _, _ = eval_batch(validation_dataloader, model)
        n_val_batches = len(validation_dataloader)

        # Report the score for this validation run.
        avg_val_accuracy = summed_score / n_val_batches
        print("\n")
        print(f"score: {avg_val_accuracy}")

        # Average the summed loss over the number of validation batches.
        avg_val_loss = summed_val_loss / n_val_batches
        validation_time = format_time(time.time() - phase_start)

        print(f"Validation Loss: {avg_val_loss}")
        print("\n")

        # Keep this epoch's statistics.
        epoch_record = {
            'epoch': epoch,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. score.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
        history.append(epoch_record)

    print("")
    print("Training complete!")

    print(f"Total training took {format_time(time.time()-run_start)}")
    return history

In [None]:
def predict(dataloader, model):
    """Predict a label and class probabilities for every example in `dataloader`.

    Relies on the module-level `device`. The model is switched to evaluation
    mode here, so the result does not depend on what the caller did before.

    Args:
        dataloader: yields (input_ids, attention_masks, token_type_ids); no
            labels are read.
        model: called with the inputs; its first output is the logits.

    Returns:
        Tuple `(pred_label, pred_probs)`: the argmax label of each example and
        the softmax of its logits, both as NumPy arrays. For an empty
        dataloader both arrays are empty.
    """
    # Make sure dropout is disabled even if the caller never called model.eval().
    model.eval()

    batch_probs = []

    for batch in tqdm(dataloader, desc="predicting", unit="batch"):
        # Unpack batch from dataloader.
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        token_type_ids = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_masks)
        logits = outputs[0]

        # Move logits to CPU
        logits = logits.detach().cpu()
        # Probability of each label; `dim=1` normalises across the label axis
        # explicitly instead of relying on softmax's deprecated implicit choice.
        batch_probs.append(torch.nn.functional.softmax(logits, dim=1))

    if not batch_probs:
        # Nothing to predict: return empty results instead of failing in argmax.
        print("done")
        return (np.empty((0,), dtype=np.int64), np.empty((0, 0), dtype=np.float32))

    # Concatenate once at the end rather than growing a tensor inside the loop.
    pred_probs = torch.cat(batch_probs, dim=0).numpy()
    pred_label = np.argmax(pred_probs, axis=1).flatten()
    print("done")
    return (pred_label, pred_probs)

In [None]:
# Seed every random number generator used here to make the training run repeatable.
# NOTE: this runs after the model was created, so any randomness consumed
# while building the model is not controlled by this seed.
seed_val = 2020

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
# Seed the CUDA generators unconditionally: the previous `n_gpu > 1` check
# skipped the single-GPU case, and the call is safe when CUDA is unavailable.
torch.cuda.manual_seed_all(seed_val)

training_stats = train(train_dataloader, validation_dataloader, model, optimizer, epochs)

Training epoch:1: 100%|██████████| 1250/1250 [09:29<00:00,  2.19batch/s]
Evaluating: 100%|██████████| 313/313 [00:45<00:00,  6.93batch/s]
Training epoch:2:   0%|          | 0/1250 [00:00<?, ?batch/s]



score: 0.8563298722044729
Validation Loss: 0.33124485759498973




Training epoch:2: 100%|██████████| 1250/1250 [09:30<00:00,  2.19batch/s]
Evaluating: 100%|██████████| 313/313 [00:45<00:00,  6.93batch/s]
Training epoch:3:   0%|          | 0/1250 [00:00<?, ?batch/s]



score: 0.8654153354632588
Validation Loss: 0.3236829898001763




Training epoch:3: 100%|██████████| 1250/1250 [09:31<00:00,  2.19batch/s]
Evaluating: 100%|██████████| 313/313 [00:45<00:00,  6.92batch/s]



score: 0.8651158146964856
Validation Loss: 0.3816519981047835



Training complete!
Total training took 0:30:47





In [None]:
# Tabulate the per-epoch statistics, one row per epoch, indexed by epoch number.
df_stats = (
    pd.DataFrame(training_stats)
    .set_index('epoch')
)
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. score.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.390552,0.331245,0.85633,0:09:30,0:00:45
1,0.242538,0.323683,0.865415,0:09:30,0:00:45
2,0.15689,0.381652,0.865116,0:09:31,0:00:45


# Prediction for the test set

In [None]:
# Encode the test questions (no labels) and serve them sequentially,
# in their original order.
prediction_data = convert_to_dataset_torch(test_data)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    batch_size=batch_size,
    sampler=prediction_sampler,
)

100%|██████████| 1000/1000 [00:01<00:00, 963.05it/s]


In [None]:
# Predicted label and per-label probabilities for every test example.
y_pred, probs = predict(prediction_dataloader, model)

predicting: 100%|██████████| 32/32 [00:04<00:00,  7.15batch/s]

done





In [None]:
# Predicted labels of the first five test examples.
y_pred[:5]

array([0, 0, 0, 0, 0])

In [None]:
# Per-label probabilities of the first five test examples (one row per example).
probs[:5]

array([[9.9928904e-01, 7.1098027e-04],
       [9.0432817e-01, 9.5671751e-02],
       [9.9618608e-01, 3.8138814e-03],
       [9.9849987e-01, 1.5001176e-03],
       [9.8352069e-01, 1.6479332e-02]], dtype=float32)