In [7]:
!pip install -q transformers
!pip install --upgrade -q torch
!pip install --quiet -U sentence-transformers

In [1]:
import pandas as pd
import torch
import numpy as np

# Data

In [2]:
train = pd.read_csv('train.csv')
# Force both question columns to plain strings: at least one entry was parsed
# by pandas as a float instead of text.
train['question1'] = train['question1'].map(str)
train['question2'] = train['question2'].map(str)
train.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0


In [3]:
# Dataset summary: number of rows, distinct ids in each question column, and
# distinct ids across both columns combined.
print("Rows: %d\nUnique first questions: %d\nUnique second questions: %d\nUnique questions: %d" % (len(train), train.qid1.nunique(), train.qid2.nunique(), len(np.unique([train.qid1, train.qid2]))))

Rows: 404290
Unique first questions: 290654
Unique second questions: 299364
Unique questions: 537933


In [4]:
# Hold out the last `val_ratio` share of the rows (file order, no shuffling)
# as the validation split.
# NOTE: `train` is overwritten here, so re-running this cell without reloading
# the data shrinks the training split again.
val_ratio = 0.1
split_at = int(len(train) * (1 - val_ratio))
# .copy() turns the training part into an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning; reset_index
# already returns a new frame for the validation part.
train, val = train.iloc[:split_at].copy(), train.iloc[split_at:].reset_index(drop=True)

# Huggingface Model

In [5]:
from transformers import BertTokenizer, BertConfig, BertForNextSentencePrediction


def tokenizer_to_device(tokenizer, device):
    """Wrap ``tokenizer`` so that every field it returns is moved to ``device``.

    Args:
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., return_tensors=...)`` that returns a
            mapping from field name to an object with a ``.to(device)`` method.
        device: target handed unchanged to each value's ``.to``.

    Returns:
        A function ``deviced(text, return_tensors='pt', padding=True)`` that
        returns a plain ``dict`` with the same keys (in the same order) as the
        tokenizer output, each value moved to ``device``.
    """
    def deviced(text, return_tensors='pt', padding=True):
        tokens = tokenizer(text, padding=padding, return_tensors=return_tensors)
        # Move whatever fields the tokenizer produced rather than a fixed list
        # of three keys, so tokenizers that emit no 'token_type_ids' work too.
        return {name: value.to(device) for name, value in tokens.items()}
    return deviced


# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "bert-base-uncased"

bert_tokenizer = BertTokenizer.from_pretrained(model_name)
# `tokenizer` returns tensors that already live on `device`.
tokenizer = tokenizer_to_device(bert_tokenizer, device)

# Start from the checkpoint's own configuration and override only the output
# flags; a bare BertConfig(...) would use the library defaults for every
# architecture hyper-parameter regardless of `model_name`.
config = BertConfig.from_pretrained(
    model_name,
    output_attentions=True,
    output_hidden_states=True,
    return_dict=True)
model = BertForNextSentencePrediction.from_pretrained(model_name, config=config)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Training

In [20]:
from math import *
from collections import defaultdict


def predict(sentences, tokenizer, model, device, batch_size=64):
    """Score every sentence with ``model`` in evaluation mode, batch by batch.

    The value returned for a sentence is the softmax (taken over ``dim=1`` of
    the model's ``logits``) probability at index 0 -- a probability, not a raw
    logit.

    Args:
        sentences: iterable of texts; it is materialised into a list.
        tokenizer: callable turning a list of texts into the keyword arguments
            passed to ``model``.
        model: object with ``.eval()``; ``model(**tokens)`` must expose a
            ``.logits`` tensor.
        device: not used by this function; kept so existing callers keep working.
        batch_size: number of sentences per forward pass.

    Returns:
        1-D numpy array with one score per sentence; an empty array when
        ``sentences`` is empty.
    """
    model.eval()
    with torch.no_grad():
        sentences = [*sentences]
        if not sentences:
            # np.concatenate raises ValueError on an empty list of batches.
            return np.empty(0, dtype=np.float32)

        all_scores = []
        batch_n = ceil(len(sentences) / batch_size)
        for batch_i in range(batch_n):
            batch = sentences[batch_i * batch_size:(batch_i + 1) * batch_size]
            tokens = tokenizer(batch)
            prediction = model(**tokens)
            scores = torch.nn.functional.softmax(prediction.logits, dim=1)[:, 0]\
                .to("cpu").detach().numpy()
            all_scores.append(scores)
            # Drop the references to this batch's tensors before the next batch.
            del tokens, prediction
            print("\rPrediction batch %5d / %5d\t\t\t\t\t\t\t" % (batch_i + 1, batch_n), end='')

        print("\r", end='')
        return np.concatenate(all_scores)


def preprocess_data(data):
    """Index labelled question pairs for triplet sampling.

    Args:
        data: DataFrame with the columns ``qid1``, ``qid2``, ``question1``,
            ``question2`` and ``is_duplicate`` (truthy for duplicate pairs).

    Returns:
        dict with four entries:
        ``'pos'``  -- defaultdict(list): question id -> ids it is paired with
        in duplicate rows (both directions are recorded);
        ``'neg'``  -- the same for non-duplicate rows;
        ``'text_by_id'`` -- question id -> question text (last occurrence wins);
        ``'question_ids'`` -- list of ids that have at least one positive and
        at least one negative partner.
    """
    positive_samples = defaultdict(list)
    negative_samples = defaultdict(list)
    text_by_id = {}

    total = len(data)
    rows = zip(data['qid1'], data['qid2'], data['question1'], data['question2'], data['is_duplicate'])
    # Count rows with enumerate instead of relying on the index label, which is
    # only a valid counter for a default 0..n-1 index.
    for done, (qid1, qid2, question1, question2, is_duplicate) in enumerate(rows, start=1):
        samples = positive_samples if is_duplicate else negative_samples
        samples[qid1].append(qid2)
        samples[qid2].append(qid1)
        text_by_id[qid1] = question1
        text_by_id[qid2] = question2
        # Refresh the progress line only occasionally; printing on every row
        # dominates the runtime on large frames.
        if done % 10000 == 0 or done == total:
            print("\rPreprocessing %10d / %10d" % (done, total), end='')

    question_ids = [*set(positive_samples.keys()).intersection(set(negative_samples.keys()))]
    print("\rPreprocessing done, unique valid questions: %d" % len(question_ids))

    return {'pos': positive_samples, 'neg': negative_samples, 'text_by_id': text_by_id, 'question_ids': question_ids}
    

def train_triplet(model, tokenizer, preprocessed_data, optimizer, triplet_loss, n_epochs=1, epoch_coverage=1, batch_size=16, sbert=False):
    """Train ``model`` with a triplet loss on randomly sampled question triplets.

    Every batch draws ``batch_size`` distinct anchor ids from
    ``preprocessed_data['question_ids']`` and, for each anchor, one random
    partner from its positive list and one from its negative list. Anchors,
    positives and negatives are tokenised together in a single call and the
    model output is split back into three equal parts for the loss.

    Args:
        model: network to train; ``model.train()`` is called once up front.
        tokenizer: callable turning a list of texts into the model input.
        preprocessed_data: dict with the keys ``'pos'``, ``'neg'``,
            ``'text_by_id'`` and ``'question_ids'``.
        optimizer: object providing ``zero_grad()`` and ``step()``.
        triplet_loss: callable ``(anchor, positive, negative) -> loss``.
        n_epochs: number of epochs.
        epoch_coverage: one epoch runs
            ``ceil(len(question_ids) * epoch_coverage / batch_size)`` batches.
        batch_size: anchors per batch; capped at the number of available
            question ids because anchors are drawn without replacement.
        sbert: when true the model is called as ``model(tokens)`` and the
            ``'sentence_embedding'`` entry is used; otherwise
            ``model(**tokens).logits`` is used.

    Raises:
        ValueError: if ``preprocessed_data['question_ids']`` is empty.
    """
    positive_samples = preprocessed_data['pos']
    negative_samples = preprocessed_data['neg']
    text_by_id = preprocessed_data['text_by_id']
    question_ids = preprocessed_data['question_ids']

    if len(question_ids) == 0:
        raise ValueError("preprocessed_data['question_ids'] is empty: no anchors to sample")

    # np.random.choice(..., replace=False) raises when asked for more items
    # than the population holds, so never request more anchors than exist.
    batch_size = min(batch_size, len(question_ids))
    n_batches = ceil(len(question_ids) * epoch_coverage / batch_size)

    model.train()

    for epoch_i in range(n_epochs):
        epoch_loss = 0

        for batch_i in range(n_batches):
            optimizer.zero_grad()
            anchor_ids = np.random.choice(question_ids, batch_size, replace=False)
            pos_ids = [np.random.choice(positive_samples[qid], 1)[0] for qid in anchor_ids]
            neg_ids = [np.random.choice(negative_samples[qid], 1)[0] for qid in anchor_ids]

            # One forward pass over anchors + positives + negatives, in that order.
            sentences = [text_by_id[qid] for qid in (*anchor_ids, *pos_ids, *neg_ids)]
            tokens = tokenizer(sentences)
            if sbert:
                embeddings = model(tokens)['sentence_embedding']
            else:
                embeddings = model(**tokens).logits
            anchor = embeddings[:batch_size]
            pos = embeddings[batch_size:batch_size * 2]
            neg = embeddings[batch_size * 2:]

            loss = triplet_loss(anchor, pos, neg)
            loss.backward()
            optimizer.step()
            # Clear gradients right after the step as well, so they are not
            # kept around until the next iteration.
            optimizer.zero_grad()

            batch_loss = loss.detach().cpu().item()
            epoch_loss += batch_loss
            print("\rBatch %3d / %3d, Batch loss: %5.3f, Epoch loss: %5.3f" %
                     (batch_i + 1, n_batches, batch_loss, epoch_loss / (batch_i + 1)), end='')

        # n_batches is 0 when epoch_coverage is 0; avoid dividing by zero.
        epoch_loss /= max(n_batches, 1)
        print("\rEpoch %3d / %3d, Loss: %5.3f%s" % (epoch_i + 1, n_epochs, epoch_loss, "\t"*12))

Run once (the preprocessed data is reused by every training run):

In [7]:
# Build the positive/negative partner index used for triplet sampling from the
# training split.
prep_train = preprocess_data(train)

Preprocessing done, unique valid questions: 21596


Rerun the cells below to (re)start training:

In [8]:
# Triplet margin loss with a margin of 1.0.
triplet_loss = torch.nn.TripletMarginLoss(margin=1.0)
# AdamW over all model parameters with learning rate 3e-5.
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [9]:
# 10 epochs; with epoch_coverage=0.1 each epoch draws roughly 10% as many
# anchors as there are valid questions in prep_train.
train_triplet(model, tokenizer, prep_train, optim, triplet_loss,
             n_epochs=10, epoch_coverage=0.1)

Epoch   1 /  10, Loss: 0.994						, Epoch loss: 0.994
Epoch   2 /  10, Loss: 0.931						, Epoch loss: 0.931
Epoch   3 /  10, Loss: 0.849						, Epoch loss: 0.849
Epoch   4 /  10, Loss: 0.785						, Epoch loss: 0.785
Epoch   5 /  10, Loss: 0.791						, Epoch loss: 0.791
Epoch   6 /  10, Loss: 0.724						, Epoch loss: 0.724
Epoch   7 /  10, Loss: 0.686						, Epoch loss: 0.686
Epoch   8 /  10, Loss: 0.669						, Epoch loss: 0.669
Epoch   9 /  10, Loss: 0.645						, Epoch loss: 0.645
Epoch  10 /  10, Loss: 0.654						, Epoch loss: 0.654


# Accuracy

In [21]:
def calc_encodings(df, sbert=False, batch_size=32):
    """Encode each distinct question of ``df`` once and return per-row encodings.

    Uses the notebook-level ``model``; when ``sbert`` is false it also uses the
    notebook-level ``tokenizer``, ``device`` and ``predict``.

    Args:
        df: DataFrame with the columns ``qid1``, ``question1``, ``qid2`` and
            ``question2``.
        sbert: when true, encodings come from ``model.encode(...)``; otherwise
            from ``predict(...)``.
        batch_size: forwarded to ``predict`` (ignored when ``sbert`` is true).

    Returns:
        Tuple ``(first, second)``: the stacked encodings looked up for
        ``df['qid1']`` and for ``df['qid2']``, in row order.
    """
    # Pool both question columns into a single (id, question) table and keep
    # one row per distinct pair.
    pairs = list(df[['qid1', 'question1']].values) + list(df[['qid2', 'question2']].values)
    questions = pd.DataFrame(pairs, columns=['id', 'question'])
    questions['question'] = questions['question'].apply(str)
    questions = questions.drop_duplicates().reset_index(drop=True)

    texts = questions.question
    if not sbert:
        encod = predict(texts, tokenizer, model, device, batch_size=batch_size)
    else:
        encod = model.encode(texts)

    # Row position in `questions` is the position of that question in `encod`.
    encodings = {}
    for position, question_id in enumerate(questions['id']):
        encodings[question_id] = encod[position]

    def stacked(ids):
        return np.stack(ids.apply(lambda qid: encodings[qid]).values)

    return stacked(df['qid1']), stacked(df['qid2'])
    
def calc_threshold(df, encod1, encod2):
    """Measure encoding distances per row and derive a decision threshold.

    The Euclidean distance between the two encodings of every row is stored in
    ``df['dist']`` (``df`` is modified in place). The threshold is the midpoint
    between the mean distance of duplicate rows and the mean distance of
    non-duplicate rows. A one-line summary is printed.

    Args:
        df: DataFrame with an ``is_duplicate`` column (booleans or 0/1 values),
            one row per encoding pair.
        encod1: encodings of the first questions; one entry per row, either a
            scalar per row (1-D) or a vector per row (2-D).
        encod2: encodings of the second questions, same shape as ``encod1``.

    Returns:
        Tuple ``(positive_distance, threshold, negative_distance)``.
    """
    diff = np.asarray(encod1) - np.asarray(encod2)
    if diff.ndim == 1:
        # Scalar encodings: treat each row as a one-element vector so the
        # row-wise norm below is defined.
        diff = diff[:, np.newaxis]
    distances = np.linalg.norm(diff, axis=1)

    # Build a real boolean mask. With 0/1 integers, numpy would treat the
    # values as positions (elements 0 and 1) and `~` would turn them into
    # -1 / -2 instead of negating the mask.
    is_duplicate = np.asarray(df.is_duplicate).astype(bool)
    positive_distance = distances[is_duplicate].mean()
    negative_distance = distances[~is_duplicate].mean()
    threshold = (positive_distance + negative_distance) / 2

    print("Positive d: %.3f, Avg d: %.3f, Negative d: %.3f" % (positive_distance, threshold, negative_distance))

    df['dist'] = distances
    return positive_distance, threshold, negative_distance

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Args:
        x: scalar, numpy array or pandas Series.

    Returns:
        Values in ``[0, 1]``, element-wise.
    """
    # log(1 + exp(-x)) computed through logaddexp never forms exp() of a large
    # positive number, so very negative inputs give 0.0 without an overflow
    # warning.
    return np.exp(-np.logaddexp(0, -x))

def loss(y, y_pred, eps=1e-15):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``y_pred``.

    Args:
        y: labels (0/1 or booleans); array or Series.
        y_pred: predicted probabilities of the positive label, same length.
        eps: predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns:
        ``-mean(y * log(y_pred) + (1 - y) * log(1 - y_pred))``.
    """
    # Keep predictions strictly inside (0, 1): log(0) is -inf and 0 * -inf is
    # nan, which would turn the whole mean into nan.
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

def calc_metrics(df, positive_dist, threshold, negative_dist):
    """Print accuracy and log loss of the distance-threshold duplicate classifier.

    The distance in ``df['dist']`` is turned into a duplicate probability with
    ``sigmoid(-(dist - threshold) / (negative_dist - positive_dist))``, so a
    distance equal to ``threshold`` maps to 0.5 and smaller distances map to
    higher probabilities when ``negative_dist > positive_dist``.

    Args:
        df: DataFrame with the columns ``dist`` and ``is_duplicate``.
        positive_dist: mean distance of duplicate pairs.
        threshold: distance at which the predicted probability is 0.5.
        negative_dist: mean distance of non-duplicate pairs.
    """
    # Scale by the gap between the two means passed in by the caller; the
    # function must not depend on notebook-level variables.
    scale = negative_dist - positive_dist
    pred = sigmoid(-(df['dist'] - threshold) / scale)
    accuracy = np.mean((pred > 0.5) == df.is_duplicate)
    log_loss = loss(df.is_duplicate, pred)

    print("Accuracy: %.5f" % accuracy)
    print("Log Loss: %.5f" % log_loss)

The optimizer takes up too much memory, so it **needs to be deleted** before evaluation :-(

In [None]:
# Drop the notebook's reference to the optimizer, then ask PyTorch to release
# its cached GPU memory.
# NOTE: running this cell a second time raises NameError, because `optim` no
# longer exists until the optimizer cell is run again.
del optim
torch.cuda.empty_cache()

In [None]:
# Encode the training pairs and derive the decision threshold from them.
# calc_threshold also stores the per-row distances in train['dist'].
encod1, encod2 = calc_encodings(train, batch_size=32)
positive_dist, threshold, negative_dist = calc_threshold(train, encod1, encod2)

In [188]:
# Accuracy / log loss on the training split, using the threshold derived from it.
calc_metrics(train, positive_dist, threshold, negative_dist)

Accuracy: 0.64153
Log Loss: 0.66817


In [189]:
# Encode the validation pairs. calc_threshold is called here only for its side
# effect of filling val['dist']; its return value is discarded, so the
# threshold computed on the training split stays in effect.
encod1, encod2 = calc_encodings(val)
calc_threshold(val, encod1, encod2);

Positive d: 0.474, Avg d: 0.556, Negative d: 0.639


In [190]:
# Validation metrics, evaluated with the threshold and mean distances that
# were derived from the training split.
calc_metrics(val, positive_dist, threshold, negative_dist)

Accuracy: 0.64840
Log Loss: 0.66671
