<a href="https://colab.research.google.com/github/vibhav-rai/quora-question-pairs/blob/main/quora_question_pair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('/content/train.csv')

In [3]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
train_df.shape

(404290, 6)

In [5]:
train_df_share = train_df['is_duplicate'].value_counts().reset_index()
total_cnt = train_df.shape[0]
train_df_share['perc'] = ((train_df_share['count']/total_cnt)*100).round(2)
train_df_share.head()

Unnamed: 0,is_duplicate,count,perc
0,0,255027,63.08
1,1,149263,36.92


In [6]:
train_df_v1, test_df_v1 = train_test_split(train_df, test_size = 0.2, stratify = train_df['is_duplicate'], random_state = 42)
train_df_v1, val_df_v1 = train_test_split(train_df_v1, test_size = 0.2, stratify = train_df_v1['is_duplicate'], random_state = 42)

print(train_df_v1.shape)
print(test_df_v1.shape)
print(val_df_v1.shape)

(258745, 6)
(80858, 6)
(64687, 6)


In [7]:
train_perc = (train_df_v1.shape[0]/total_cnt)*100
val_perc = (val_df_v1.shape[0]/total_cnt)*100
test_perc = (test_df_v1.shape[0]/total_cnt)*100

print(train_perc)
print(val_perc)
print(test_perc)

63.99985159167924
16.00014840832076
20.0


In [8]:
train_df_v1_share = train_df_v1['is_duplicate'].value_counts().reset_index()
total_cnt = train_df_v1.shape[0]
train_df_v1_share['perc'] = ((train_df_v1_share['count']/total_cnt)*100).round(2)
train_df_v1_share.head()

Unnamed: 0,is_duplicate,count,perc
0,0,163217,63.08
1,1,95528,36.92


In [9]:
# converting pandas DataFrames into a Hugging Face Dataset format, which is what the Transformers library expects for training, evaluation, and tokenization.
ds = DatasetDict({
    "train" : Dataset.from_pandas(train_df_v1, preserve_index = False),
    "validation" : Dataset.from_pandas(val_df_v1, preserve_index = False),
    "test" : Dataset.from_pandas(test_df_v1, preserve_index = False)
})


In [10]:
from datasets import Value

# Replace None/NaN with empty strings
def clean_text(e):
    q1 = e["question1"]
    q2 = e["question2"]
    e["question1"] = "" if q1 is None or (isinstance(q1, float) and q1 != q1) else str(q1)
    e["question2"] = "" if q2 is None or (isinstance(q2, float) and q2 != q2) else str(q2)
    return e

ds = ds.map(clean_text)

ds = ds.cast_column("question1", Value("string"))
ds = ds.cast_column("question2", Value("string"))

# Tokenizer + map
name = 'bert-base-uncased'
tok = AutoTokenizer.from_pretrained(name)

def tok_fn(batch):
  return tok( batch['question1'], batch['question2'], truncation = True, max_length = 160, padding = False )


ds_tok = ds.map(tok_fn, batched = True).rename_column('is_duplicate', 'labels').remove_columns(['question1', 'question2'])
ds_tok.set_format('torch')

Map:   0%|          | 0/258745 [00:00<?, ? examples/s]

Map:   0%|          | 0/64687 [00:00<?, ? examples/s]

Map:   0%|          | 0/80858 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/258745 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64687 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/80858 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/258745 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64687 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/80858 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/258745 [00:00<?, ? examples/s]

Map:   0%|          | 0/64687 [00:00<?, ? examples/s]

Map:   0%|          | 0/80858 [00:00<?, ? examples/s]

In [11]:
print(ds_tok)

DatasetDict({
    train: Dataset({
        features: ['id', 'qid1', 'qid2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 258745
    })
    validation: Dataset({
        features: ['id', 'qid1', 'qid2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 64687
    })
    test: Dataset({
        features: ['id', 'qid1', 'qid2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80858
    })
})


In [12]:
!pip install evaluate
import evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [13]:
import evaluate, numpy as np
acc = evaluate.load("accuracy"); f1 = evaluate.load("f1")
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": acc.compute(predictions=preds, references=p.label_ids)["accuracy"],
        "f1": f1.compute(predictions=preds, references=p.label_ids, average="binary")["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(name, num_labels = 2)

args = TrainingArguments(
    output_dir="qqp-min",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    weight_decay=0.01,
    fp16=True,
    report_to= "none",
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Train + Quick Evaluation
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = ds_tok['train'],
    eval_dataset = ds_tok['validation'],
    tokenizer = tok,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=0)]
)

trainer.train()
print(trainer.evaluate(ds_tok['test']))

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2644,0.260761,0.890967,0.85792


{'eval_loss': 0.25535374879837036, 'eval_accuracy': 0.8930965396126542, 'eval_f1': 0.860827563999356, 'eval_runtime': 85.5707, 'eval_samples_per_second': 944.926, 'eval_steps_per_second': 29.531, 'epoch': 1.0}
