In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import os

import datasets
from sentence_transformers.cross_encoder import CrossEncoder, CrossEncoderTrainingArguments
from sentence_transformers.cross_encoder.evaluation import CrossEncoderClassificationEvaluator
from sentence_transformers.cross_encoder.losses import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder.trainer import CrossEncoderTrainer
import logging
import traceback
from datetime import datetime



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check whether PyTorch can see a CUDA-capable GPU in this environment.
torch.cuda.is_available()

False

In [6]:
# Base directory that data, logs and model paths in this notebook are prefixed with.
folder_path = ""

# The CSV sits inside a directory that itself carries the ".csv" name.
path = folder_path + "question_pairs_cleaned.csv/question_pairs_cleaned.csv"

# Read the question pairs. `header=0` together with `names=` replaces the
# file's own header row with the column names given here, and
# `on_bad_lines='skip'` drops rows the parser cannot split into six fields.
# Renaming and dropping incomplete rows are chained (no `inplace=True`) so
# re-running the cell always starts from the file contents.
df_other = (
    pd.read_csv(
        path,
        engine="python",
        quotechar='"',
        sep=",",
        names=['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'],
        header=0,
        on_bad_lines='skip',
    )
    .rename(columns={"is_duplicate": "label"})
    .dropna()
)

print(df_other.shape)
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df_other.info()

# Keep only the columns needed downstream: the two questions and the label.
df_other = df_other[['question1', 'question2', 'label']]
df_other.head()



(403672, 6)
<class 'pandas.core.frame.DataFrame'>
Index: 403672 entries, 0 to 403682
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         403672 non-null  int64  
 1   qid1       403672 non-null  int64  
 2   qid2       403672 non-null  int64  
 3   question1  403672 non-null  object 
 4   question2  403672 non-null  object 
 5   label      403672 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 21.6+ MB
None


Unnamed: 0,question1,question2,label
0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0
1,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0
2,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0
3,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1.0
4,Should I buy tiago?,What keeps childern active and far from phone ...,0.0


In [15]:
# First carve off 20% as the test split, then 20% of the remainder as the
# validation split. Both splits are stratified on the label and use a fixed
# random_state so they are reproducible.
train_val_df, test_df = train_test_split(
    df_other,
    test_size=0.2,
    random_state=42,
    stratify=df_other["label"],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=42,
    stratify=train_val_df["label"],
)

# Wrap each pandas split as a datasets.Dataset, dropping the pandas index.
ds_train, ds_val, ds_test = (
    datasets.Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, val_df, test_df)
)

# Show the schema and row count of every split.
for split in (ds_train, ds_val, ds_test):
    print(split)


Dataset({
    features: ['question1', 'question2', 'label'],
    num_rows: 258349
})
Dataset({
    features: ['question1', 'question2', 'label'],
    num_rows: 64588
})
Dataset({
    features: ['question1', 'question2', 'label'],
    num_rows: 80735
})


# Fine-tuning

In [None]:
# Set to True to run training. With False this cell only defines the flag and
# none of the names below (model, trainer, ...) are created.
fine_tune = False
if fine_tune:
    # Take the timestamp once so the log file and the model directory of one
    # run share the same suffix (two separate datetime.now() calls can differ).
    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Log at INFO level to a per-run file and to the notebook output.
    log_file_name_with_date = run_timestamp + "_training.log"
    log_dir = folder_path + "logs"
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, log_file_name_with_date)
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ],
        # basicConfig does nothing once the root logger already has handlers;
        # force=True makes a re-run of this cell switch to the new log file.
        force=True,
    )
    train_batch_size = 64
    num_epochs = 4
    output_dir = folder_path + "models/training_ce-" + run_timestamp

    # Only request bf16 when a CUDA device that supports it is present;
    # otherwise train in full precision instead of asking for an unsupported
    # mixed-precision mode.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # define model: pretrained cross-encoder with a single output per pair
    model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    model = CrossEncoder(model_name, num_labels=1)

    # define training loss: BCE because we have binary classification
    loss = BinaryCrossEntropyLoss(model)

    # evaluator for the validation set
    dev_cls_evaluator = CrossEncoderClassificationEvaluator(
        sentence_pairs=list(zip(ds_val["question1"], ds_val["question2"])),
        labels=ds_val["label"],
        name="sem_eq_det_val",
    )
    # Run the evaluator once before training; the returned value is not used here.
    dev_cls_evaluator(model)

    # training arguments
    short_model_name = "ms-marco-MiniLM-L-6-v2"
    run_name = f"finetuned-{short_model_name}-sem-eq-det"
    args = CrossEncoderTrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=train_batch_size,
        warmup_ratio=0.1,
        fp16=False,
        bf16=use_bf16,
        # Evaluate and checkpoint on the same 500-step cadence.
        eval_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=5,
        logging_steps=100,
        run_name=run_name
    )

    trainer = CrossEncoderTrainer(
        model=model,
        args=args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        loss=loss,
        evaluator=dev_cls_evaluator,
    )
    trainer.train()

    # save the fine tuned model next to the checkpoints of this run
    final_output_dir = f"{output_dir}/final"
    model.save_pretrained(final_output_dir)

In [None]:
# Evaluate the model trained in the fine-tuning cell. `model` only exists when
# that cell ran with fine_tune = True; otherwise this cell is skipped and the
# saved model is loaded and evaluated further down in the notebook.
if fine_tune:
    test_scores_trained = model.predict(list(zip(ds_test["question1"], ds_test["question2"])))
    # predict() on this checkpoint returns raw logits (compare the output of
    # the un-fine-tuned model in this notebook), so convert to probabilities
    # before applying the 0.5 cut-off, as the other evaluation cells do.
    test_probs_trained = torch.sigmoid(torch.Tensor(test_scores_trained)).numpy()
    test_preds_trained = np.where(test_probs_trained > 0.5, 1, 0)
    print(confusion_matrix(ds_test["label"], test_preds_trained))
else:
    print("fine_tune is False: no in-memory trained model to evaluate; skipping.")


# Without fine-tuning

In [None]:
# Baseline: the pretrained cross-encoder used as-is, with one output per pair.
model_name_orig = "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_without_ft = CrossEncoder(model_name_orig, num_labels=1)

# Score every (question1, question2) pair of the test split.
test_pairs_no_ft = list(zip(ds_test["question1"], ds_test["question2"]))
test_preds_no_ft_from_model = model_without_ft.predict(test_pairs_no_ft)

# Peek at the first few raw scores.
print(test_preds_no_ft_from_model[:5])

[ 8.414917  -3.01737   -1.9269208  1.0711179 -2.7257714]


In [None]:
# Squash the raw baseline scores into (0, 1) with a sigmoid.
test_probs_no_ft = torch.sigmoid(torch.Tensor(test_preds_no_ft_from_model))
print(test_probs_no_ft[:5])

# Hard labels: 1 where the probability exceeds 0.5, otherwise 0.
test_preds_no_ft = np.where(test_probs_no_ft.numpy() > 0.5, 1, 0)
print(test_preds_no_ft[:5])

# Confusion matrix, per-class report and overall accuracy on the test split.
y_true_no_ft = ds_test["label"]
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_true_no_ft, test_preds_no_ft))

tensor([0.9998, 0.0466, 0.1271, 0.7448, 0.0615])
[1 0 0 1 0]
[[31219 19676]
 [ 3853 25987]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.73     50895
         1.0       0.57      0.87      0.69     29840

    accuracy                           0.71     80735
   macro avg       0.73      0.74      0.71     80735
weighted avg       0.77      0.71      0.71     80735

0.7085650585248033


# With fine-tuning

In [None]:
# Load the previously fine-tuned cross-encoder from disk.
# NOTE(review): this path is not prefixed with folder_path and differs from the
# "<output_dir>/final" directory the training cell saves to; confirm the
# model was copied/renamed to this location.
model_finetuned_path = "models/finetuned_model"
model_finetuned = CrossEncoder(model_finetuned_path)

# Score every (question1, question2) pair of the test split.
test_pairs_with_ft = list(zip(ds_test["question1"], ds_test["question2"]))
test_preds_with_ft_from_model = model_finetuned.predict(test_pairs_with_ft)

In [None]:
# Sigmoid over the fine-tuned model's scores, then a 0.5 cut-off for hard labels.
test_probs_with_ft = torch.sigmoid(torch.Tensor(test_preds_with_ft_from_model)).numpy()
test_preds_with_ft = np.where(test_probs_with_ft > 0.5, 1, 0)

# Confusion matrix, per-class report and overall accuracy on the test split.
y_true_with_ft = ds_test["label"]
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_true_with_ft, test_preds_with_ft))

[[46065  4830]
 [ 3356 26484]]
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92     50895
         1.0       0.85      0.89      0.87     29840

    accuracy                           0.90     80735
   macro avg       0.89      0.90      0.89     80735
weighted avg       0.90      0.90      0.90     80735

0.8986065523007369
