In [41]:
# !pip install kagglehub
#!pip install kaggle


In [42]:
import os
import re
import shutil
import string

import kagglehub
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score

from simhash import SimHash
from simhash_embed import SimHashEmbed

In [43]:
# One-time setup, intentionally left disabled: download the Quora question-pairs
# dataset with kagglehub and copy every downloaded file/directory into the
# project's data folder (the same folder the read_csv calls below load from).
# Uncomment and run once if the data directory is missing.
#
# path = kagglehub.dataset_download("quora/question-pairs-dataset")
# print("Dataset gốc nằm ở:", path)

# target_dir = "/home/hai/Projects/locality-sensitive-hashing/data/quora_question_pairs"

# os.makedirs(target_dir, exist_ok=True)

# for item in os.listdir(path):
#     s = os.path.join(path, item)
#     d = os.path.join(target_dir, item)
#     if os.path.isdir(s):
#         shutil.copytree(s, d, dirs_exist_ok=True)
#     else:
#         shutil.copy2(s, d)

# print("Dataset đã được copy vào:", target_dir)


In [44]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA (a hard-coded "cuda" makes model.to() raise there).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model that is passed to SimHashEmbed further down.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Last expression of the cell: moves the model and displays its architecture.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [52]:
# Default n-gram size shared by the tokenising helpers in this notebook.
N = 2


def get_ngrams(text, n=N):
    """Return every length-``n`` character window of ``text``, left to right.

    ``text`` is converted with ``str()``, runs of whitespace are collapsed to
    a single space and the ends are trimmed before slicing. Case is left
    untouched.

    Args:
        text: Value to tokenise; any object accepted by ``str()``.
        n: Window length in characters (defaults to the module-level ``N``).

    Returns:
        list[str]: The overlapping character n-grams; empty when the cleaned
        text is shorter than ``n``.
    """
    cleaned = re.sub(r'\s+', ' ', str(text)).strip()
    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]

def get_word_ngrams(text, n=2):
    """Return the overlapping ``n``-word shingles of ``text``.

    ``text`` is converted with ``str()``, whitespace runs are collapsed, the
    result is trimmed and lower-cased, then split into words. Each shingle is
    ``n`` consecutive words joined by a single space.

    Args:
        text: Value to tokenise; any object accepted by ``str()``.
        n: Number of words per shingle.

    Returns:
        list[str]: The word n-grams in order of appearance; empty when the
        text has fewer than ``n`` words.
    """
    normalised = re.sub(r'\s+', ' ', str(text)).strip().lower()
    words = normalised.split()
    last_start = len(words) - n + 1
    return [" ".join(words[start:start + n]) for start in range(last_start)]


def preprocess(text, n=N):
    """Turn ``text`` into the word n-gram tokens that are fed to ``SimHash``.

    The text is converted with ``str()``, lower-cased, stripped of every
    character in ``string.punctuation`` (plus ``'@'`` and ``'.'``), and then
    shingled by ``get_word_ngrams``.

    Args:
        text: Value to tokenise; any object accepted by ``str()``.
        n: Number of words per shingle (defaults to the module-level ``N``).

    Returns:
        list[str]: Whatever ``get_word_ngrams`` returns for the cleaned text.
    """
    remove_chars = string.punctuation + '@.'
    # Deletion-only translation table: every listed character maps to nothing.
    strip_table = str.maketrans('', '', remove_chars)
    lowered = str(text).lower()
    return get_word_ngrams(lowered.translate(strip_table), n)


In [53]:
# Load the Quora question pairs and keep a fixed, seeded 1000-row sample.
df = pd.read_csv(
    "/home/hai/Projects/locality-sensitive-hashing/data/quora_question_pairs/questions.csv"
).sample(1000, random_state=42)

# Fingerprint each question from its preprocessed word n-gram tokens.
df["sim1"] = df["question1"].map(lambda question: SimHash(preprocess(question)))
df["sim2"] = df["question2"].map(lambda question: SimHash(preprocess(question)))

# Pairwise distance between the two fingerprints of every row.
df["dist"] = [left.distance(right) for left, right in zip(df["sim1"], df["sim2"])]

# A pair is predicted to be a duplicate when its distance is strictly below K.
K = 15
df["pred"] = df["dist"].lt(K).astype(int)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,sim1,sim2,dist,pred
120567,120567,238932,238933,How does the Boggart work?,What would the boggart of a boggart be?,0,<Simhash value=6012720904230808588364809449893...,<Simhash value=8531778456708192513506923224210...,57,0
324466,324466,636476,636477,What is difference between project manager and...,What are the differences between project manag...,0,<Simhash value=2664038227359841099625818255919...,<Simhash value=1193176379361672249056179228874...,58,0
398558,398558,778728,778729,What hotel in Jabalpur would be safe for unmar...,What hotel in Allahabad would be safe for unma...,0,<Simhash value=1243671421109686292810893563980...,<Simhash value=2259589700171580382467293539364...,15,0
339914,339914,666314,666315,What is stronger - Super Saiyan 4 or Super Sai...,How does Gohan turn into Super Saiyan 2?,0,<Simhash value=2294950509384647050009680532638...,<Simhash value=2926234899531282244417180975896...,59,0
185732,185732,366764,366765,How do I fill in Address Line 1 and Address Li...,How do I register desired web address?,0,<Simhash value=2233649162609707169609166079834...,<Simhash value=2565825475580597395369204787094...,58,0


In [54]:
# Load and sample independently of the token-SimHash cell (same file, same
# seed) so this cell can be run on its own.
df_eb = pd.read_csv("/home/hai/Projects/locality-sensitive-hashing/data/quora_question_pairs/questions.csv")
df_eb = df_eb.sample(1000, random_state=42)

# Build the embedding-based fingerprints from df_eb's OWN question columns.
# The original iterated over `df` here, which only produced correct rows
# because both frames happen to use the same sample seed, and it raised a
# NameError when the token-SimHash cell had not been executed first.
# SimHashEmbed receives the raw question text and the sentence-embedding model.
df_eb["sim1"] = [SimHashEmbed(text=q, f=128, model=model) for q in df_eb["question1"]]
df_eb["sim2"] = [SimHashEmbed(text=q, f=128, model=model) for q in df_eb["question2"]]

# Pairwise distance between the two fingerprints of every row.
df_eb["dist"] = [s1.distance(s2) for s1, s2 in zip(df_eb["sim1"], df_eb["sim2"])]

# Duplicate threshold for the embedding variant. Note: this rebinds the global
# K that the token-SimHash cell set to 15.
K = 25
df_eb["pred"] = (df_eb["dist"] < K).astype(int)

print(df_eb[["question1","question2","is_duplicate","dist","pred"]].head())


                                                question1  \
120567                         How does the Boggart work?   
324466  What is difference between project manager and...   
398558  What hotel in Jabalpur would be safe for unmar...   
339914  What is stronger - Super Saiyan 4 or Super Sai...   
185732  How do I fill in Address Line 1 and Address Li...   

                                                question2  is_duplicate  dist  \
120567            What would the boggart of a boggart be?             0    33   
324466  What are the differences between project manag...             0    27   
398558  What hotel in Allahabad would be safe for unma...             0    21   
339914           How does Gohan turn into Super Saiyan 2?             0    42   
185732             How do I register desired web address?             0    45   

        pred  
120567     0  
324466     0  
398558     1  
339914     0  
185732     0  


In [55]:
# Score the token-SimHash predictions against the ground-truth duplicate labels.
precision, recall, f1 = (
    metric(df["is_duplicate"], df["pred"])
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Precision:{precision:.4f}")
print(f"Recall:{recall:.4f}")
print(f"F1-Score:{f1:.4f}")


Precision:0.3636
Recall:0.0117
F1-Score:0.0227


In [56]:
# Score the embedding-SimHash predictions against the ground-truth duplicate labels.
precision, recall, f1 = (
    metric(df_eb["is_duplicate"], df_eb["pred"])
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Precision:{precision:.4f}")
print(f"Recall:{recall:.4f}")
print(f"F1-Score:{f1:.4f}")

Precision:0.6583
Recall:0.6871
F1-Score:0.6724
