In [1]:
!pip install rank_bm25
!pip install sentence_transformers 
!pip install gradio
!pip install emoji
!pip install underthesea

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=b7f6757e3f749da707cb34a56c080298654b58a1f29249b28cd43242d1445ce3
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
Collecting

In [142]:
import re
from underthesea import text_normalize

def remove_punctuation_vietnamese(text):
    # Define Vietnamese punctuation characters
    vietnamese_punctuation = """!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""

    # Create a translation table
    translator = str.maketrans('', '', vietnamese_punctuation)

    # Remove punctuation using the translation table and regex
    cleaned_text = re.sub(f"[{re.escape(vietnamese_punctuation)}]", '', text)

    return cleaned_text

def remove_special_characters(text):
    text = remove_punctuation_vietnamese(text)
    # Define a regular expression pattern for special characters
    pattern = re.compile(r'[^\w\s]', flags=re.UNICODE)
    
    # Use the pattern to replace special characters with an empty string
    try:
        cleaned_text = re.sub(pattern, ' ', text)
    except:
        print(text)
    cleaned_text = cleaned_text.strip()
    cleaned_text = cleaned_text.replace("...", "")
    cleaned_text = text_normalize(cleaned_text)
    text = re.sub(r'\.',' . ',cleaned_text)
    text = re.sub(r'\,', "", text)
    text = re.sub(r'\--', "", text)
    text = text.strip()
    return " ".join(text.lower().split())


In [147]:
from rank_bm25 import BM25Okapi, BM25Plus
import pandas as pd
from transformers import AutoTokenizer
import os 
import numpy as np
from underthesea import word_tokenize
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def t2t(text):
    return word_tokenize(text)

class bm25_search:

    def __init__(self):
        self.document = None
        self.bm25 = None 

    def parse_document(self, path:str):
        assert os.path.exists(path), "File not found"
        df = pd.read_csv(path)
        assert "name" in df.columns and "description" in df.columns
        self.document = df 
        #df["name"].apply(remove_special_characters) +" "+ 
        corpus = (df['name'] + " " + df['description']).apply(remove_special_characters)
        self.corpus = [doc for doc in corpus if isinstance(doc, str)]
        tokenized_corpus = [t2t(doc) for doc in corpus if isinstance(doc, str)]

        self.bm25 = BM25Okapi(tokenized_corpus)
    
    def search(self, query, topk=5):
        assert self.bm25 is not None 
        query = remove_special_characters(query)
        tokenized_query = t2t(query)
        results = self.bm25.get_scores(tokenized_query)#, self.corpus, n=5)
        #print(results)
        indices = np.array(results).argsort()[-topk:][::-1]
        return [(idx, float(results[idx]), self.document.iloc[idx]) for idx in indices] 

In [148]:
bm25 = bm25_search()
bm25.parse_document("/kaggle/input/product-search/test.csv")

In [149]:
bm25.search("Túi Handmade Tự Đan Túi ")[0]

(507,
 7.421229006587408,
 Unnamed: 0                                                  4770
 name           bóp ví nam da bò thật 4u cao cấp dáng đứng han...
 description    1 mô tả sản phẩmbóp ví nam da bò thật 4u cao c...
 Name: 507, dtype: object)

In [150]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('vietdata/product-bi-encoder')

# Corpus with example sentences
corpus = pd.read_csv("/kaggle/input/product-search/test.csv")
corpus["name"] = corpus["name"].apply(remove_special_characters)
corpus["description"] = corpus["description"].apply(remove_special_characters)
corpus_embeddings = embedder.encode(corpus["name"]+" "+corpus["description"], convert_to_tensor=True)

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

In [151]:
def bi_search(query, top_k = 5):
    query = remove_special_characters(query)
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    return [(int(idx), float(score), corpus.iloc[int(idx)]) for score, idx in zip(top_results[0], top_results[1])]

In [152]:
bi_search("điều khiển kèm pin")[1][-1].values

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([4954,
       'remote điê u khiê n điê u ho a đa năng dê sư du ng thi ch hơ p vơ i tâ t ca ca c loa i ma y la nh',
       'remote điê u khiê n điê u ho a đa năng dê sư du ng thi ch hơ p vơ i tâ t ca ca c loa i ma y la nh ha ng nhâ p khâ uđiều khiển điều hòa đa năng dùng được cho tất cả các loại máy điều hòa như'],
      dtype=object)

In [153]:
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It output then the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Pre-trained cross encoder
model = CrossEncoder('vietdata/cross_tech_sbert')

In [154]:
def rerank(query, corpus):
    query = remove_special_characters(query)
    # So we create the respective sentence combinations
    sentence_combinations = [[query, corpus_sentence[2]] for corpus_sentence in corpus]

    # Compute the similarity scores for these combinations
    similarity_scores = model.predict(sentence_combinations)

    # Sort the scores in decreasing order
    sim_scores_argsort = reversed(np.argsort(similarity_scores))

    # Print the scores
    return [(similarity_scores[idx], corpus[idx]) for idx in sim_scores_argsort]

In [167]:
def hybrid_search(query):
    bm25_s = [(i[0], i[1],remove_special_characters(i[2]["name"] + " " + i[2]["description"])) for i in bm25.search(query, 10)]
    bi_s = [(i[0], i[1], remove_special_characters(i[2]["name"] + " " + i[2]["description"])) for i in bi_search(query, 10)]    
    combine = []
    found = set()
    for i in bm25_s+bi_s:
        if i[0] not in found:
            found.add(i[0])
            combine.append(i)
    result = rerank(query, combine)
#     print(bi_s[:2])
#     print(bm25_s[:2])
    return result[:20]

In [168]:
hybrid_search("điểu khiển tv kèm pin")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[(0.8616373,
  (273,
   0.6784879565238953,
   'remote điều khiển dành cho tv led smart tv ti vi thông minh tcl hàng mới 100 chỉ cần lắp pin vào sử dụng ngay không cần cài đặtnguồn 2 viên pin aaakhoảng cách sử dụng 8 mkích thước 210 40 20 mmtrọng lượng 79 g')),
 (0.86107236,
  (327,
   13.244651839684177,
   'remote điều khiển dành cho casper tivi internet smart tv netflix youtube kèm pin remote điều khiển được nhiều model smart tv casperđầy đủ nút bấm chức năng như remote theo tvtốc độ phản hồi nhanhtặng kèm 2 viên pin aaathông tin chi tiết loại sản phẩm remote tvcông dụng điều kh')),
 (0.8608853,
  (217,
   12.284348947186633,
   'remote thay thế điều khiển dành cho philips smart tv tivi thông minh remote thay thế điều khiển được nhiều model tv philipsđầy đủ chức năng như remote theo tvtốc độ phản hồi nhanhtặng kèm pin aaa maxelllắp pin vào dùng ngay không cần cài đặtthông tin chi tiết')),
 (0.86077905,
  (241,
   10.732231406701501,
   'remote tv điều khiển dành cho samsung internet

In [169]:
import pandas as pd

queries = pd.read_csv("/kaggle/input/product-search/testdataset.csv")['question'].values.tolist()

In [None]:
pairs = {}
for i in range(len(queries)):
    item = {"question":queries[i], "results":[]}
    item["results"] = [i[1][0] for i in hybrid_search(queries[i])]
    pairs[i] = item

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

import json

with open("results.json", "w", encoding="utf-8") as f:
    json.dump(pairs, f, ensure_ascii=False)

import json

with open("/kaggle/input/product-search/results.json") as f:
    data = json.load(f)

for k in data:
    data[k]["results"] = data[k]["results"][:3]

import json

with open("results.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False)

In [159]:
def precisionk(preds, targets, k=1):
    score = 0
    for idx, i in enumerate(preds):
        count = 0
        for j in i[:k]:
            if j in targets[idx]:
                count += 1
        score = (idx*score+count/k)/(idx+1)
    return score

In [160]:
def avg_precision(preds, targets):
    score = 0
    for idx, i in enumerate(preds):
        count = 0
        sum_ = 0
        for c, j in enumerate(i):
            if j in targets[idx]:
                count += 1
                sum_ += count/(c+1)
        
        score = (idx*score+(sum_/count if count != 0 else 0))/(idx+1)
    return score

In [161]:
import pandas as pd

targets = pd.read_csv("/kaggle/input/product-search/testdataset.csv")
targets = targets["product_id"].apply(lambda x: eval(x))

In [162]:
targets = targets.values.tolist()

In [163]:
targets[0]

[250, 355, 309]

In [164]:
preds = [pairs[i]["results"] for i in range(len(pairs))]

In [165]:
precisionk(preds, targets, k=5)

0.22000000000000053

In [166]:
avg_precision(preds, targets)

0.3361873897707226

In [None]:
5