In [1]:
import pandas as pd
from underthesea import word_tokenize
from bm25 import BM25Okapi
import numpy as np
import ast

In [2]:
import re
def remove_punctuation_vietnamese(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only the 32 ASCII punctuation characters listed below are stripped;
    Unicode punctuation (curly quotes, en/em dashes, ellipsis, ...) is left
    untouched. Removed characters are deleted outright, not replaced by a
    space, so ``"a-b"`` becomes ``"ab"``.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    # Raw string so that the backslash is a literal character instead of
    # starting the invalid escape sequence ``\]`` (a warning on recent
    # Python versions).
    ascii_punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""

    # A deletion table removes the characters in a single pass.
    return text.translate(str.maketrans('', '', ascii_punctuation))

In [3]:
# Old-style -> new-style Vietnamese tone-mark placement for the vowel pairs
# oa / oe / uy. Keys are lower-case only because preprocess() lower-cases the
# text before applying the map.
# NOTE(review): keys are precomposed characters; text in a decomposed Unicode
# form would not match - confirm the input normalisation.
_TONE_MARK_MAP = {
    "òa": "oà", "óa": "oá", "ỏa": "oả", "õa": "oã", "ọa": "oạ",
    "òe": "oè", "óe": "oé", "ỏe": "oẻ", "õe": "oẽ", "ọe": "oẹ",
    "ùy": "uỳ", "úy": "uý", "ủy": "uỷ", "ũy": "uỹ", "ụy": "uỵ",
}


def preprocess(text):
    """Normalise a Vietnamese string before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. move tone marks on ``oa``/``oe``/``uy`` to the second vowel
         (e.g. ``hòa`` -> ``hoà``);
      3. surround every ``.`` with spaces, delete ``,`` and ``--``;
      4. collapse all whitespace runs to a single space and trim the ends.

    Lower-casing happens first so that every casing of a vowel pair
    (including mixed ones such as ``òA``) is normalised by the same entry.

    Args:
        text: The raw string.

    Returns:
        The normalised, lower-case string.
    """
    text = text.lower()
    for old_style, new_style in _TONE_MARK_MAP.items():
        text = text.replace(old_style, new_style)

    text = text.replace('.', ' . ')
    text = text.replace(',', '')
    # Deleted without leaving a space, so the neighbouring words are joined.
    text = text.replace('--', '')

    # split() with no argument also discards leading/trailing whitespace.
    return " ".join(text.split())

In [4]:
# Evaluation queries: later cells read the 'question' and 'product_id' columns.
testdataset_df = pd.read_csv("testdataset.csv")
# Documents to index: later cells read the 'description' and 'name' columns.
# Both paths are relative to the notebook's working directory.
test = pd.read_csv("test 1.csv")

In [5]:
from bm25 import BM25Okapi
# One document per row of `test`: description and name joined by a space.
# Missing values are replaced by "" rather than dropping the row, so that
# position i in `corpus` always corresponds to row i of `test`. Dropping rows
# would shift every later position, and the evaluation cell identifies
# documents by their position in `corpus`.
# NOTE(review): presumably `product_id` in testdataset.csv is the row position
# in "test 1.csv" - confirm; otherwise positions must be mapped to ids.
a = test['description'].fillna("") + " " + test['name'].fillna("")
# Normalise -> strip punctuation -> tokenise; BM25Okapi receives a list of token lists.
corpus = a.apply(preprocess).apply(remove_punctuation_vietnamese).apply(word_tokenize).tolist()
model = BM25Okapi(corpus)

In [6]:
# Tokenise every non-missing question with the same pipeline as the corpus:
# normalise, strip punctuation, then word-segment. dropna() keeps the original
# row labels, so the result can still be matched back to testdataset_df.
tokenized_testdataset = (
    testdataset_df['question']
    .dropna()
    .apply(lambda question: word_tokenize(remove_punctuation_vietnamese(preprocess(question))))
)

In [9]:
def calculate_map(test_Y, pred):
    """Print precision@k and mAP@k for a batch of ranked predictions.

    For each query the average precision is the sum of precision@rank over
    the ranks holding a relevant item, divided by the number of ground-truth
    items of that query (``len(test_Y[q])``). Precision@k is the precision
    over the full prediction list of the query.

    Args:
        test_Y: One collection of relevant item ids per query.
        pred: One ranked sequence of predicted item ids per query, in the
            same order as ``test_Y``.

    Returns:
        The mean average precision over all queries (0.0 when there are no
        queries).

    Raises:
        ValueError: If ``test_Y`` and ``pred`` have different lengths.
    """
    if len(test_Y) != len(pred):
        raise ValueError(
            f"ground truth has {len(test_Y)} queries but predictions have {len(pred)}"
        )

    num_queries = len(test_Y)
    if num_queries == 0:
        return 0.0

    average_precisions = []
    final_precisions = []
    for relevant_items, ranked_items in zip(test_Y, pred):
        relevant = set(relevant_items)
        # Distinct relevant items seen so far; a set so that a duplicated
        # prediction is not counted twice.
        hits = set()
        ap_numerator = 0
        precision_at_rank = 0.0  # stays 0.0 when there are no predictions
        for rank, item in enumerate(ranked_items, start=1):
            is_relevant = item in relevant
            if is_relevant:
                hits.add(item)
            precision_at_rank = len(hits) / rank
            if is_relevant:
                ap_numerator += precision_at_rank

        # A query without ground truth contributes an AP of 0 instead of
        # raising ZeroDivisionError.
        if len(relevant_items) > 0:
            average_precisions.append(ap_numerator / len(relevant_items))
        else:
            average_precisions.append(0.0)
        final_precisions.append(precision_at_rank)

    map_at_k = sum(average_precisions) / num_queries
    # The label is the length of the last prediction list.
    k_label = len(pred[-1])
    print(f"precision@{k_label} = {sum(final_precisions)/num_queries}")
    print(f"mAP@{k_label} = {round(map_at_k, 4)}")

    return map_at_k


K_VALUES = [1, 5, 10]

# Score every query once; each top-k list is a prefix of the same descending
# ranking, so the ranking is cut to the largest k and sliced below.
# NOTE(review): the ranked values are positions in `corpus`, compared directly
# with `product_id` - confirm that product ids are corpus positions.
rankings = [
    np.argsort(model.get_scores(query))[::-1][:max(K_VALUES)]
    for query in tokenized_testdataset.tolist()
]

# Take the ground truth only for the rows that still have a query (rows with a
# missing question were dropped but kept their index labels), so that entry q
# of the ground truth and entry q of the predictions describe the same row.
ground_truth = (
    testdataset_df.loc[tokenized_testdataset.index, 'product_id']
    .apply(ast.literal_eval)
    .tolist()
)

for k in K_VALUES:
    test_pred = [ranking[:k] for ranking in rankings]
    calculate_map(ground_truth, test_pred)

precision@1 = 0.25555555555555554
mAP@1 = 0.0627
precision@5 = 0.21388888888888935
mAP@5 = 0.1766
precision@10 = 0.1583333333333335
mAP@10 = 0.215
