In [5]:
import re
from underthesea import word_tokenize
from rank_bm25 import BM25Okapi, BM25Plus
import pickle

In [6]:
file = open('data.txt',encoding='utf-8').read().split("\n")
data = []
for line in file:
    if re.search("^.*\.", line) or re.search("^.*\)", line):
        data.append(line.lower())

stopwords = open('../stopwords.txt', encoding='utf-8').read().split(" ")

In [7]:
def removeStopwords(sentence):
  words = word_tokenize(sentence)
  word = [w.lower() for w in words if w not in (stopwords)]
  sentence_clean = " ".join(word)
  
  return sentence_clean

In [8]:
def bm25okapi_search(tokenized_query, bm25, corpus, n_results = 1):
    """
    Function that takes a tokenized query and prints the first 100 words of the 
    n_results most relevant results found in the corpus, based on the BM25
    method.
    
    Parameters
    ----------
    @param tokenized_query: list, array-like
        A valid list containing the tokenized query.
    @param bm25: BM25 object,
        A valid object of type BM25 (BM25Okapi or BM25Plus) from the library
        `rank-bm25`, initialized with a valid corpus.
    @param corpus: list, array-like
        A valid list containing the corpus from which the BM25 object has been 
        initialized. As returned from function read_corpus().
    @param n_results: int, default = 1
        The number of top results to print.
    """
    
    # We skip checking validity of arguments for now... We assume the user 
    # knows what they're doing.
    
    # Get top results for the query
    top_results = bm25.get_top_n(tokenized_query, corpus, n = n_results)
    top_results_100words = [' '.join(top_result.split(' ')) 
                             for top_result in top_results]
    
    return top_results_100words

In [9]:
tokenized_data = [word_tokenize(doc) for doc in data]


In [11]:
bm25 = BM25Okapi(tokenized_data)

In [26]:
# Save model to load
with open('bm25result', 'wb') as bm25result_file:
    pickle.dump(bm25, bm25result_file)

In [27]:
#to read bm25 object
with open('bm25result', 'rb') as bm25result_file:
    bm25result = pickle.load(bm25result_file)

In [28]:
query = "b·ªô lu·∫≠t d√¢n s·ª± l√† g√¨ ?"
print(removeStopwords(query))
tokenized_query = word_tokenize(removeStopwords(query))

bm25okapi_search(tokenized_query = tokenized_query,
                 bm25 = bm25, 
                 corpus = data,
                 n_results = 5)

b·ªô lu·∫≠t d√¢n s·ª±


['ƒë·ªëi v·ªõi giao d·ªãch d√¢n s·ª± ƒë∆∞·ª£c x√°c l·∫≠p tr∆∞·ªõc ng√†y b·ªô lu·∫≠t n√†y c√≥ hi·ªáu l·ª±c th√¨ vi·ªác √°p d·ª•ng ph√°p lu·∫≠t ƒë∆∞·ª£c quy ƒë·ªãnh nh∆∞ sau: giao d·ªãch d√¢n s·ª± ch∆∞a ƒë∆∞·ª£c th·ª±c hi·ªán m√† c√≥ n·ªôi dung, h√¨nh th·ª©c kh√°c v·ªõi quy ƒë·ªãnh c·ªßa b·ªô lu·∫≠t n√†y th√¨ ch·ªß th·ªÉ giao d·ªãch ti·∫øp t·ª•c th·ª±c hi·ªán theo quy ƒë·ªãnh c·ªßa b·ªô lu·∫≠t d√¢n s·ª± s·ªë 33/2005/qh11 v√† c√°c vƒÉn b·∫£n quy ph·∫°m ph√°p lu·∫≠t quy ƒë·ªãnh chi ti·∫øt b·ªô lu·∫≠t d√¢n s·ª± s·ªë 33/2005/qh11, tr·ª´ tr∆∞·ªùng h·ª£p c√°c b√™n c·ªßa giao d·ªãch d√¢n s·ª± c√≥ th·ªèa thu·∫≠n v·ªÅ vi·ªác s·ª≠a ƒë·ªïi, b·ªï sung n·ªôi dung, h√¨nh th·ª©c c·ªßa giao d·ªãch ƒë·ªÉ ph√π h·ª£p v·ªõi b·ªô lu·∫≠t n√†y v√† ƒë·ªÉ √°p d·ª•ng quy ƒë·ªãnh c·ªßa b·ªô lu·∫≠t n√†y, giao d·ªãch d√¢n s·ª± ƒëang ƒë∆∞·ª£c th·ª±c hi·ªán m√† c√≥ n·ªôi dung, h√¨nh th·ª©c kh√°c v·ªõi quy ƒë·ªãnh c·ªßa b·ªô lu·∫≠t n√†y th√¨ √°p d·ª•ng quy ƒë·ªãnh c·ªßa b·ªô lu·∫≠t d√¢n s·ª± s·ªë 33/2005/qh11 v√† c√°c 

In [31]:
query = "Hi·ªáu l·ª±c thi h√†nh b·ªô lu·∫≠t d√¢n s·ª± l√† khi n√†o ?"
print("input question: ", query)
print("Cleaned question: ",removeStopwords(query))
tokenized_query = word_tokenize(removeStopwords(query))
bm25okapi_search(tokenized_query = tokenized_query,
                 bm25 = bm25, 
                 corpus = data,
                 n_results = 1)

input question:  Hi·ªáu l·ª±c thi h√†nh b·ªô lu·∫≠t d√¢n s·ª± l√† khi n√†o ?
Cleaned question:  hi·ªáu l·ª±c thi h√†nh b·ªô lu·∫≠t d√¢n s·ª± n√†o


['b·ªô lu·∫≠t n√†y c√≥ hi·ªáu l·ª±c thi h√†nh t·ª´ ng√†y 01 th√°ng 01 nƒÉm 2017, b·ªô lu·∫≠t d√¢n s·ª± s·ªë 33/2005/qh11 h·∫øt hi·ªáu l·ª±c k·ªÉ t·ª´ ng√†y b·ªô lu·∫≠t n√†y c√≥ hi·ªáu l·ª±c, b·ªô lu·∫≠t n√†y ƒë√£ ƒë∆∞·ª£c qu·ªëc h·ªôi n∆∞·ªõc c·ªông h√≤a x√£ h·ªôi ch·ªß nghƒ©a vi·ªát nam kh√≥a xiii, k·ª≥ h·ªçp th·ª© 10 th√¥ng qua ng√†y 24 th√°ng 11 nƒÉm 2015 ; ƒëi·ªÅu 689.']

In [1]:
from datasets import Dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function read file test
def read_data_test(filePath):
    f = open(filePath,encoding='utf-8')
    fileRead = json.load(f)
    question = []
    answer = []
    for object in fileRead:
        question.append(object["question"].lower())
        answer.append(object["answer"].lower())
    dataset = {
    "question": question,
    "answer": answer
    }
    return dataset

In [3]:
# To get score predict with bleu
test_data = read_data_test("./QA_data/qa_test.json")
test_data = Dataset.from_dict(test_data)

In [13]:
predicts = []
for i in test_data["question"]:
    tokenized_query = word_tokenize(removeStopwords(i))
    result = bm25okapi_search(tokenized_query = tokenized_query,
                 bm25 = bm25, 
                 corpus = data,
                 n_results = 1)
    predicts.append(result)
    

In [18]:
from nltk.translate.bleu_score import sentence_bleu
#function to get score of results
def getScore(ref, candi):
    score = sentence_bleu(ref, candi)
    return float('{:.4f}'.format(score))

In [20]:
reference= []
for i in test_data["answer"]:
    reference.append(i.split())


In [35]:
scores = []
for i in predicts:
    scores.append(getScore(reference,i[0].split()))

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [36]:
average = sum(scores)/len(scores)
average

0.42416091954023005

In [37]:
import datasets
rouge = datasets.load_metric("rouge")
rouge_output = rouge.compute(predictions=predicts, references=test_data["answer"], rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)


  


Score(precision=0.265196193452529, recall=0.8852439869308493, fmeasure=0.3662911535955582)
