# Project: Kalapa Challenges - Vietnamese Medical Question Answering

## Importing library packages

In [1]:
import re
import os
import math
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

## Preprocessing Corpus dataset

In [4]:
files = os.listdir("../corpus")
corpus = []

# Preprocessing corpus dataset (remove html tags, links, unnecessary characters, ...)
if not os.path.exists("../preprocessed_corpus"):
    os.makedirs("../preprocessed_corpus")
for file in files:
    dat = ""
    with open("../corpus/" + file, "r") as f:
        data = f.read()
        dat = re.sub("(<\S+>)", "" ,data)
        dat = re.sub("(^https?:\/\/\S+)", "", dat)
        dat = dat.replace(">", "")
        corpus.append(dat)
        
    with open("../preprocessed_corpus/" + file, "w") as f:
        f.write(dat)
    
print(f"Corpus size: {len(corpus)}")

Corpus size: 603


In [5]:
# Initialize TF-IDF model
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0.0, stop_words = 'english')

# Vectorizing the documents into sparse matrix
tfidf_matrix =  tf.fit_transform(corpus)
feature_names = tf.get_feature_names() 

# Matrix TF-IDF with each row and each colums representing each document in the corpus (disease) and word phrase extracted from the corpus
print(f"Vector shape: {tfidf_matrix.shape}")

# Convert the sparse matrix into complete matrix
dense = tfidf_matrix.todense()

Vector shape: (603, 874363)


## Preprocessing testing dataset

In [11]:
import pandas as pd 

test = pd.read_csv("../public_test.csv")

# Make list of all the options of all questions
cols = [col for col in test.columns if "option" in col]
# Remove options with nan valuesP
raw_options = []
for index, row in test[cols].iterrows():
    raw_options.append(row.dropna().values)

def process_element(element):
    return np.array([re.sub(r'^[A-Z]\.', '', item).strip() for item in element], dtype='object')

options = [process_element(item) for item in raw_options]
print(f"Answer sample: {options[np.random.randint(0, len(options))]}")

# Vectorize list of questions into matrix with each row having the same length as the vectorized corpus matrix
question_vec = test["question"].apply(lambda x: tf.transform([x]).todense()).values

Answer sample: ['Vi khuẩn lậu' 'Chlamydia' 'Không có tác nhân nào gây ra']


In [34]:
idx_ls = []
# Find the most similar document in the corpus for each question represented as the index of the word phrase with the highest score
for question in tqdm(question_vec):
    idx_ls.append(linear_kernel(question, dense).argmax())

100%|██████████| 100/100 [00:59<00:00,  1.67it/s]


---

## Inference

In [35]:
class BM25:

    def __init__(self, k1=1.5, b=0.75):
        self.b = b
        self.k1 = k1

    def fit(self, corpus):
        """
        Fit the various statistics that are required to calculate BM25 ranking
        score using the corpus given.

        Parameters
        ----------
        corpus : list[list[str]]
            Each element in the list represents a document, and each document
            is a list of the terms.

        Returns
        -------
        self
        """
        tf = []
        df = {}
        idf = {}
        doc_len = []
        corpus_size = 0
        for document in corpus:
            corpus_size += 1
            doc_len.append(len(document))

            # compute tf (term frequency) per document
            frequencies = {}
            for term in document:
                term_count = frequencies.get(term, 0) + 1
                frequencies[term] = term_count

            tf.append(frequencies)

            # compute df (document frequency) per term
            for term, _ in frequencies.items():
                df_count = df.get(term, 0) + 1
                df[term] = df_count

        for term, freq in df.items():
            idf[term] = math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.doc_len_ = doc_len
        self.corpus_ = corpus
        self.corpus_size_ = corpus_size
        self.avg_doc_len_ = sum(doc_len) / corpus_size
        return self

    def search(self, query):
        scores = [self._score(query, index) for index in range(self.corpus_size_)]
        return scores

    def _score(self, query, index):
        score = 0.0

        doc_len = self.doc_len_[index]
        frequencies = self.tf_[index]
        for term in query:
            if term not in frequencies:
                continue

            freq = frequencies[term]
            numerator = self.idf_[term] * freq * (self.k1 + 1)
            denominator = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)
            score += (numerator / denominator)

        return score

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [39]:
# Import library
import random
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm 

def inference(idx_ls, test, corpus, retrieval_model, embedding_model, segementing_model, k:int=5):
    res = {"id": [],
           "answer": []}
    
    for i in tqdm(range(len(idx_ls))):
        # Get the index of the document in the corpus for the corresponding question
        idx = idx_ls[i]
        # print(f"Question: {idx}")
        
        question = test["question"][i]
        # print("question: ", question)
        # print("options: ", options[i])
        
        # Tokenize the document
        sentences_list = corpus[idx].replace("\n", " ").split(". ")
        words_lists = [[word for word in sentence.lower().split()] for sentence in sentences_list]
        
        # Fit BM25 model to the document
        retrieval_model.fit(words_lists)

        # Split question into sequence of single words
        query = question.split()

        # Score each preprocessed sentence in the selected document above
        scores = retrieval_model.search(query)
        # Sort the score list in descending order
        scores_index = np.argsort(scores)[::-1]

        # Get top k sentences (the top candidate in the list order is yet to be the best sentence)
        top_k = np.array([sentences_list[i] for i in scores_index])[:k]

        # Segmenting sentences of top k candidates, questions, and options
        segmented_top_k = []
        for sentence in top_k:
            segmented_sentence = segementing_model.word_segment(sentence)
            segmented_top_k.append(" ".join(segmented_sentence))

        segmented_question = " ".join(segementing_model.word_segment(question))

        segmented_options = []
        for option in options[i]:
            segmented_option = segementing_model.word_segment(option)
            segmented_options.append(" ".join(segmented_option))
        
        # Word embedding for top candidates
        top_k_embedding = np.zeros((len(segmented_top_k), 768))
        for index, segmented_sentence in enumerate(segmented_top_k):
            top_k_embedding[index] = embedding_model.encode(segmented_sentence)
        
        # Word embedding for question
        query = embedding_model.encode(segmented_question)

        # Word embedding for options
        options_embedding = np.zeros((len(segmented_options), 768))
        for index, segmented_option in enumerate(segmented_options):
            options_embedding[index] = embedding_model.encode(segmented_option)
        
        # Calculate similarity level between queries and embedding of top candidate sentences
        idx = cosine_similarity([query], top_k_embedding).argmax()
        # Get the final sentence in the top k candidates with the most similarity to the query
        answer = segmented_top_k[idx]
        
        # Calculate similarity level between options and the final sentence to choose the the final option for the query
        scores = cosine_similarity(np.array([embedding_model.encode(answer)]), options_embedding) 
        max_idx = scores.argmax()
        
        predictions = np.zeros(scores.shape[1], dtype=int)
        
        predictions[max_idx] = 1
        
        res["id"].append(test["id"][i])
        res["answer"].append("".join([str(pred) for pred in predictions]))
        
        # print(f"Reference: {answer}")
        # print(f"Answer: {options[i][max_idx]}")
        
    df = pd.DataFrame(columns=['id', 'answer'], data=res)
    
    return df

In [37]:
# Initialize models
retrieval_model = BM25()
embedding_model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder', cache_folder="./cache")

import py_vncorenlp
# py_vncorenlp.download_model(save_dir='./cache/vncorenlp')
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/space/hotel/taile/kalapa/src/cache/vncorenlp')

2023-10-29 14:24:20 INFO  WordSegmenter:24 - Loading Word Segmentation model


In [40]:
df = inference(idx_ls, test, corpus, retrieval_model, embedding_model, rdrsegmenter)
df

100%|██████████| 100/100 [00:18<00:00,  5.33it/s]


Unnamed: 0,id,answer
0,level3_1,0100
1,level3_2,0010
2,level3_5,0010
3,level3_13,10
4,level3_14,0100
...,...,...
95,level4_4,1000
96,level4_9,0010
97,level4_27,0001
98,level4_28,1000


In [41]:
df.to_csv("/space/hotel/taile/kalapa/src/submission.csv", index=False)