In [1]:
from bm25_utils import BM25Gensim
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas()

import os

# Inference device used by every model in this notebook.
# To use a GPU when available (after `import torch`):
#   device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

# Hugging Face access token. Secrets must never be hardcoded in a notebook:
# export HF_TOKEN in the environment before starting the kernel instead.
# An unset or empty variable yields None, which is passed on as `token=None`
# (presumably letting the library fall back to its own cached login or
# anonymous access - confirm against the transformers docs).
# NOTE(review): the token that used to be committed here should be revoked.
AUTH_TOKEN = os.environ.get("HF_TOKEN") or None


## **1. Load data & model**

In [2]:
# Root directory of the preprocessed artifacts; the corpus CSV and the BM25
# model path used later in this notebook are both built from it.
PROCESSED_PATH = "./processed"

In [3]:
# Load the cleaned corpus and replace every missing cell with the literal
# string "NaN" in a single chained expression.
df_windows = pd.read_csv(f"{PROCESSED_PATH}/corpus_clean.csv").fillna("NaN")

## **2. Retriever**

In [4]:
# bm25 ranking
# Passage-level BM25 retriever built from the path below.
# NOTE(review): presumably this loads a prebuilt index from disk - confirm in bm25_utils.
bm25_model = BM25Gensim(f"{PROCESSED_PATH}/outputs/bm25")

In [5]:
# cosine similarity
import torch
from sentence_transformers import util
from transformers import AutoTokenizer, AutoModel
from text_utils import preprocess

# Encoder and tokenizer used for the cosine-similarity reranker.
# NOTE(review): the load log printed below this cell reports the pooler
# weights as newly initialised. mean_pooling() only reads model_output[0]
# (the token embeddings), so the pooler head does not appear to be used
# here - confirm before acting on that warning.
model_name = 'nguyenvulebinh/vi-mrc-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=AUTH_TOKEN)
model = AutoModel.from_pretrained(model_name, token=AUTH_TOKEN)

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask tensor; it is broadcast over the embedding
            dimension and used as per-token weights.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total. The
        divisor is clamped to at least 1e-9, so a fully masked row yields
        zeros rather than a division by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    token_count = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_count

def embed_passage(passages, tokenizer, model, device='cpu'):
    """Encode a list of passages into one mean-pooled embedding per passage.

    Args:
        passages: List of strings to embed; all are tokenized as one padded,
            truncated batch.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: Encoder called with the tokenized batch as keyword arguments.
        device: Device the model and the batch are moved to.

    Returns:
        The result of ``mean_pooling`` over the model output, weighted by the
        batch attention mask.
    """
    batch = tokenizer(passages, padding=True, truncation=True, return_tensors='pt')

    model = model.to(device)
    # The return value is discarded on purpose: this relies on `.to` moving
    # the encoded batch in place.
    batch.to(device)

    # Forward pass only; no gradients are needed for retrieval.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])

def similarity_score(question, all_passage, tokenizer, model, device='cpu'):
    """Cosine similarity between a question and each candidate passage.

    Args:
        question: Query string. It is embedded as given; no preprocessing is
            applied to it inside this function.
        all_passage: Sequence of passage strings. Each one is passed through
            ``preprocess`` before being embedded.
        tokenizer: Tokenizer forwarded to ``embed_passage``.
        model: Encoder forwarded to ``embed_passage``.
        device: Device used for the forward passes.

    Returns:
        1-D NumPy array with one cosine score per passage, in input order.
        An empty input yields an empty float32 array.
    """
    # Tokenizing an empty batch fails, so answer the degenerate case directly.
    if len(all_passage) == 0:
        return np.empty(0, dtype=np.float32)

    process_paragraphs = [preprocess(doc) for doc in all_passage]
    passage_embeddings = embed_passage(process_paragraphs, tokenizer, model, device)

    question_embedding = embed_passage([question], tokenizer, model, device)

    # cos_sim returns a (1, n_passages) matrix; keep the single question row.
    cos_scores = util.cos_sim(question_embedding, passage_embeddings)[0]
    return cos_scores.to('cpu').numpy()

[nltk_data] Downloading package punkt to /home/dung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from rank_bm25 import BM25Okapi

# Unique titles in first-appearance order. A plain set() has no stable
# iteration order for strings across kernel restarts, and the position of a
# title in this list can decide which of several equally scored titles makes
# the top-k cut, so de-duplicate with an order-preserving dict instead.
titles = list(dict.fromkeys(df_windows['title'].values))
# Same normalisation as applied to queries: preprocess, lowercase, whitespace split.
tokenized_titles = [preprocess(x).lower().split() for x in titles]
bm25_title = BM25Okapi(tokenized_titles)
# Positions into `titles`; ranked in place of the title strings themselves.
title_indices = list(range(len(titles)))

In [7]:
# Sample question (in Vietnamese) used to exercise the retriever and QA cells in this notebook.
question = "Hương đang mang thai và lo lắng mình có thể gặp phải rau tiền đạo. Hương có thể kiểm tra phát hiện bệnh này từ tuần thứ mấy của thai kỳ?"

def get_topk_titles_with_score(question):
    """Return the five best BM25-matching titles for a question.

    The question is preprocessed, lowercased and whitespace-tokenized, then
    ranked against the title index.

    Returns:
        A list of ``[title, score]`` pairs in the order produced by
        ``bm25_title.get_top_n``.
    """
    query_tokens = preprocess(question).lower().split()
    top_ids = bm25_title.get_top_n(query_tokens, title_indices, n=5)
    top_scores = bm25_title.get_batch_scores(query_tokens, top_ids)

    pairs = []
    for title_id, title_score in zip(top_ids, top_scores):
        pairs.append([titles[title_id], title_score])
    return pairs

get_topk_titles_with_score(question)

[['Rau tiền đạo', 15.791215718696368],
 ['Tăng huyết áp thai kỳ', 13.2726289616419],
 ['Đái tháo đường thai kỳ', 13.2726289616419],
 ['Mang thai ngoài tử cung', 12.532570687265371],
 ['Rau bám mép có nguy hiểm không', 12.308922590612458]]

In [8]:
from functools import partial

# Bind tokenizer, encoder and device once so callers only pass (question, passages).
similarity_score_shorted = partial(similarity_score, tokenizer=tokenizer, model=model, device=device)

def get_corpus(question):
    """Retrieve and rerank the passages most relevant to a question.

    Steps:
      1. BM25 over passages returns up to 500 candidates.
      2. Candidates are kept only if their title is among the top BM25 titles
         for the query.
      3. Survivors are scored with
         ``cosine_similarity ** 2 * passage_bm25 * title_bm25`` and the ten
         highest-scoring ones are returned.

    Args:
        question: Raw question string.

    Returns:
        ``(texts, ranking_scores)`` as NumPy arrays sorted by ascending score,
        so the best passage is last. Both arrays are empty when no candidate
        survives the title filter.
    """
    query = preprocess(question).lower()

    # BM25 retrieval of the top 500 passage candidates.
    # Both results are assumed to be NumPy arrays: they are fancy-indexed below.
    top_n, bm25_scores = bm25_model.get_topk(query, topk=500)
    titles_with_scores = get_topk_titles_with_score(query)
    score_map = {k: v for k, v in titles_with_scores}

    all_titles = df_windows.title.values
    all_texts = df_windows.text.values

    filtered_indices = [i for i, v in enumerate(top_n) if str(all_titles[v]) in score_map]
    if not filtered_indices:
        # Nothing matched the top titles; embedding an empty batch would fail.
        return np.array([], dtype=str), np.array([], dtype=float)
    top_n, bm25_scores = top_n[filtered_indices], bm25_scores[filtered_indices]

    # score_map is keyed by the raw titles, so look scores up with the same
    # raw string the filter tested. Using a preprocessed title here could
    # raise KeyError whenever preprocess() alters the title.
    candidate_titles = [str(all_titles[i]) for i in top_n]
    texts = [preprocess(all_texts[i]).lower() for i in top_n]

    # Rerank and keep the ten best.
    ranking_texts = similarity_score_shorted(query, texts)
    ranking_titles = np.array([score_map[t] for t in candidate_titles])
    # NOTE(review): squaring the cosine discards its sign - confirm this is intended.
    ranking_scores = ranking_texts * ranking_texts * bm25_scores * ranking_titles

    best_idxs = np.argsort(ranking_scores)[-10:]
    ranking_scores = np.array(ranking_scores)[best_idxs]
    texts = np.array(texts)[best_idxs]

    return texts, ranking_scores

# get_corpus(question)

## **3. Predict**

In [19]:
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Binary verdict labels for the sequence classifier: class 0 is 'False',
# class 1 is 'True'. The reverse map is derived from the forward map so the
# two cannot drift apart.
id2label = dict(enumerate(('False', 'True')))

label2id = {name: class_id for class_id, name in id2label.items()}

num_labels = len(label2id)

class QAEnsembleModel(nn.Module):
    """Multiple-choice answerer built on a binary sequence classifier.

    Every (passage, question, choice) triple is rendered into one prompt and
    classified as 'False' (0) or 'True' (1). A choice is marked correct as
    soon as any passage yields a 'True' prediction for it.
    """

    def __init__(self, model_checkpoint, device="cpu"):
        """Load the tokenizer and classifier from ``model_checkpoint``.

        Args:
            model_checkpoint: Path or identifier passed to ``from_pretrained``.
            device: Device the classifier is moved to; inputs are placed on
                the same device in ``forward``.
        """
        super(QAEnsembleModel, self).__init__()

        # Remembered so forward() can put inputs where the model lives.
        self.device = device

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_checkpoint,
            num_labels=num_labels,
            label2id=label2id,
            id2label=id2label,
            use_fast=True,
            cls_token="<s>",
            sep_token="</s>"
        )

        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint,
            num_labels=num_labels
        ).to(device)

    def forward(self, question, choices, texts, ranking_scores=None):
        """Predict which choices are correct.

        Args:
            question: Question string.
            choices: List of choice strings. A leading "A." to "F." label is
                stripped before the prompt is built.
            texts: Candidate passages used as evidence.
            ranking_scores: Optional per-passage weights aligned with
                ``texts``; defaults to all ones.

        Returns:
            A string with one character per choice: "1" for choices predicted
            correct and "0" otherwise. If no choice is predicted correct, the
            choice with the highest weighted (true - false) logit margin is
            forced to "1". Returns "" when ``choices`` is empty.
        """
        if ranking_scores is None:
            ranking_scores = np.ones((len(texts),))

        best_positive_index = 0
        # Start below every real margin. Whenever the fallback is needed all
        # predictions were 0, so with non-negative weights every margin is
        # <= 0; an initial value of 0 could never be beaten and the fallback
        # would always select the first choice.
        best_positive_score = float("-inf")
        all_choices_answers = ["0"] * len(choices)
        for idx, c in enumerate(choices):
            # Strip a leading option label such as "A." or "B.". Requiring the
            # dot keeps a bare "A" from being reduced to an empty string.
            label, dot, remainder = c.partition('.')
            if dot and label in ("A", "B", "C", "D", "E", "F"):
                c = remainder.strip()

            for text, score in zip(texts, ranking_scores):
                prompt = f"<s>{text}</s>{question}</s>{c}</s>"
                # Inputs must sit on the same device as the model.
                model_inputs = self.tokenizer(
                    prompt,
                    return_tensors="pt"
                ).to(self.device)
                # Inference only: skip building the autograd graph.
                with torch.no_grad():
                    outputs = self.model(**model_inputs)
                logits = outputs[0]
                prediction = torch.argmax(logits, dim=1).item()
                # .cpu() is required before .numpy() for non-CPU tensors.
                weighted_logits = logits.detach().cpu().numpy()[0] * score
                # NOTE(review): a negative margin is also multiplied by the
                # ranking weight, so highly ranked passages are penalised
                # more strongly in the fallback - confirm this is intended.
                confident_score = weighted_logits[1] - weighted_logits[0]
                if confident_score > best_positive_score:
                    best_positive_score = confident_score
                    best_positive_index = idx
                # Prioritise positive answers: one supporting passage is enough.
                if prediction == 1:
                    all_choices_answers[idx] = str(prediction)
                    break

        # Every question is expected to have at least one correct choice, so
        # fall back to the most confident one when nothing was predicted true.
        if all_choices_answers and '1' not in all_choices_answers:
            all_choices_answers[best_positive_index] = "1"
        answer = "".join(all_choices_answers)

        return answer


In [20]:
# QAEnsembleModel is defined in the cell above; an equivalent class could be
# imported from a module instead:
# from qa_model import QAEnsembleModel

# Build the answerer from the checkpoint directory below, on the notebook-wide `device`.
qa_model = QAEnsembleModel("./model/fine_tuned_model/chieunq/xlm-r-base-uit-viquad/", device=device)

In [21]:
# End-to-end check: retrieve evidence for the sample question, then score each choice.
texts, ranking_scores = get_corpus(question)
# The "B.Tuần 20" entry has no space after the dot; the label stripping in forward() handles both forms.
choices = ["A. Tuần 10", "B.Tuần 20", "C. Tuần 30", "D. Tuần 40" ]
answer = qa_model(question, choices, texts, ranking_scores=ranking_scores)
# One character per choice: "1" marks a choice predicted correct.
print(answer)

0100
