In [1]:
import requests
from bs4 import BeautifulSoup, NavigableString
import re
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import tkinter as tk
from tkinter import messagebox
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
def get_paragraph_text(p):
    """Return the concatenated text of the direct children of ``p``.

    Children that are ``NavigableString`` instances contribute their
    ``.string`` value; every other child contributes its ``.text``.

    Args:
        p: An element exposing a ``children`` iterable (used here on the
            ``<p>`` elements found while scanning a parsed page).

    Returns:
        str: The pieces joined in child order; ``''`` when there are no
        children.
    """
    pieces = (
        child.string if isinstance(child, NavigableString) else child.text
        for child in p.children
    )
    return ''.join(pieces)

In [3]:
def get_wiki_extract(url, timeout=10):
    """Download a page and split it into heading/paragraph sections.

    For every ``h1``-``h6`` heading (except one whose text is exactly
    ``'Contents'``), the text of all ``<p>`` elements that follow it, up to
    the next heading, is collected. Headings with no paragraph text are
    dropped. ``[<digits>]`` and ``[edit]`` markers are stripped from both the
    heading text and the paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        list[list[str]]: One ``[title, heading_tag_name, text]`` entry per
        retained section, in document order.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status; otherwise the error page itself would be parsed as if it
            were the article.
        requests.exceptions.RequestException: On connection problems or
            timeout.
    """
    headers = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    marker_pattern = re.compile(r'\[\d+\]|\[edit\]')

    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    wiki_extract = []
    # Ask the parser for heading tags directly rather than visiting every tag.
    for heading in soup.find_all(headers):
        if heading.text == 'Contents':
            continue

        paragraphs = []
        for element in heading.next_elements:
            # The next heading marks the end of this section.
            if element.name in headers:
                break
            if element.name == 'p':
                paragraphs.append(get_paragraph_text(element))

        body = ''.join(paragraphs)
        if body != '':
            wiki_extract.append([
                marker_pattern.sub('', heading.text),
                heading.name,
                marker_pattern.sub('', body),
            ])
    return wiki_extract

In [4]:
def get_final_answer(question, url):
    """Return the highest-scoring answer to ``question`` for the page at ``url``.

    Calls ``get_answers`` to obtain one candidate per page section and picks
    the candidate with the largest ``'score'``. When several candidates share
    the top score, the earliest one wins.

    Args:
        question: The question text passed through to ``get_answers``.
        url: The page address passed through to ``get_answers``.

    Returns:
        The ``'answer'`` value of the best candidate, or ``None`` when
        ``get_answers`` produced no candidates (e.g. no section with
        paragraph text was found).
    """
    answers = get_answers(question, url)
    # No candidates: report "no answer" explicitly. Building a DataFrame from
    # an empty list yields no 'score' column to rank by.
    if not answers:
        return None
    best = max(answers, key=lambda candidate: candidate['score'])
    return best['answer']

In [5]:
# Question-answering pipelines already built, keyed by model name, so the
# model is loaded once per kernel session instead of once per question.
_QA_PIPELINES = {}


def _get_question_answerer(model_name):
    """Return the cached question-answering pipeline for ``model_name``, building it on first use."""
    if model_name not in _QA_PIPELINES:
        _QA_PIPELINES[model_name] = pipeline("question-answering", model=model_name)
    return _QA_PIPELINES[model_name]


def get_answers(question, url, model_name="PremalMatalia/roberta-base-best-squad2"):
    """Run the question-answering model over every section of a page.

    The page is split into sections with ``get_wiki_extract`` and the model
    is asked ``question`` once per section, using that section's paragraph
    text as context.

    Args:
        question: The question to ask.
        url: Address of the page to extract sections from.
        model_name: Model identifier handed to the ``question-answering``
            pipeline. Alternatives previously tried in this notebook:
            ``"deepset/roberta-base-squad2"`` and
            ``"deepset/deberta-v3-large-squad2"``.

    Returns:
        list[dict]: One dict per section, in page order, containing
        ``'title'``, ``'title_tag'`` and ``'paragraph'`` from the section
        plus every key of the pipeline's result for that section.
    """
    question_answerer = _get_question_answerer(model_name)
    wiki_extract = get_wiki_extract(url)
    answers = []
    for title, title_tag, paragraph in wiki_extract:
        result = question_answerer(question=question, context=paragraph)
        answers.append({
            'title': title,
            'title_tag': title_tag,
            'paragraph': paragraph,
            **result,
        })
    return answers