# In [13]:
import json
import re
from rank_bm25 import BM25Okapi
from transformers import pipeline


def load_and_clean_data(file_path):
    """Load articles from a JSON file and strip punctuation from their text.

    The file is read as UTF-8 JSON and iterated as a sequence of objects;
    each object's ``'content'`` value is used as the article text.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A list with one cleaned string per article, in file order. Every
        character that is neither a word character nor whitespace is
        removed. Articles whose ``'content'`` is missing, ``null`` or
        otherwise falsy yield ``''`` so positions still line up with the
        input.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        articles = json.load(file)

    # Compile once rather than resolving the pattern for every article.
    punctuation = re.compile(r'[^\w\s]')

    # `or ''` also covers an explicit JSON null: `.get('content', '')` would
    # hand None to the regex and crash with a TypeError.
    return [
        punctuation.sub('', article.get('content') or '')
        for article in articles
    ]


def filter_relevant_articles(articles):
    """Keep only the articles that mention 'Israel' or 'Hamas'.

    Matching is a case-sensitive substring test on each article string.

    Args:
        articles: Iterable of article text strings.

    Returns:
        A list of the matching articles, in their original order.

    Raises:
        ValueError: If no article mentions either term.
    """
    relevant_articles = [
        article for article in articles
        if 'Israel' in article or 'Hamas' in article
    ]
    if not relevant_articles:
        raise ValueError("No relevant articles found.")
    return relevant_articles


def index_articles(articles):
    """Build a BM25Okapi index over whitespace-tokenized articles.

    Args:
        articles: Non-empty sequence of article strings.

    Returns:
        The ``BM25Okapi`` instance built from the tokenized articles.

    Raises:
        ValueError: If ``articles`` is empty.
    """
    if not articles:
        raise ValueError("No relevant articles found.")
    # The corpus is passed as one list of whitespace-separated tokens per
    # article.
    return BM25Okapi([document.split() for document in articles])


def answer_question(bm25, articles, question, top_n=5):
    """Answer a question using the best BM25-ranked articles as context.

    Args:
        bm25: Index whose ``get_scores(tokens)`` returns one score per
            entry of ``articles`` (presumably the object returned by
            ``index_articles``; the result must support ``argsort()``).
        articles: The article strings the index was built from, in the
            same order.
        question: Natural-language question.
        top_n: Maximum number of articles to query. Values of zero or less
            yield no answers.

    Returns:
        A list of answer strings, one per selected article, ordered from
        highest to lowest BM25 score. Empty when ``top_n <= 0`` or
        ``articles`` is empty.
    """
    # `scores[-0:]` would select *every* article, so handle non-positive
    # top_n (and an empty corpus) before ranking or loading the QA model.
    if top_n <= 0 or not articles:
        return []

    # Strip punctuation from the query the same way load_and_clean_data
    # strips it from the articles; otherwise tokens such as "Al-Shifa" or
    # "Hospital?" can never match the indexed "AlShifa" / "Hospital".
    tokenized_question = re.sub(r'[^\w\s]', '', question).split()
    doc_scores = bm25.get_scores(tokenized_question)
    # argsort is ascending: take the last top_n indices and reverse them so
    # the best-scoring article comes first.
    top_n_indices = doc_scores.argsort()[-top_n:][::-1]

    qa_pipeline = pipeline("question-answering")
    # The QA model receives the original, unmodified question.
    return [
        qa_pipeline({'question': question, 'context': articles[i]})['answer']
        for i in top_n_indices
    ]


def main(file_path, question):
    """Run the whole pipeline: load, filter, index, then answer.

    Args:
        file_path: Path to the JSON file of articles.
        question: Question to answer from the articles.

    Returns:
        The list of answers produced by ``answer_question``.

    Raises:
        ValueError: Propagated from the filtering/indexing steps when no
            relevant articles are found.
    """
    relevant_articles = filter_relevant_articles(load_and_clean_data(file_path))
    return answer_question(
        index_articles(relevant_articles),
        relevant_articles,
        question,
    )


# Example run: ask a single question against the articles file.
# The path is relative, so it is resolved against the current working
# directory.
file_path = 'news.article.json'  
question = "What happened at the Al-Shifa Hospital?"
try:
    answers = main(file_path, question)
    print(answers)
except ValueError as e:
    # filter_relevant_articles / index_articles raise ValueError when no
    # article qualifies; print the message instead of a traceback.
    print(e)


# Output: No relevant articles found.
