In [2]:
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Create the FastAPI application object; route handlers in this notebook
# register themselves on it through decorators such as @app.post.
app = FastAPI()

In [4]:
# Read the entities/relations CSV and swap every missing value for an empty
# string in one chained, non-mutating expression.
news_data = pd.read_csv("NewsArticles/entities_and_relations.csv").fillna("")

In [5]:
# Model for search queries
class SearchQuery(BaseModel):
    """Request body for the search endpoint.

    Attributes:
        query: Free-text search string that articles are ranked against.
        filters: Optional list of strings used to narrow the articles by
            their "Category" column; defaults to an empty list (no filtering).
    """

    query: str
    filters: List[str] = []

In [6]:
# Helper to compute cosine similarity
def compute_similarity(query, corpus):
    """Score each document in ``corpus`` against ``query`` with TF-IDF cosine similarity.

    A fresh ``TfidfVectorizer`` is fitted on ``corpus`` on every call, the
    query is transformed with that same vocabulary, and the cosine similarity
    between the query vector and each document vector is returned.

    Args:
        query: The search text.
        corpus: Iterable of document strings.

    Returns:
        A 1-D NumPy float array with one score per document, in corpus order.
        The array is empty when ``corpus`` is empty, and all zeros when the
        corpus yields no vocabulary (e.g. every document is blank or consists
        only of tokens the vectorizer discards).

    Raises:
        ValueError: Re-raised from the vectorizer for any reason other than
            an empty vocabulary.
    """
    documents = list(corpus)

    # Nothing to score: fitting a vectorizer on zero documents would raise.
    if not documents:
        return np.zeros(0)

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError as exc:
        # scikit-learn reports a corpus without any usable token as
        # "empty vocabulary; ..."; no document can match then, so every score
        # is zero. Any other ValueError is a genuine problem and propagates.
        if "empty vocabulary" not in str(exc):
            raise
        return np.zeros(len(documents))

    query_vector = vectorizer.transform([query])
    return cosine_similarity(query_vector, tfidf_matrix).flatten()

In [7]:
@app.post("/search")
def search_news(search_query: SearchQuery):
    """Return the ten articles most similar to the query, optionally filtered by category.

    Each entry in ``search_query.filters`` is applied in turn as a
    case-insensitive literal substring match on the "Category" column, so an
    article must match every filter to be kept. The remaining articles are
    ranked by the TF-IDF cosine similarity between their "Content" and the
    query text.

    Args:
        search_query: Request body carrying the query text and filter list.

    Returns:
        A dict ``{"results": [...]}`` holding up to ten article records (all
        original columns plus a ``similarity`` score), best match first. The
        list is empty when no article survives the filters.
    """
    query = search_query.query
    filters = search_query.filters

    # Apply filters. regex=False treats each client-supplied filter as plain
    # text: a value such as "c++" would otherwise be an invalid pattern, and a
    # crafted one could be very expensive to evaluate.
    filtered_data = news_data
    for category_filter in filters or []:
        matches = filtered_data["Category"].str.contains(
            category_filter, case=False, regex=False
        )
        filtered_data = filtered_data[matches]

    # No candidates left: answer with an empty list rather than trying to fit
    # a vectorizer on an empty corpus, which raises.
    if filtered_data.empty:
        return {"results": []}

    # Compute relevance. assign() builds a new frame, so neither the shared
    # module-level news_data (which filtered_data aliases when there are no
    # filters) nor a filtered slice of it is modified by a request.
    corpus = filtered_data["Content"].tolist()
    scored = filtered_data.assign(similarity=compute_similarity(query, corpus))

    # Top 10 results
    results = (
        scored.sort_values(by="similarity", ascending=False)
        .head(10)
        .to_dict(orient="records")
    )
    return {"results": results}