<a href="https://colab.research.google.com/github/varun-k1601/kautilya-OA/blob/main/Kautilya_Task_2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers scikit-learn python-dateutil




In [None]:
%%writefile narrative_builder.py
import json
import argparse
from typing import List, Dict, Any, Tuple
from collections import Counter, defaultdict

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from dateutil import parser as date_parser


def load_articles(path: str) -> List[Dict[str, Any]]:
    """
    Load the news dataset stored as JSON at ``path``.

    Handles:
      - a top-level JSON array, used directly as the candidate list
      - a top-level JSON object: the first well-known container key whose
        value is a list is used; if that yields nothing, every list value
        is flattened together and every dict value is taken as one record

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        The candidate records that are dicts; anything else is dropped.
        A top-level scalar yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    container_keys = ("articles", "docs", "data", "items", "results", "stories", "news")

    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    candidates: List[Any] = []

    if isinstance(payload, list):
        candidates = payload
    elif isinstance(payload, dict):
        for name in container_keys:
            maybe_list = payload.get(name)
            if isinstance(maybe_list, list):
                candidates = maybe_list
                break

        if not candidates:
            # Collect into a fresh list. Reusing an (empty) list that lives
            # inside ``payload`` would make the loop below extend that list
            # with itself once it is reached, duplicating every record.
            candidates = []
            for value in payload.values():
                if isinstance(value, list):
                    candidates.extend(value)
                elif isinstance(value, dict):
                    candidates.append(value)

    # Only dict-shaped records are usable downstream.
    return [entry for entry in candidates if isinstance(entry, dict)]


def filter_by_source_rating(articles: List[Dict[str, Any]], min_rating: float = 8.0):
    """
    Keep the articles whose ``source_rating`` is strictly above ``min_rating``.

    Articles with a missing rating, or a rating that cannot be converted
    to ``float``, are dropped. Input order is preserved.
    """

    def _rated_above(article: Dict[str, Any]) -> bool:
        raw = article.get("source_rating")
        if raw is None:
            return False
        try:
            return float(raw) > min_rating
        except (ValueError, TypeError):
            # Non-numeric rating: treat the article as unrated.
            return False

    return [article for article in articles if _rated_above(article)]


def build_article_text(article: Dict[str, Any]) -> str:
    """
    Join an article's title, description and content into one text blob.

    Each field is stringified (falsy values become ""), fields that are
    empty or whitespace-only are skipped, and the rest are joined with
    newlines in title / description / content order.
    """
    pieces = []
    for field in ("title", "description", "content"):
        text = str(article.get(field, "") or "")
        if text.strip():
            pieces.append(text)
    return "\n".join(pieces)


def embed_articles(model, articles: List[Dict[str, Any]]) -> np.ndarray:
    """
    Encode every article's combined text with ``model``.

    The text for each article comes from ``build_article_text``. The
    result of ``model.encode`` is converted to a float32 NumPy array.
    """
    corpus = []
    for article in articles:
        corpus.append(build_article_text(article))

    vectors = model.encode(corpus, show_progress_bar=True)
    as_array = np.array(vectors)
    return as_array.astype("float32")


def select_relevant_articles(
    model,
    article_embeddings: np.ndarray,
    articles: List[Dict[str, Any]],
    topic: str,
    max_articles: int = 100,
    similarity_threshold: float = 0.35,
) -> Tuple[List[Dict[str, Any]], np.ndarray]:
    """
    Pick the articles most similar to ``topic``.

    The topic is encoded with ``model`` and compared to every row of
    ``article_embeddings`` by cosine similarity. An article is selected
    only if its similarity is at least ``similarity_threshold``, and at
    most ``max_articles`` articles are returned, best match first.

    Args:
        model: Object exposing ``encode(list_of_str)``; presumably a
            SentenceTransformer - confirm against the caller.
        article_embeddings: One embedding row per entry of ``articles``.
        articles: The article records, aligned with ``article_embeddings``.
        topic: Free-text topic to match against.
        max_articles: Hard upper bound on the number of results. A value
            of zero or less yields an empty selection.
        similarity_threshold: Minimum cosine similarity for inclusion.

    Returns:
        ``(chosen_articles, chosen_embeddings)``, row-aligned and ordered
        by decreasing similarity. When nothing is selected the embeddings
        array has zero rows.
    """
    if len(articles) == 0 or max_articles <= 0:
        # Nothing can be selected, so skip encoding the topic entirely.
        has_columns = article_embeddings.ndim == 2 and article_embeddings.size
        dim = article_embeddings.shape[1] if has_columns else 0
        return [], np.zeros((0, dim), dtype="float32")

    topic_emb = np.asarray(model.encode([topic])).astype("float32")
    sims = cosine_similarity(topic_emb, article_embeddings)[0]

    # Rank best-first, drop everything under the threshold, then cap.
    # The threshold and the cap are independent limits: failing either
    # one excludes an article.
    ranked = np.argsort(-sims)
    relevant = ranked[sims[ranked] >= similarity_threshold]
    chosen_idxs = relevant[:max_articles]

    chosen_articles = [articles[i] for i in chosen_idxs]
    chosen_embeddings = article_embeddings[chosen_idxs]
    return chosen_articles, chosen_embeddings


def parse_date(article: Dict[str, Any]):
    """
    Return the first parseable date found on ``article``, or ``None``.

    The fields ``date``, ``published_at``, ``published`` and ``time`` are
    tried in that order. Falsy values are skipped, and a value that fails
    to parse falls through to the next field.
    """
    candidate_fields = ("date", "published_at", "published", "time")
    for raw in (article.get(field) for field in candidate_fields):
        if raw:
            try:
                return date_parser.parse(str(raw))
            except Exception:
                # Unparseable value: try the next candidate field.
                pass
    return None


def build_timeline(articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Build a chronologically ordered timeline from ``articles``.

    Each entry carries ``date`` (ISO-8601 string or ``None``),
    ``headline``, ``url`` (from ``url`` or ``link``) and
    ``why_it_matters`` (the article's ``summary`` when present, otherwise
    a generic sentence built from the first 80 characters of the title).

    Entries are ordered by the actual instant they denote rather than by
    the text of the ISO string, so timestamps with different UTC offsets
    are placed correctly. Timestamps without an offset are treated as UTC
    for ordering only; the emitted ``date`` string is left untouched.
    Undated entries come last. Ties keep their input order.
    """
    # Local import: the module's import block does not provide ``timezone``.
    from datetime import timezone

    def _as_utc(moment):
        """Return ``moment`` as an aware UTC datetime usable as a sort key."""
        if moment.tzinfo is None or moment.utcoffset() is None:
            return moment.replace(tzinfo=timezone.utc)
        try:
            return moment.astimezone(timezone.utc)
        except (OverflowError, ValueError):
            # Conversion can overflow at the extremes of the datetime range.
            return moment.replace(tzinfo=timezone.utc)

    dated = []
    undated = []
    for a in articles:
        dt = parse_date(a)
        title = str(a.get("title", "") or "")
        url = str(a.get("url", "") or a.get("link", "") or "")

        if "summary" in a and a["summary"]:
            why = str(a["summary"])
        else:
            why = f"This article contributes context or new details about '{title[:80]}'."

        entry = {
            "date": dt.isoformat() if dt else None,
            "headline": title,
            "url": url,
            "why_it_matters": why,
        }
        if dt is None:
            undated.append(entry)
        else:
            dated.append((_as_utc(dt), entry))

    # list.sort is stable, so equal instants keep their original order.
    dated.sort(key=lambda pair: pair[0])
    return [entry for _, entry in dated] + undated


def build_clusters(
    embeddings: np.ndarray,
    articles: List[Dict[str, Any]],
    n_clusters: int = 4,
) -> List[Dict[str, Any]]:
    """
    Group articles into thematic clusters with KMeans over ``embeddings``.

    The cluster count is capped at the number of articles. With no
    articles the result is empty; with an effective count of one or less
    a single "Main Theme" cluster holding every article index is returned.

    Each cluster is a dict with ``cluster_id``, ``label`` (up to five of
    the most frequent title words longer than three characters, joined by
    " / ") and ``articles`` (indices into ``articles``). Clusters appear
    in the order their first member is encountered.
    """
    total = len(articles)
    if total == 0:
        return []

    k = min(n_clusters, total)
    if k <= 1:
        return [{"cluster_id": 0, "label": "Main Theme", "articles": list(range(total))}]

    assignments = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)

    # cluster id -> member indices, in first-seen order
    members: Dict[int, List[int]] = {}
    for position, cluster in enumerate(assignments):
        members.setdefault(int(cluster), []).append(position)

    punctuation = ".,;:!?'\"()[]{}"
    results = []
    for cluster_id, positions in members.items():
        tally = Counter()
        for position in positions:
            headline = str(articles[position].get("title", "") or "")
            for token in headline.split():
                token = token.strip(punctuation).lower()
                if len(token) > 3:
                    tally[token] += 1

        top_words = [word for word, _ in tally.most_common(5)]
        if top_words:
            label_text = " / ".join(top_words)
        else:
            label_text = "Cluster " + str(cluster_id)

        results.append(
            {
                "cluster_id": cluster_id,
                "label": label_text,
                "articles": positions,
            }
        )

    return results


def build_graph(articles: List[Dict[str, Any]], timeline: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build a node/edge graph relating the selected articles.

    Nodes: one per article, ``{"id": <index into articles>, "title": ...}``.

    Edges:
      - ``builds_on``: links consecutive timeline entries. Each timeline
        entry is matched to an article by headline; when several articles
        share a headline they are handed out one at a time in article
        order, so every article is linked at most once and no edge points
        from a node to itself. Entries with no remaining match are skipped.
      - ``adds_context``: links the first article of each group of
        articles sharing the same first 40 title characters to the others
        in that group. Articles with a blank title are not grouped.

    Args:
        articles: The selected article records.
        timeline: Timeline entries, each with a ``headline`` key.

    Returns:
        ``{"nodes": [...], "edges": [...]}`` with ``builds_on`` edges
        listed before ``adds_context`` edges.
    """
    titles = [str(a.get("title", "") or "") for a in articles]
    nodes = [{"id": i, "title": title} for i, title in enumerate(titles)]

    # headline -> iterator over the not-yet-claimed article indices
    positions_by_title = defaultdict(list)
    for i, title in enumerate(titles):
        positions_by_title[title].append(i)
    unclaimed = {title: iter(positions) for title, positions in positions_by_title.items()}

    timeline_idxs = []
    for entry in timeline:
        idx = next(unclaimed.get(entry["headline"], iter(())), None)
        if idx is not None:
            timeline_idxs.append(idx)

    edges = [
        {"source": src, "target": tgt, "relation": "builds_on"}
        for src, tgt in zip(timeline_idxs, timeline_idxs[1:])
    ]

    prefix_map = defaultdict(list)
    for i, title in enumerate(titles):
        prefix = title[:40]
        # A blank prefix says nothing about shared subject matter.
        if prefix.strip():
            prefix_map[prefix].append(i)

    for idxs in prefix_map.values():
        base = idxs[0]
        for other in idxs[1:]:
            edges.append(
                {
                    "source": base,
                    "target": other,
                    "relation": "adds_context",
                }
            )

    return {"nodes": nodes, "edges": edges}


def build_narrative_summary(topic: str, articles: List[Dict[str, Any]]) -> str:
    """
    Produce a short, single-string summary of the narrative.

    With no articles a fixed "nothing found" sentence is returned.
    Otherwise the summary names the topic and the article count, lists
    the non-empty titles among the first eight articles as "- title"
    items, and ends with a closing sentence. All parts are joined with
    single spaces.
    """
    if not articles:
        return f"No high-rated articles found related to topic '{topic}'."

    headlines = (str(a.get("title", "") or "") for a in articles[:8])
    bullets = [f"- {headline}" for headline in headlines if headline]

    opening = f"This narrative explores the topic '{topic}' using {len(articles)} high-rated news articles."
    closing = (
        "Together, these articles provide a chronological and thematic view of how this topic evolved in the news."
    )

    pieces = [opening, "Key storylines include:", *bullets, closing]
    return " ".join(pieces)


def main():
    """
    Command-line entry point.

    Parses the CLI options, loads and rating-filters the dataset, embeds
    the remaining articles with the "all-MiniLM-L6-v2" SentenceTransformer,
    selects the articles relevant to the topic, and prints a JSON document
    with the narrative summary, timeline, clusters and graph to stdout.
    """
    cli = argparse.ArgumentParser()
    option_table = (
        ("--data_path", {"type": str, "required": True}),
        ("--topic", {"type": str, "required": True}),
        ("--min_source_rating", {"type": float, "default": 8.0}),
        ("--max_articles", {"type": int, "default": 100}),
        ("--n_clusters", {"type": int, "default": 4}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    # Load, then keep only the sufficiently rated sources.
    everything = load_articles(opts.data_path)
    high_rated = filter_by_source_rating(everything, opts.min_source_rating)

    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = embed_articles(encoder, high_rated)

    picked, picked_vectors = select_relevant_articles(
        encoder,
        vectors,
        high_rated,
        topic=opts.topic,
        max_articles=opts.max_articles,
    )

    # The graph is derived from the timeline, so the timeline comes first.
    story_order = build_timeline(picked)
    themes = build_clusters(picked_vectors, picked, n_clusters=opts.n_clusters)
    links = build_graph(picked, story_order)
    overview = build_narrative_summary(opts.topic, picked)

    report = {
        "narrative_summary": overview,
        "timeline": story_order,
        "clusters": themes,
        "graph": links,
    }
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()


Overwriting narrative_builder.py


In [None]:
!python narrative_builder.py \
  --data_path "/content/14e9e4cc-9174-48da-ad02-abb1330b48fe.json" \
  --topic "AI regulation" \
  --min_source_rating 8.0 \
  --max_articles 80 \
  --n_clusters 4


2025-11-17 06:32:22.515827: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763361142.546440    2113 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763361142.558525    2113 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763361142.582195    2113 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763361142.582253    2113 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763361142.582257    2113 computation_placer.cc:177] computation placer alr

In [None]:
!python narrative_builder.py \
  --data_path "/content/14e9e4cc-9174-48da-ad02-abb1330b48fe.json" \
  --topic "Jubilee Hills elections" \
  --min_source_rating 8.0 \
  --max_articles 80 \
  --n_clusters 4

2025-11-17 07:08:28.829753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763363308.889741   10792 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763363308.908126   10792 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763363308.962660   10792 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763363308.962747   10792 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763363308.962753   10792 computation_placer.cc:177] computation placer alr

In [None]:
!python narrative_builder.py \
  --data_path "/content/14e9e4cc-9174-48da-ad02-abb1330b48fe.json" \
  --topic "Israel-Iran conflict" \
  --min_source_rating 8.0 \
  --max_articles 80 \
  --n_clusters 4

2025-11-17 07:10:33.260818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763363433.288541   11318 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763363433.297112   11318 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763363433.318319   11318 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763363433.318390   11318 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763363433.318394   11318 computation_placer.cc:177] computation placer alr