In [1]:
import re
import urllib.parse
from ddgs import DDGS          # package name is 'ddgs' (duckduckgo_search renamed)
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import time

In [2]:
pip install ddgs requests beautifulsoup4 sentence-transformers numpy 

Note: you may need to restart the kernel to use updated packages.


In [3]:
import re
import urllib.parse
from ddgs import DDGS          # package name is 'ddgs' (duckduckgo_search renamed)
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import time

In [4]:
SEARCH_RESULTS = 6        # How many URLs to check
PASSAGES_PER_PAGE = 4     # How many passages to pull from each URL
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Fast, high-quality model
TOP_PASSAGES = 5          # How many relevant passages to use for the summary
SUMMARY_SENTENCES = 3     # How many sentences in the final summary
TIMEOUT = 8               # How long to wait for a webpage to load

In [5]:
def unwrap_ddg(url):
    """If DuckDuckGo returns a redirect wrapper, extract the real URL."""
    try:
        parsed = urllib.parse.urlparse(url)
        if "duckduckgo.com" in parsed.netloc:
            qs = urllib.parse.parse_qs(parsed.query)
            uddg = qs.get("uddg")
            if uddg:
                return urllib.parse.unquote(uddg[0])
    except Exception:
        pass
    return url

def search_web(query, max_results=SEARCH_RESULTS):
    """Search the web and return a list of URLs."""
    urls = []
    with DDGS() as ddgs:
        for r in ddgs.text(query, max_results=max_results):
            url = r.get("href") or r.get("url")
            if not url:
                continue
            url = unwrap_ddg(url) # Clean up DDG redirect links
            urls.append(url)
    return urls

In [6]:
def fetch_text(url, timeout=TIMEOUT):
    """Fetch and clean text content from a URL."""
    headers = {"User-Agent": "Mozilla/5.0 (research-agent)"}
    try:
        r = requests.get(url, timeout=timeout, headers=headers, allow_redirects=True)
        if r.status_code != 200:
            return ""
        
        soup = BeautifulSoup(r.text, "html.parser")

        # Remove noisy tags
        for tag in soup(["script", "style", "noscript", "header", "footer", "svg", "iframe", "nav", "aside"]):
            tag.extract()

        # Try paragraphs first
        paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
        text = " ".join([p for p in paragraphs if p])

        # If empty, fall back to divs with a lot of text
        if not text.strip():
            divs = [d.get_text(" ", strip=True) for d in soup.find_all("div")]
            div_texts = [d for d in divs if len(d.split()) > 20]  # skip tiny divs
            text = " ".join(div_texts)

        # If still empty, try meta description or title
        if not text.strip():
            meta = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property": "og:description"})
            if meta and meta.get("content"):
                text = meta["content"].strip()
            elif soup.title and soup.title.string:
                text = soup.title.string.strip()

        return re.sub(r"\s+", " ", text).strip()
        
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return ""


In [7]:
def chunk_passages(text, max_words=120):
    """Split long text into smaller passages."""
    words = text.split()
    if not words:
        return []
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i : i + max_words]
        chunks.append(" ".join(chunk))
        i += max_words
    return chunks

def split_sentences(text):
    """A simple sentence splitter."""
    parts = re.split(r'(?<=[.!?])\s+', text)
    return [p.strip() for p in parts if p.strip()]
class ShortResearchAgent:
    def __init__(self, embed_model=EMBEDDING_MODEL):
        print(f"Loading embedder: {embed_model}...")
        self.embedder = SentenceTransformer(embed_model)

    def run(self, query):
        start = time.time()
        out = {
            "query": query,
            "passages": [],
            "summary": "",
            "time": 0.0
        }

        # --- 1. Search ---
        urls = search_web(query)
        print(f"Found {len(urls)} urls.")

        # --- 2. Fetch & Chunk ---
        docs = []
        for u in urls:
            txt = fetch_text(u)
            if not txt:
                print(f"No text fetched from {u}")
                continue
            chunks = chunk_passages(txt, max_words=120)
            for c in chunks[:PASSAGES_PER_PAGE]:
                docs.append({"url": u, "passage": c})

        if not docs:
            print("No documents could be fetched. Returning empty result.")
            out["time"] = time.time() - start
            return out

        # --- 3. Embed passages ---
        texts = [d["passage"] for d in docs]
        emb_texts = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
        q_emb = self.embedder.encode([query], convert_to_numpy=True)[0]

        # --- 4. Rank passages ---
        def cosine(a, b):
            return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)

        sims = [cosine(e, q_emb) for e in emb_texts]
        top_idx = np.argsort(sims)[::-1][:TOP_PASSAGES]
        top_passages = [{"url": docs[i]["url"], "passage": docs[i]["passage"], "score": float(sims[i])} for i in top_idx]
        out["passages"] = top_passages

        # --- 5. Extractive Summary ---
        sentences = []
        for tp in top_passages:
            for s in split_sentences(tp["passage"]):
                sentences.append({"sent": s, "url": tp["url"]})

        if sentences:
            sent_texts = [s["sent"] for s in sentences]
            sent_embs = self.embedder.encode(sent_texts, convert_to_numpy=True, show_progress_bar=False)
            sent_sims = [cosine(e, q_emb) for e in sent_embs]

            top_sent_idx = np.argsort(sent_sims)[::-1][:SUMMARY_SENTENCES]
            chosen = [sentences[idx] for idx in top_sent_idx]

            # De-duplicate
            seen = set()
            lines = []
            for s in chosen:
                key = s["sent"].lower()[:80]
                if key in seen:
                    continue
                seen.add(key)
                lines.append(f"{s['sent']} (Source: {s['url']})")
            out["summary"] = " ".join(lines)
        else:
            out["summary"] = "No summary could be generated."

        out["time"] = time.time() - start
        return out


In [8]:
if __name__ == "__main__":
    agent = ShortResearchAgent()
    q = "What are the new rules on H1B?"
    
    print(f"Running query: {q}\n")
    out = agent.run(q)

    # ---- Debugging ----
    print("DEBUG OUT:", out)
    print("TYPE:", type(out))
    # -------------------

    if not out:
        print("\nAgent returned None. No results.")
    elif isinstance(out, dict) and "passages" in out:
        print("\nTop passages:")
        for p in out["passages"]:
            print(f"- score {p['score']:.3f} src {p['url']}\n  {p['passage'][:200]}...\n")

        print("--- Extractive summary ---")
        print(out.get("summary", "No summary returned."))
        print("--------------------------")
        print(f"\nDone in {out.get('time', -1):.1f}s")
    else:
        print("\nUnexpected agent output format:")
        print(out)


Loading embedder: sentence-transformers/all-MiniLM-L6-v2...
Running query: What are the new rules on H1B?

Found 6 urls.
DEBUG OUT: {'query': 'What are the new rules on H1B?', 'passages': [{'url': 'https://www.uscis.gov/working-in-the-united-states/temporary-workers/h-1b-specialty-occupations/h-1b-electronic-registration-process', 'passage': 'selection process to select unique beneficiaries based on properly submitted electronic registrations. If we select the unique beneficiary, then each registrant that registered for that beneficiary receives a registration selection notice and may file an H-1B cap-subject petition on their behalf. Selections take place after the initial registration period closes, so you do not need to register on the day the initial registration period opens. You can only file an H-1B cap-subject petition if you receive a selection notice for the beneficiary of the petition. FY 2026 H-1B Cap Process Update We selected enough unique beneficiaries during the initial