# Bitcoin News Summarization (LangChain + RAG)

This notebook fetches Bitcoin news from the last 24 hours, cross-checks the headlines with a web-search verification agent, processes the remaining articles through a RAG pipeline built with LangChain, and outputs an LLM-generated summary of the most important factors driving Bitcoin's price movement.

## 1. Setup & Install Dependencies

In [None]:
# Install required packages
!pip install -q langchain langchain-ollama langchain-community chromadb requests duckduckgo-search langgraph

In [None]:
import os
import json
import hashlib
import time
import requests
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
from datetime import datetime, timedelta, timezone
from IPython.display import display, Markdown

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.tools import Tool
from langchain.agents import create_react_agent, AgentExecutor

## 2. Configuration

In [None]:
# --- Configuration ---

# News window (in hours) and number of documents retrieved for summarization.
NEWS_LOOKBACK_HOURS = 24
TOP_K_ARTICLES = 10

# Local Ollama server and the models it serves (no API key needed for the LLM).
OLLAMA_BASE_URL = "http://localhost:11434"
LLM_MODEL = "qwen2.5:7b"
EMBEDDING_MODEL = "nomic-embed-text"

# Optional NewsAPI key read from the environment; None when the variable is unset,
# in which case the notebook falls back to CryptoPanic.
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY")

# Directory for the verification agent's cache files (created if missing).
VERIFICATION_CACHE_DIR = Path("./cache")
VERIFICATION_CACHE_DIR.mkdir(exist_ok=True)

# Echo the effective settings, one per line.
print(
    "Configuration loaded.",
    f"  News lookback: {NEWS_LOOKBACK_HOURS} hours",
    f"  Top-K articles for retrieval: {TOP_K_ARTICLES}",
    f"  LLM: {LLM_MODEL} (via Ollama)",
    f"  Embeddings: {EMBEDDING_MODEL} (via Ollama)",
    f"  NEWSAPI_KEY: {'set' if NEWSAPI_KEY else 'not set (will use CryptoPanic fallback)'}",
    f"  Verification cache: {VERIFICATION_CACHE_DIR.resolve()}",
    sep="\n",
)

## 3. Data Collection — News APIs

In [None]:
def fetch_newsapi_articles(api_key: str, lookback_hours: int) -> list[dict]:
    """Fetch recent Bitcoin-related news from NewsAPI.org.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.
        lookback_hours: Only articles published within this many hours
            before now (UTC) are requested.

    Returns:
        One dict per article with the keys ``title``, ``description``,
        ``content``, ``source``, ``publishedAt`` and ``url``. Fields that are
        missing or null in the response become ``""`` (``"Unknown"`` for the
        source name), so downstream code never sees ``None``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    from_time = datetime.now(timezone.utc) - timedelta(hours=lookback_hours)
    url = "https://newsapi.org/v2/everything"
    params = {
        "q": "bitcoin OR BTC OR cryptocurrency",
        "from": from_time.strftime("%Y-%m-%dT%H:%M:%S"),
        "sortBy": "publishedAt",
        "language": "en",
        "pageSize": 50,
        "apiKey": api_key,
    }
    resp = requests.get(url, params=params, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    articles = []
    for art in data.get("articles") or []:
        # dict.get(key, default) only applies the default when the key is
        # absent; a JSON null arrives as None, so coalesce with `or` instead.
        description = art.get("description") or ""
        source = art.get("source") or {}
        articles.append({
            "title": art.get("title") or "",
            "description": description,
            # Fall back to the description when the article has no body text.
            "content": art.get("content") or description,
            "source": source.get("name") or "Unknown",
            "publishedAt": art.get("publishedAt") or "",
            "url": art.get("url") or "",
        })
    return articles


def fetch_cryptopanic_articles(lookback_hours: int) -> list[dict]:
    """Fetch recent BTC news posts from CryptoPanic.

    The request is sent without an auth token.
    NOTE(review): confirm the endpoint still allows anonymous access.

    Args:
        lookback_hours: Posts with a parseable ``published_at`` older than
            this many hours before now (UTC) are skipped. Posts whose
            timestamp is missing or unparseable are kept.

    Returns:
        One dict per post with the keys ``title``, ``description``,
        ``content``, ``source``, ``publishedAt`` and ``url``. Only the post
        title is used, so it fills the first three fields.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://cryptopanic.com/api/free/v1/posts/"
    params = {
        "currencies": "BTC",
        "kind": "news",
        "public": "true",
    }
    resp = requests.get(url, params=params, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    cutoff = datetime.now(timezone.utc) - timedelta(hours=lookback_hours)
    articles = []
    for post in data.get("results") or []:
        published = post.get("published_at") or ""
        if published:
            try:
                # fromisoformat() on older Pythons rejects a trailing "Z".
                pub_dt = datetime.fromisoformat(published.replace("Z", "+00:00"))
            except ValueError:
                # One malformed timestamp must not abort the whole fetch;
                # treat it like a post that carries no timestamp at all.
                pub_dt = None
            if pub_dt is not None:
                if pub_dt.tzinfo is None:
                    # Naive timestamps cannot be compared with the aware
                    # cutoff; assume UTC.
                    pub_dt = pub_dt.replace(tzinfo=timezone.utc)
                if pub_dt < cutoff:
                    continue

        title = post.get("title") or ""
        source = post.get("source") or {}
        articles.append({
            "title": title,
            "description": title,
            "content": title,
            "source": source.get("title") or "CryptoPanic",
            "publishedAt": published,
            "url": post.get("url") or "",
        })
    return articles

In [None]:
# Fetch articles: NewsAPI is the primary source, CryptoPanic the fallback.
articles = []

# Primary source, attempted only when an API key is configured.
if NEWSAPI_KEY:
    try:
        articles = fetch_newsapi_articles(NEWSAPI_KEY, NEWS_LOOKBACK_HOURS)
    except Exception as err:
        print(f"NewsAPI failed: {err}. Trying CryptoPanic fallback...")
    else:
        print(f"Fetched {len(articles)} articles from NewsAPI.")

# Fallback: used when NewsAPI was skipped, failed, or returned nothing.
if not articles:
    try:
        articles = fetch_cryptopanic_articles(NEWS_LOOKBACK_HOURS)
    except Exception as err:
        print(f"CryptoPanic also failed: {err}")
    else:
        print(f"Fetched {len(articles)} articles from CryptoPanic.")

# Nothing downstream can work without articles, so stop here.
if not articles:
    raise RuntimeError("No articles fetched from any source. Check API keys and connectivity.")

# Preview the first rows; the trailing expression is rendered by the notebook.
df_articles = pd.DataFrame(articles)
print(f"\nTotal articles: {len(df_articles)}")
df_articles[["title", "source", "publishedAt"]].head(10)

## 3.5 News Verification Agent

Uses a ReAct agent with DuckDuckGo search to cross-validate the fetched headlines against other web sources. Each headline is labelled "verified", "uncertain", or "unverified"; only articles labelled "unverified" are dropped before entering the RAG pipeline. Results are cached once per day to avoid redundant LLM and search calls.

In [None]:
# --- Verification Agent Setup ---
# Builds a ReAct agent (LLM + one web-search tool) and wraps it in an
# AgentExecutor. The executor is invoked once per headline by the next cell.

# Initialize LLM for the agent (same Ollama model as the summarizer;
# temperature=0.1 — presumably chosen for near-deterministic verdicts)
verification_llm = ChatOllama(
    model=LLM_MODEL,
    base_url=OLLAMA_BASE_URL,
    temperature=0.1,
)

# Web search tool (free, no API key required)
search_tool = DuckDuckGoSearchRun()

# The only tool exposed to the agent. The `name` is substituted into the prompt
# via {tool_names}, and the agent must emit it verbatim in its "Action:" line.
tools = [
    Tool(
        name="Web_Search",
        func=search_tool.run,
        description="Search the web using DuckDuckGo to find corroborating news sources for a headline.",
    ),
]

# ReAct agent prompt for fact-checking.
# Placeholders: {tools} and {tool_names} describe the tools, {input} is the
# headline, and {agent_scratchpad} carries the Thought/Action/Observation
# history so far. The Final Answer is requested as a JSON object with
# "status" and "reason" keys.
VERIFICATION_AGENT_PROMPT = PromptTemplate.from_template(
    """You are a fact-checking assistant. Your job is to verify whether a news headline
is reported by other credible sources.

You have access to the following tools:

{tools}

Use the following format:

Question: the headline to verify
Thought: I need to search for this headline to see if other sources report it
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat at most 2 times)
Thought: I now have enough information to make a verdict
Final Answer: a JSON object with "status" and "reason" keys

The "status" must be one of:
- "verified": Multiple sources confirm this news
- "uncertain": Limited info, but no contradictions found
- "unverified": Cannot find corroboration or contradicts known facts

Question: {input}
Thought: {agent_scratchpad}"""
)

# Create the agent and the executor that runs its tool loop.
# max_iterations=3 caps the number of agent steps per headline.
# handle_parsing_errors=True: per the LangChain AgentExecutor docs, malformed
# LLM output is fed back to the agent instead of raising — TODO confirm for the
# installed version.
verification_agent = create_react_agent(verification_llm, tools, VERIFICATION_AGENT_PROMPT)
agent_executor = AgentExecutor(
    agent=verification_agent,
    tools=tools,
    verbose=False,
    max_iterations=3,
    handle_parsing_errors=True,
)

print("Verification agent initialized.")

In [None]:
# --- Run Verification (with daily cache) ---

# Statuses the downstream filter understands. Anything else the agent emits
# (different casing, invented labels) is coerced to "uncertain" so the article
# is kept rather than silently dropped.
VALID_VERDICT_STATUSES = ("verified", "uncertain", "unverified")


def parse_agent_verdict(output: str) -> dict:
    """Convert the agent's final answer into a ``{"status", "reason"}`` dict.

    The outermost ``{...}`` span of *output* is parsed as JSON. When there is
    no such span, it is not valid JSON, or it is not a JSON object, the
    verdict is "uncertain" with the first 200 characters of the raw output
    as the reason. The status is lower-cased and stripped; unknown statuses
    become "uncertain".
    """
    text = output if isinstance(output, str) else str(output)
    verdict = None
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        try:
            verdict = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            verdict = None
    if not isinstance(verdict, dict):
        return {"status": "uncertain", "reason": text[:200]}

    status = str(verdict.get("status", "uncertain")).strip().lower()
    if status not in VALID_VERDICT_STATUSES:
        status = "uncertain"
    return {
        "status": status,
        "reason": str(verdict.get("reason") or "No reason provided"),
    }


# One cache file per calendar day (local time of the machine running the notebook).
cache_file = VERIFICATION_CACHE_DIR / f"verified_articles_{datetime.now().strftime('%Y-%m-%d')}.json"

if cache_file.exists():
    # Cache hit: load previous results
    with open(cache_file, "r", encoding="utf-8") as f:
        verification_results = json.load(f)
    print(f"Loaded cached verification results from {cache_file.name}")
    print(f"  ({len(verification_results)} articles previously verified)")
else:
    # Cache miss: run verification agent
    print(f"No cache for today. Running verification agent on {len(articles)} articles...")
    verification_results = []
    batch_size = 5
    n_batches = (len(articles) - 1) // batch_size + 1

    for i in range(0, len(articles), batch_size):
        batch = articles[i:i + batch_size]
        print(f"  Verifying batch {i // batch_size + 1}/{n_batches}...")

        for art in batch:
            headline = art["title"]
            if not headline:
                # Nothing to search for; mark it so the filter drops it.
                verification_results.append({
                    "title": headline,
                    "status": "unverified",
                    "reason": "Empty headline",
                })
                continue

            try:
                result = agent_executor.invoke({"input": headline})
                verdict = parse_agent_verdict(result.get("output", ""))
            except Exception as e:
                # On agent failure, default to uncertain (don't drop the article)
                verdict = {
                    "status": "uncertain",
                    "reason": f"Agent error: {str(e)[:100]}",
                }

            verification_results.append({"title": headline, **verdict})

        # Brief pause between batches to avoid rate limits
        if i + batch_size < len(articles):
            time.sleep(2)

    print(f"Verification complete for {len(verification_results)} articles.")

In [None]:
# --- Filter Articles & Save Cache ---

# Statuses that let an article through to the RAG pipeline.
KEPT_STATUSES = ("verified", "uncertain")

# Build a lookup of verification status by title
status_by_title = {r["title"]: r["status"] for r in verification_results}

# Resolve a status for every *current* article. Titles absent from the results
# (possible when the results came from a cache written earlier today) default
# to "uncertain" and are therefore kept.
original_count = len(articles)
article_statuses = [status_by_title.get(art["title"], "uncertain") for art in articles]

# Filter: keep only verified or uncertain articles
articles = [
    art for art, status in zip(articles, article_statuses)
    if status in KEPT_STATUSES
]

# Count verdicts over the current article set so the three numbers always add
# up to original_count, even when the cached results cover a different batch.
n_verified = article_statuses.count("verified")
n_uncertain = article_statuses.count("uncertain")
n_unverified = original_count - len(articles)

# Save cache (only if we actually ran the agent, not loaded from cache)
if not cache_file.exists():
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(verification_results, f, indent=2)
    print(f"Cache saved to {cache_file.name}")

# Print summary
print("\nVerification Summary:")
print(f"  Verified:   {n_verified}/{original_count}")
print(f"  Uncertain:  {n_uncertain}/{original_count}")
print(f"  Unverified: {n_unverified}/{original_count} (dropped)")
print(f"\n  Articles passed to RAG pipeline: {len(articles)}/{original_count}")

## 4. BTC Price Context

In [None]:
# Fetch two days of hourly BTC-USD bars from yfinance so that a full 24-hour
# window is available even when the newest bar is still forming.
btc = yf.Ticker("BTC-USD")
btc_hist = btc.history(period="2d", interval="1h")

# Closing prices without gaps; fail with a clear message when nothing came back
# instead of an IndexError from iloc[-1].
closes = btc_hist["Close"].dropna()
if closes.empty:
    raise RuntimeError("No BTC-USD price data returned by yfinance. Check connectivity and retry.")

# Calculate metrics
current_price = closes.iloc[-1]
# With hourly bars the newest bar is iloc[-1], so the bar 24 hours earlier is
# iloc[-25] (iloc[-24] is only 23 hours back). Use the oldest bar if history is short.
price_24h_ago = closes.iloc[-25] if len(closes) >= 25 else closes.iloc[0]
change_pct = ((current_price - price_24h_ago) / price_24h_ago) * 100
# The 24h range comes from the intrabar High/Low columns, not from closing
# prices, which would understate the extremes.
high_24h = btc_hist["High"].tail(24).max()
low_24h = btc_hist["Low"].tail(24).min()

price_context = {
    "current_price": current_price,
    "change_pct": change_pct,
    "high_24h": high_24h,
    "low_24h": low_24h,
}

print(f"BTC Current Price: ${current_price:,.2f}")
print(f"24h Change: {change_pct:+.2f}%")
print(f"24h High: ${high_24h:,.2f}")
print(f"24h Low: ${low_24h:,.2f}")

## 5. RAG Pipeline

In [None]:
# Step 1: Convert articles to LangChain Document objects
# An empty article list (e.g. everything was dropped by verification) would
# only fail later inside the vector store, so stop early with a clear message.
if not articles:
    raise RuntimeError("No articles available for the RAG pipeline after verification.")

documents = []
for art in articles:
    # Coalesce missing/None fields to strings: the page text should not contain
    # a literal "None", and Chroma metadata values must not be None.
    title = art.get("title") or ""
    content = art.get("content") or ""
    metadata = {
        "source": art.get("source") or "Unknown",
        "publishedAt": art.get("publishedAt") or "",
        "url": art.get("url") or "",
        "title": title,
    }
    documents.append(Document(page_content=f"{title}\n\n{content}", metadata=metadata))

print(f"Created {len(documents)} Document objects.")

# Step 2: Text Splitting
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks = text_splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks.")

# Step 3: Embeddings + Vector Store (ChromaDB, ephemeral/in-memory)
embeddings = OllamaEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=OLLAMA_BASE_URL,
)
# A fresh collection name per run: without it, re-running this cell in the same
# kernel adds the chunks to the same default in-memory collection again and the
# retriever returns duplicates.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=f"btc_news_{time.time_ns()}",
)
# NOTE: _collection is a private attribute of the LangChain Chroma wrapper;
# used here only to report the stored vector count.
print(f"Vector store created with {vectorstore._collection.count()} vectors.")

# Step 4: Retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": TOP_K_ARTICLES},
)
print(f"Retriever configured (top-{TOP_K_ARTICLES} similarity search).")

## 6. LLM Summarization Chain

In [None]:
# Prompt for the analyst-style summary. {context} and {question} are filled by
# the chain at query time; the price placeholders are bound below.
SUMMARY_PROMPT_TEMPLATE = """You are a crypto market analyst. Based on the following news articles \
from the last 24 hours, provide:

1. **Market Summary**: 1-2 sentence overview of Bitcoin's movement
2. **Key Drivers**: Top 3-5 factors driving price (bullish/bearish)
3. **Sentiment**: Overall market sentiment (Bullish/Neutral/Bearish)
4. **Notable Events**: Any major events (regulatory, institutional, technical)
5. **Outlook**: Brief forward-looking view based on current news

Context: BTC is currently at ${price}, {change}% in last 24h (High: ${high}, Low: ${low}).

Articles:
{context}

Question: {question}
"""

# Local Ollama chat model used to write the summary.
llm = ChatOllama(
    base_url=OLLAMA_BASE_URL,
    model=LLM_MODEL,
    temperature=0.3,
)

# Bind the pre-formatted price figures now so only context/question stay open.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    partial_variables={
        "price": f"{current_price:,.2f}",
        "change": f"{change_pct:+.2f}",
        "high": f"{high_24h:,.2f}",
        "low": f"{low_24h:,.2f}",
    },
    template=SUMMARY_PROMPT_TEMPLATE,
)

# "stuff" chain: all retrieved chunks are placed into a single prompt.
# Source documents are returned so they can be listed alongside the summary.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

print("Summarization chain ready.")

In [None]:
# Ask the chain for the 24-hour summary and keep both the generated text and
# the retrieved chunks it was based on.
query = "Summarize the key Bitcoin news and market drivers from the last 24 hours."
result = qa_chain.invoke({"query": query})

summary_text, source_docs = result["result"], result["source_documents"]

## 7. Output Display

In [None]:
# Display formatted summary: a fixed heading followed by the LLM-generated
# summary text, both rendered as Markdown in the notebook output.
display(Markdown("# Bitcoin News Summary (Last 24 Hours)\n"))
display(Markdown(summary_text))

In [None]:
# Table of the articles behind the summary, one row per distinct title
# (several retrieved chunks can come from the same article).
display(Markdown("## Source Articles"))

source_data = []
seen_titles = set()
for meta in (doc.metadata for doc in source_docs):
    title = meta.get("title", "N/A")
    if title not in seen_titles:
        seen_titles.add(title)
        source_data.append({
            "Title": title,
            "Source": meta.get("source", "N/A"),
            # Keep only the first 16 characters of the timestamp string.
            "Published": meta.get("publishedAt", "N/A")[:16],
            "URL": meta.get("url", ""),
        })

df_sources = pd.DataFrame(source_data)
display(df_sources)

In [None]:
# Line chart of the last 24 hourly closes with the Low-High range shaded.
plot_data = btc_hist.tail(24)

fig, ax = plt.subplots(figsize=(12, 5))
ax.fill_between(plot_data.index, plot_data["Low"], plot_data["High"], alpha=0.1, color="orange")
ax.plot(plot_data.index, plot_data["Close"], color="orange", linewidth=2, label="BTC Price")

# Titles, axis labels and tick formatting.
ax.set_title("BTC-USD Last 24 Hours", fontsize=14, fontweight="bold")
ax.set(xlabel="Time (UTC)", ylabel="Price (USD)")
ax.xaxis.set_major_formatter(mdates.DateFormatter("%m/%d %H:%M"))
ax.grid(True, alpha=0.3)
ax.legend()

# Label the most recent point with the current price.
ax.annotate(
    f"${current_price:,.0f}",
    xy=(plot_data.index[-1], current_price),
    xytext=(10, 10),
    textcoords="offset points",
    color="darkorange",
    fontsize=11,
    fontweight="bold",
)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 8. Sentiment Analysis (Bonus)

In [None]:
# Classify each headline as Bullish/Neutral/Bearish using the LLM
SENTIMENT_PROMPT = """Classify the sentiment of each Bitcoin news headline below as \
exactly one of: Bullish, Neutral, or Bearish.

Return ONLY a JSON array of objects with "title" and "sentiment" keys.

Headlines:
{headlines}
"""

# Canonical spelling for the three expected labels, keyed by lower-case form.
CANONICAL_SENTIMENTS = {"bullish": "Bullish", "neutral": "Neutral", "bearish": "Bearish"}


def parse_sentiment_response(response_text: str) -> list[dict]:
    """Extract ``[{"title", "sentiment"}, ...]`` rows from the LLM reply.

    The span from the first ``[`` to the last ``]`` is parsed as JSON, which
    tolerates markdown code fences and prose around the array. Entries that
    are not objects or have no sentiment are skipped. Labels are normalised
    to Bullish/Neutral/Bearish when recognised and otherwise kept as given.

    Raises:
        ValueError: If no JSON array can be found or parsed
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    start, end = response_text.find("["), response_text.rfind("]")
    if start == -1 or end < start:
        raise ValueError("no JSON array found in response")
    parsed = json.loads(response_text[start:end + 1])
    if not isinstance(parsed, list):
        raise ValueError("response JSON is not an array")

    rows = []
    for item in parsed:
        if not isinstance(item, dict):
            continue
        label = str(item.get("sentiment") or "").strip()
        if not label:
            continue
        rows.append({
            "title": str(item.get("title") or ""),
            "sentiment": CANONICAL_SENTIMENTS.get(label.lower(), label),
        })
    return rows


# Take up to 20 headlines for classification
headlines_for_sentiment = [art["title"] for art in articles[:20] if art["title"]]
headlines_text = "\n".join(f"- {h}" for h in headlines_for_sentiment)

if not headlines_for_sentiment:
    # Nothing to classify: skip the LLM call rather than prompt with an empty list.
    print("No headlines available for sentiment classification.")
    df_sentiment = pd.DataFrame(columns=["title", "sentiment"])
else:
    sentiment_response = llm.invoke(SENTIMENT_PROMPT.format(headlines=headlines_text))

    # Parse sentiment results; on failure fall back to an empty frame that
    # still has both expected columns.
    try:
        sentiment_results = parse_sentiment_response(sentiment_response.content)
        df_sentiment = pd.DataFrame(sentiment_results, columns=["title", "sentiment"])
        print(f"Classified {len(df_sentiment)} headlines.")
    except ValueError as e:
        print(f"Failed to parse sentiment response: {e}")
        print("Raw response:", sentiment_response.content[:500])
        df_sentiment = pd.DataFrame(columns=["title", "sentiment"])

In [None]:
# Sentiment distribution pie chart plus the 24h price line tinted by the
# dominant sentiment.
# Guard on the column as well as emptiness: a non-empty frame whose rows lack a
# "sentiment" key would otherwise raise KeyError below.
has_sentiment = (
    "sentiment" in df_sentiment.columns
    and not df_sentiment["sentiment"].dropna().empty
)

if has_sentiment:
    # value_counts() sorts descending, so index[0] is the most frequent label.
    sentiment_counts = df_sentiment["sentiment"].value_counts()

    colors = {
        "Bullish": "#2ecc71",
        "Neutral": "#95a5a6",
        "Bearish": "#e74c3c",
    }
    # Labels outside the three expected ones get a neutral grey.
    pie_colors = [colors.get(s, "#bdc3c7") for s in sentiment_counts.index]

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Pie chart
    axes[0].pie(
        sentiment_counts.values,
        labels=sentiment_counts.index,
        colors=pie_colors,
        autopct="%1.0f%%",
        startangle=90,
        textprops={"fontsize": 12},
    )
    axes[0].set_title("News Sentiment Distribution", fontsize=13, fontweight="bold")

    # Price chart with sentiment overlay (plot_data is the 24-bar window
    # created by the price chart cell).
    axes[1].plot(plot_data.index, plot_data["Close"], color="orange", linewidth=2)
    axes[1].set_title("BTC Price with Sentiment Context", fontsize=13, fontweight="bold")
    axes[1].set_xlabel("Time (UTC)")
    axes[1].set_ylabel("Price (USD)")

    # Add sentiment annotation: shade the traded close range in the colour of
    # the dominant sentiment.
    dominant = sentiment_counts.index[0] if len(sentiment_counts) > 0 else "Neutral"
    bg_color = colors.get(dominant, "#bdc3c7")
    axes[1].axhspan(
        plot_data["Close"].min(), plot_data["Close"].max(),
        alpha=0.08, color=bg_color, label=f"Dominant: {dominant}"
    )
    axes[1].legend(fontsize=11)
    axes[1].xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Display sentiment table
    display(Markdown("### Headline Sentiments"))
    display(df_sentiment)
else:
    print("No sentiment data to display.")