In [1]:
import json
import logging
import torch
import networkx as nx
import matplotlib.pyplot as plt
import  transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from builder import CausalGraphBuilder  # Imports the class from your uploaded builder.py

# Configure logging to see progress
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check for GPU (Strongly recommended for LLMs)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cpu":
    print("WARNING: Running an LLM on CPU will be very slow. A GPU is highly recommended.")

Using device: cpu


In [2]:
import os
import re
from openai import OpenAI

class OpenRouterLLM:
    def __init__(self, 
                 model: str = "deepseek/deepseek-r1:free", 
                 api_key: str = None, 
                 site_url: str = None, 
                 app_name: str = None):
        """
        Initialize the OpenRouter/DeepSeek client.
        
        Args:
            model: The OpenRouter model ID (default: deepseek/deepseek-r1:free)
            api_key: Your OpenRouter API key. If None, looks for OPENROUTER_API_KEY env var.
        """
        # Initialize the client pointing to OpenRouter
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key or os.getenv("OPENROUTER_API_KEY"),
        )
        self.model = model
        
        # Optional headers required by OpenRouter for rankings
        self.extra_headers = {}
        if site_url:
            self.extra_headers["HTTP-Referer"] = site_url
        if app_name:
            self.extra_headers["X-Title"] = app_name

    def generate(self, prompt: str, temperature: float = 0.1, json_mode: bool = False) -> str:
        """
        Generates text based on the prompt using OpenRouter. 
        Matches the signature required by builder.py.
        """
        try:
            # Prepare messages
            messages = [
                {
                    "role": "system", 
                    "content": "You are a specialized assistant that extracts causal graphs. You answer strictly in JSON."
                },
                {"role": "user", "content": prompt}
            ]

            # Make the API call
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                extra_headers=self.extra_headers,
                temperature=temperature,
                # Note: 'json_object' mode ensures valid JSON, but not all free models support it.
                # If the specific DeepSeek model supports it, uncomment the line below:
                # response_format={"type": "json_object"} if json_mode else None
            )

            response_content = completion.choices[0].message.content

            # --- DEEPSEEK R1 SPECIFIC CLEANING ---
            # DeepSeek R1 often includes "Chain of Thought" reasoning inside <think> tags.
            # We must remove this, otherwise the JSON parser in builder.py will fail.
            cleaned_content = re.sub(r'<think>.*?</think>', '', response_content, flags=re.DOTALL).strip()
            
            return cleaned_content

        except Exception as e:
            print(f"Error generating with OpenRouter: {e}")
            return "[]"  # Return empty JSON array on failure to prevent crash

In [4]:
input_file = "wiki_math_knowledge_base_api.json"

try:
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract just the raw text content for the builder
    # The builder expects a list of strings
    documents = [item.get("raw_text", "") for item in data if item.get("raw_text")]

    print(f"Successfully loaded {len(documents)} documents.")
    # Preview the first document
    print(f"Preview (first 200 chars): {documents[0][:200]}...")

except FileNotFoundError:
    print(f"Error: Could not find '{input_file}'. Please make sure it is in the same folder.")

Successfully loaded 2041 documents.
Preview (first 200 chars): A triangle is a polygon with three corners and three sides, one of the basic shapes in geometry. The corners, also called vertices, are zero-dimensional points while the sides connecting them, also ca...


In [9]:
from builder import CausalGraphBuilder

# 1. Initialize the interface
# Make sure you have set os.environ["OPENROUTER_API_KEY"] or pass the key directly
llm = OpenRouterLLM(
    api_key="",  # <--- FIXED: Added missing comma here
    model="deepseek/deepseek-r1-distill-llama-70b:free" 
)

# 2. Pass it to the builder
builder = CausalGraphBuilder(
    extractor_method="llm", 
    llm_interface=llm
)

# 3. Process your text
# FIXED: You previously passed '[input_file]' (the filename string). 
# You must pass 'documents', which is the list of text strings you loaded in Cell 4.
# We slice [:5] to test first, so you don't hit Rate Limits (429) immediately.
print(f"Processing 5 out of {len(documents)} documents for testing...")
builder.index_documents(documents)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
# 1. Save internal JSON format
builder.save("causal_math_graph_llm_2.json")

# 2. Export to GraphML (Standard format for network analysis tools)
G = builder.get_graph()
nx.write_graphml(G, "causal_math_graph_llm_2.graphml")

print("Files saved: 'causal_math_graph_llm_2.json' and 'causal_math_graph_llm.graphml'")

In [None]:
# Generate interactive HTML visualization
html_path = builder.visualize_graph(
    output_path="llm_graph_viz.html",
    format="html",
    title="Math Knowledge Graph (LLM Generated)"
)
print(f"Interactive visualization saved to: {html_path}")

# Display static plot in notebook
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(G, k=0.15, iterations=20, seed=42)

# Draw nodes and edges
nx.draw_networkx_nodes(G, pos, node_size=100, node_color='skyblue', alpha=0.7)
nx.draw_networkx_edges(G, pos, alpha=0.3, arrows=True)

# Draw labels for top nodes (to avoid clutter)
# Only label nodes with high degree
degrees = dict(G.degree())
top_nodes = {n for n, d in degrees.items() if d > 1}
labels = {n: n for n in top_nodes}

nx.draw_networkx_labels(G, pos, labels=labels, font_size=8, font_color='black')

plt.title("Causal Graph Preview (LLM Extracted)")
plt.axis('off')
plt.show()