In [10]:
import json
import os
import re
import textwrap

from langchain_google_genai import GoogleGenerativeAI
from tqdm import tqdm

# ============ CONFIG ============
file_path = r"C:\Users\vinays\Desktop\BANG-RAG\data\all_pdfs_text.txt"  # input text file
output_file = "triplets.json"  # output JSON file
# Secrets must never live in a notebook: the key is read from the environment.
# NOTE(review): any key that was previously committed in this cell should be
# treated as compromised and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )
max_chunk_chars = 6000     # upper bound on the size of each chunk sent to the model
max_input_chars = 100000   # only this many leading characters of the input are processed
TRIPLE_KEYS = ("subject", "relation", "object")
# =================================


def _normalize_triple(item):
    """Return a cleaned copy of one model-produced triple, or None if it is malformed.

    An item is malformed when it is not a dict, or when any of the
    subject/relation/object fields is missing, None, or blank. Valid fields
    are converted to whitespace-trimmed strings so that later steps can rely
    on string values; any extra keys on the item are kept unchanged.
    """
    if not isinstance(item, dict):
        return None
    fields = {}
    for key in TRIPLE_KEYS:
        value = item.get(key)
        if value is None:
            return None
        cleaned = str(value).strip()
        if not cleaned:
            return None
        fields[key] = cleaned
    return {**item, **fields}


# --- read the extracted text ---
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# textwrap.wrap splits on whitespace so chunks do not cut words in half.
chunks = textwrap.wrap(text[:max_input_chars], max_chunk_chars)
print(f"üìö Total chunks: {len(chunks)}")

# --- initialize Gemini model ---
llm = GoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=GEMINI_API_KEY)

all_triplets = []

# --- process each chunk ---
for i, chunk in enumerate(tqdm(chunks, desc="Extracting triples"), start=1):
    prompt = f"""
    You are an expert information extractor.

    Extract all factual triples (subject, relation, object) from the text below.
    Each triple must describe a concrete relationship or fact.
    Output strictly in valid JSON format (UTF-8), as a list of objects.
    
    Each object must have:
    - "subject": entity name (string)
    - "relation": relationship (string)
    - "object": related entity (string)

    Example:
    [
      {{"subject": "Bangalore", "relation": "is located in", "object": "Karnataka"}},
      {{"subject": "BBMP", "relation": "manages", "object": "Waste Management"}}
    ]

    Text:
    {chunk}
    """

    # Best effort by design: a failure on one chunk is reported and the loop
    # moves on, so a single bad response does not discard the whole run.
    try:
        resp = llm.invoke(prompt)
        content = str(resp).strip()

        # Try to parse JSON directly
        try:
            triples = json.loads(content)
        except json.JSONDecodeError:
            # The model may wrap the array in prose or code fences: take the
            # widest [...] span and parse that instead.
            json_match = re.search(r'\[.*\]', content, re.DOTALL)
            if json_match:
                try:
                    triples = json.loads(json_match.group())
                except json.JSONDecodeError as e:
                    print(f"‚ö†Ô∏è Chunk {i}: JSON parse error after cleanup: {e}")
                    continue
            else:
                print(f"‚ö†Ô∏è Chunk {i}: No valid JSON found, skipping.")
                continue

        # Validate and collect
        if isinstance(triples, list):
            # Keep only well-formed triples so the saved file always matches
            # the schema requested in the prompt.
            valid_triples = [
                cleaned
                for cleaned in (_normalize_triple(item) for item in triples)
                if cleaned is not None
            ]
            dropped = len(triples) - len(valid_triples)
            all_triplets.extend(valid_triples)
            print(f"‚úÖ Chunk {i}: Extracted {len(valid_triples)} triples.")
            if dropped:
                print(f"Chunk {i}: Dropped {dropped} malformed items.")
        else:
            print(f"‚ö†Ô∏è Chunk {i}: Output not a JSON list, skipping.")

    except Exception as e:
        print(f"‚ö†Ô∏è Error processing chunk {i}: {e}")

# --- save all triples ---
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_triplets, f, indent=2, ensure_ascii=False)

print(f"\n‚úÖ Successfully saved {len(all_triplets)} triples to {output_file}")


üìö Total chunks: 17


Extracting triples:   6%|‚ñå         | 1/17 [01:53<30:20, 113.81s/it]

‚úÖ Chunk 1: Extracted 68 triples.


Extracting triples:  12%|‚ñà‚ñè        | 2/17 [03:35<26:36, 106.41s/it]

‚úÖ Chunk 2: Extracted 114 triples.


Extracting triples:  18%|‚ñà‚ñä        | 3/17 [04:52<21:45, 93.21s/it] 

‚úÖ Chunk 3: Extracted 79 triples.


Extracting triples:  24%|‚ñà‚ñà‚ñé       | 4/17 [05:58<17:51, 82.44s/it]

‚úÖ Chunk 4: Extracted 93 triples.


Extracting triples:  29%|‚ñà‚ñà‚ñâ       | 5/17 [07:08<15:33, 77.83s/it]

‚úÖ Chunk 5: Extracted 120 triples.


Extracting triples:  35%|‚ñà‚ñà‚ñà‚ñå      | 6/17 [08:20<13:55, 75.96s/it]

‚úÖ Chunk 6: Extracted 103 triples.


Extracting triples:  41%|‚ñà‚ñà‚ñà‚ñà      | 7/17 [09:22<11:54, 71.48s/it]

‚úÖ Chunk 7: Extracted 71 triples.


Extracting triples:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 8/17 [11:19<12:51, 85.75s/it]

‚úÖ Chunk 8: Extracted 90 triples.


Extracting triples:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 9/17 [14:13<15:09, 113.64s/it]

‚úÖ Chunk 9: Extracted 70 triples.


Extracting triples:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 10/17 [16:04<13:08, 112.69s/it]

‚úÖ Chunk 10: Extracted 90 triples.


Extracting triples:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 11/17 [17:09<09:48, 98.01s/it] 

‚úÖ Chunk 11: Extracted 85 triples.


Extracting triples:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 12/17 [18:23<07:34, 90.87s/it]

‚úÖ Chunk 12: Extracted 67 triples.


Extracting triples:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 13/17 [19:44<05:51, 87.87s/it]

‚úÖ Chunk 13: Extracted 64 triples.


Extracting triples:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 14/17 [21:12<04:23, 87.85s/it]

‚úÖ Chunk 14: Extracted 154 triples.


Extracting triples:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 15/17 [22:27<02:47, 83.80s/it]

‚úÖ Chunk 15: Extracted 91 triples.


Extracting triples:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 16/17 [24:28<01:35, 95.04s/it]

‚úÖ Chunk 16: Extracted 88 triples.


Extracting triples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17/17 [25:10<00:00, 88.87s/it]

‚úÖ Chunk 17: Extracted 51 triples.

‚úÖ Successfully saved 1498 triples to triplets.json





In [11]:
!pip install networkx pyvis


Defaulting to user installation because normal site-packages is not writeable
Collecting networkx
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting ipython>=5.3.0 (from pyvis)
  Downloading ipython-9.6.0-py3-none-any.whl.metadata (4.4 kB)
Collecting jinja2>=2.9.6 (from pyvis)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-4.1.1-py3-none-any.whl.metadata (8.1 kB)
Collecting colorama (from ipython>=5.3.0->pyvis)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting decorator (from ipython>=5.3.0->pyvis)
  Downloading decorator-5.2.1-py3-none-any.whl.metadata (3.9 kB)
Collecting ipython-pygments-lexers (from ipython>=5.3.0->pyvis)
  Downloading ipython_pygments_lexers-1.1.1-py3-none-any.whl.metadata (1.1 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Python314\python.exe -m pip install --upgrade pip


In [16]:
import json
import networkx as nx
from pyvis.network import Network

# ============ CONFIG ============
triplets_file = "triplets.json"        # Input from previous step
output_html = "knowledge_graph.html"   # Interactive output file
# =================================


def _clean_field(record, key):
    """Return record[key] as a whitespace-trimmed string.

    Returns "" when `record` is not a dict or the value is missing or None,
    so malformed entries are skipped by the emptiness check below instead of
    raising AttributeError.
    """
    if not isinstance(record, dict):
        return ""
    value = record.get(key)
    return "" if value is None else str(value).strip()


# --- load triples ---
with open(triplets_file, "r", encoding="utf-8") as f:
    triples = json.load(f)

print(f"üìä Loaded {len(triples)} triples.")

# --- collect relations per (subject, object) pair ---
# A DiGraph holds at most one edge per ordered node pair, and adding the same
# edge again overwrites its label. Distinct relations are therefore gathered
# first and merged into a single label so no relation is lost.
relations_by_edge = {}
for t in triples:
    subj = _clean_field(t, "subject")
    rel = _clean_field(t, "relation")
    obj = _clean_field(t, "object")

    if subj and obj and rel:
        relations = relations_by_edge.setdefault((subj, obj), [])
        if rel not in relations:
            relations.append(rel)

# --- initialize directed graph ---
G = nx.DiGraph()

# --- add nodes and edges ---
for (subj, obj), relations in relations_by_edge.items():
    G.add_node(subj, title=subj)
    G.add_node(obj, title=obj)
    G.add_edge(subj, obj, label="; ".join(relations))

print(f"‚úÖ Graph built with {len(G.nodes())} nodes and {len(G.edges())} edges.")

# --- visualize using PyVis ---
net = Network(
    height="800px",
    width="100%",
    bgcolor="#0d1117",
    font_color="white",
    directed=True,
    notebook=False
)

# Convert NetworkX graph -> PyVis graph
net.from_nx(G)

# Optional: adjust physics and layout for clarity
net.repulsion(
    node_distance=120,
    central_gravity=0.33,
    spring_length=110,
    spring_strength=0.10,
    damping=0.95
)

# Save the visualization
net.write_html(output_html)
print(f"üåê Interactive knowledge graph saved to {output_html}")
print("üëâ Open it in your browser to explore.")


üìä Loaded 1498 triples.
‚úÖ Graph built with 2104 nodes and 1488 edges.
üåê Interactive knowledge graph saved to knowledge_graph.html
üëâ Open it in your browser to explore.
