In [1]:
!pip install groq pandas


Collecting groq
  Downloading groq-0.34.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.34.1-py3-none-any.whl (136 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.0/136.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.34.1


In [2]:
import os, json, re
import pandas as pd
from groq import Groq
from getpass import getpass


In [7]:
api = getpass("Enter GROQ_API_KEY: ")
os.environ["GROQ_API_KEY"] = api

client = Groq(api_key=os.environ["GROQ_API_KEY"])
print(client.models.list())  # check key


Enter GROQ_API_KEY: ··········
ModelListResponse(data=[Model(id='whisper-large-v3', created=1693721698, object='model', owned_by='OpenAI', active=True, context_window=448, public_apps=None, max_completion_tokens=448), Model(id='meta-llama/llama-guard-4-12b', created=1746743847, object='model', owned_by='Meta', active=True, context_window=131072, public_apps=None, max_completion_tokens=1024), Model(id='whisper-large-v3-turbo', created=1728413088, object='model', owned_by='OpenAI', active=True, context_window=448, public_apps=None, max_completion_tokens=448), Model(id='meta-llama/llama-4-maverick-17b-128e-instruct', created=1743877158, object='model', owned_by='Meta', active=True, context_window=131072, public_apps=None, max_completion_tokens=8192), Model(id='moonshotai/kimi-k2-instruct-0905', created=1757046093, object='model', owned_by='Moonshot AI', active=True, context_window=262144, public_apps=None, max_completion_tokens=16384), Model(id='meta-llama/llama-4-scout-17b-16e-instruct',

In [8]:
files = {
    "faculty": "faculty_page.txt",
    "phd": "phd_iiserb_page_clean.txt",
    "postdocs": "postdocs_page_clean.txt",
    "research_groups": "researchGroups_clean.txt",
    "bsms_3rd": "bsms_3rd_year_page_clean.txt",
    "bsms_4th": "bsms_4th_year_page_clean.txt",
    "bsms_5th": "ms_5th_year_page_clean.txt"
}

combined_text = ""

for label, file in files.items():
    with open(file, "r", encoding="utf-8") as f:
        text = f.read().strip()
        combined_text += f"\n\n### PAGE: {label.upper()} ###\n{text}"


In [9]:
print("Loaded pages:", list(files.keys()))
print(combined_text[:1000])


Loaded pages: ['faculty', 'phd', 'postdocs', 'research_groups', 'bsms_3rd', 'bsms_4th', 'bsms_5th']


### PAGE: FACULTY ###
----------------------------------------
Name: Vaibhav Kumar
Role: Assistant Professor
Email(s): vaibhav@iiserb.ac.in
Phone: +91 755 269 2681
Research Areas:
 - Geospatial Artificial Intelligence(GeoAI)
 - 3D GIS
 - Urban Informatics. HomePage
Homepage: https://sites.google.com/view/vaibhavkumar1/home
----------------------------------------

----------------------------------------
Name: Tanmay Basu [Dept. Head]
Role: Assistant Professor
Email(s): hod_dse@iiserb.ac.in,, tanmay@iiserb.ac.in
Phone: +91 755 269 2683
Research Areas:
 - Biomedical Informatics
 - Information Extraction
 - Machine Learning
 - Natural Language Processing (NLP)
 - Text Mining. HomePage
Homepage: https://sites.google.com/view/tanmaybasu/
----------------------------------------

----------------------------------------
Name: Jasabanta Patro
Role: Assistant Professor
Email(s): jpatro@iiserb

adding schema

In [10]:
schema_prompt = """
You are extracting structured entities to build a clean Knowledge Graph for IISER Bhopal.

# =====================================================
#                  JSON NODE SCHEMA
# =====================================================

{
  "Student": {
    "id": "unique student identifier",
    "name": "full name",
    "type": "BS | MS | PhD | PostDoc",
    "roll_no": "official roll number",
    "email": "email ID",
    "department": "department ID",
    "linkedin": "LinkedIn URL",
    "github": "GitHub URL",
    "research_group": "research group ID",
    "guide": "faculty ID"
  },

  "Faculty": {
    "id": "unique faculty identifier",
    "name": "full name",
    "email": "email ID",
    "department": "department ID",
    "position": "Professor | Associate Professor | Assistant Professor",
    "joined_year": "year of joining"
  },

  "ResearchGroup": {
    "id": "unique research group ID",
    "name": "research group name",
    "department": "department ID",
    "head": "faculty ID",
    "website": "homepage URL"
  },

  "Department": {
    "id": "unique department ID",
    "name": "department name",
    "institute": "institute ID",
    "hod": "faculty ID"
  },

  "Institute": {
    "id": "unique institute ID",
    "name": "institute name",
    "director": "director name",
    "location": "city, state",
    "established": "year"
  }
}

# =====================================================
#                RELATIONSHIP SCHEMA
# =====================================================

{
  "belongsTo": {
    "subject": ["Student", "Faculty", "ResearchGroup"],
    "object": ["Department"]
  },
  "partOf": {
    "subject": ["Department"],
    "object": ["Institute"]
  },
  "heads": {
    "subject": ["Faculty"],
    "object": ["Department", "ResearchGroup"]
  },
  "guidedBy": {
    "subject": ["Student"],
    "object": ["Faculty"]
  },
  "memberOf": {
    "subject": ["Student"],
    "object": ["ResearchGroup"]
  }
}

# =====================================================
# OUTPUT FORMAT
# =====================================================

Return ONLY a JSON LIST.
Each item must be:

{
  "node": {...},
  "edges": [
     {"relation": "...", "from": "...", "to": "..."}
  ]
}

No explanation.
"""


Chunk Text

In [11]:
def chunk_text(text, max_len=2000):
    return [text[i:i+max_len] for i in range(0, len(text), max_len)]

chunks = chunk_text(combined_text, 2000)
print("Total chunks:", len(chunks))


Total chunks: 48


Run Groq + Extract KG According to Schema

In [12]:
all_nodes = []
all_edges = []

for idx, chunk in enumerate(chunks):
    print(f"\nProcessing chunk {idx+1}/{len(chunks)}")

    prompt = schema_prompt + "\n\n### TEXT ###\n" + chunk

    try:
        resp = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=2400
        )
    except Exception as e:
        print("API ERROR:", e)
        continue

    out = resp.choices[0].message.content
    match = re.search(r"\[.*\]", out, re.DOTALL)

    if not match:
        print("⚠️ No JSON → skipped")
        continue

    try:
        data = json.loads(match.group(0))
    except:
        print("⚠️ Invalid JSON → skipped")
        continue

    for item in data:
        if "node" in item:
            all_nodes.append(item["node"])
        for e in item.get("edges", []):
            all_edges.append(e)



Processing chunk 1/48

Processing chunk 2/48

Processing chunk 3/48

Processing chunk 4/48

Processing chunk 5/48

Processing chunk 6/48

Processing chunk 7/48

Processing chunk 8/48

Processing chunk 9/48

Processing chunk 10/48

Processing chunk 11/48

Processing chunk 12/48
⚠️ Invalid JSON → skipped

Processing chunk 13/48

Processing chunk 14/48
⚠️ Invalid JSON → skipped

Processing chunk 15/48
⚠️ Invalid JSON → skipped

Processing chunk 16/48
⚠️ Invalid JSON → skipped

Processing chunk 17/48

Processing chunk 18/48

Processing chunk 19/48

Processing chunk 20/48

Processing chunk 21/48

Processing chunk 22/48

Processing chunk 23/48

Processing chunk 24/48

Processing chunk 25/48

Processing chunk 26/48

Processing chunk 27/48

Processing chunk 28/48
⚠️ Invalid JSON → skipped

Processing chunk 29/48

Processing chunk 30/48

Processing chunk 31/48

Processing chunk 32/48

Processing chunk 33/48

Processing chunk 34/48

Processing chunk 35/48

Processing chunk 36/48

Processing chu

Deduplicate Nodes + Edges

In [13]:
unique_nodes = {}
for n in all_nodes:
    if "id" in n:
        unique_nodes[n["id"]] = n

nodes = list(unique_nodes.values())

seen = set()
edges = []
for e in all_edges:
    if {"from","relation","to"} <= e.keys():
        key = (e["from"], e["relation"], e["to"])
        if key not in seen:
            seen.add(key)
            edges.append(e)

print("Final Nodes:", len(nodes))
print("Final Edges:", len(edges))


Final Nodes: 206
Final Edges: 249


Save Final KG

In [14]:
json.dump({"nodes": nodes, "edges": edges}, open("dse_kg.json","w"), indent=2)
print("Saved dse_kg.json")

pd.DataFrame(nodes).to_csv("nodes.csv", index=False)
pd.DataFrame(edges).to_csv("edges.csv", index=False)

print("Saved nodes.csv and edges.csv")


Saved dse_kg.json
Saved nodes.csv and edges.csv
