<a href="https://colab.research.google.com/github/xingji1337/week6HO/blob/tracka/Week6_1_GraphRAG_Build_fromCSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week6-1 (from CSV): Graph-RAG Build

**Goal:** Load a Week 5 corpus CSV (e.g., `week5_corpus.csv`), clean it, extract entities, and build a graph.
Outputs:
- `fromcsv_outputs/week6_corpus_clean.csv`
- `fromcsv_outputs/graph_store.json`
- Updates `rag_graph_run_config.json` to point to the new graph.


## 0) Environment & installs
- spaCy baseline NER (switch to scispaCy/LLM later if needed).


In [1]:

# Optional installs (uncomment when running in a fresh environment)
#!pip install -q pandas spacy networkx sentence-transformers tqdm
#!python -m spacy download en_core_web_sm


## 1) Config

In [2]:

from pathlib import Path
import json

CSV_PATH = "/mnt/data/week5_corpus.csv"   # <-- change if needed
TEXT_COLUMN = None
DOC_ID_COLUMN = None

OUT_DIR = Path("fromcsv_outputs")
OUT_DIR.mkdir(exist_ok=True, parents=True)

GRAPH_STORE = str(OUT_DIR / "graph_store.json")
CLEAN_CSV = str(OUT_DIR / "week6_corpus_clean.csv")

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
NEIGHBOR_HOPS = 2
CHUNK_MAX_LEN = 600
MIN_CHARS = 250
MAX_CHARS = 10000
MIN_ALPHA_RATIO = 0.55


## 2) Load CSV + clean

In [6]:
import re
import pandas as pd

def normalize_text(t: str) -> str:
    if not isinstance(t, str): t = str(t)
    t = t.replace("\r\n","\n").replace("\r","\n")
    t = re.sub(r"-\n(\w)", r"\1", t)   # de-hyphenate
    t = re.sub(r"\n{2,}", "\n\n", t)
    t = re.sub(r"[ \t]{2,}", " ", t)
    return t.strip()

def alpha_ratio(t: str) -> float:
    if not t: return 0.0
    alphas = sum(ch.isalpha() for ch in t)
    return alphas / max(1, len(t))

def auto_load_csv(path: str, text_col_hint=None, doc_id_col_hint=None) -> pd.DataFrame:
    df = pd.read_csv(path)
    if text_col_hint is None:
        for cand in ["text", "content", "passage", "body", "chunk", "page_text"]:
            if cand in df.columns:
                text_col_hint = cand
                break
        if text_col_hint is None:
            lens = {c: df[c].astype(str).str.len().mean() for c in df.columns}
            text_col_hint = max(lens, key=lens.get)

    if doc_id_col_hint is not None and doc_id_col_hint in df.columns:
        df = df.rename(columns={text_col_hint: "text", doc_id_col_hint: "doc_id"})
    else:
        df = df.rename(columns={text_col_hint: "text"})
        # Only insert doc_id if it doesn't exist
        if "doc_id" not in df.columns:
            df.insert(0, "doc_id", range(len(df)))


    df = df[["doc_id", "text"]]
    return df

raw = auto_load_csv(CSV_PATH, TEXT_COLUMN, DOC_ID_COLUMN)
print(f"Loaded CSV: {CSV_PATH} with {len(raw)} rows")

raw["text"] = raw["text"].map(normalize_text)
raw["chars"] = raw["text"].str.len()
raw["alpha_ratio"] = raw["text"].map(alpha_ratio)

good = (raw["chars"].between(MIN_CHARS, MAX_CHARS)) & (raw["alpha_ratio"] >= MIN_ALPHA_RATIO)
clean = raw.loc[good, ["doc_id","text"]].drop_duplicates(subset=["text"]).reset_index(drop=True)
print(f"Filtered & deduped: {len(raw)} -> {len(clean)}")

clean.to_csv(CLEAN_CSV, index=False, encoding="utf-8")
print("Wrote cleaned CSV:", CLEAN_CSV)
clean.head(3)

Loaded CSV: /mnt/data/week5_corpus.csv with 348 rows
Filtered & deduped: 348 -> 341
Wrote cleaned CSV: fromcsv_outputs/week6_corpus_clean.csv


Unnamed: 0,doc_id,text
0,Complete home repair with 350 projects and 23...,►BLACKS DECKER'A .. .. - _ DAVENPORT PUBLIC LI...
1,Complete home repair with 350 projects and 23...,Roller . .. .69 Ceilings & Walls. Painting Win...
2,Complete home repair with 350 projects and 23...,projects from start to finish. • Color photogr...


## 3) Chunk text

In [7]:

from typing import List

def chunk_text(text: str, max_len: int = CHUNK_MAX_LEN) -> List[str]:
    words = text.split()
    chunks, cur, cur_len = [], [], 0
    for w in words:
        cur.append(w)
        cur_len += len(w) + 1
        if cur_len >= max_len:
            chunks.append(" ".join(cur))
            cur, cur_len = [], 0
    if cur:
        chunks.append(" ".join(cur))
    return chunks

chunks = []
for _, row in clean.iterrows():
    spans = chunk_text(row["text"], CHUNK_MAX_LEN)
    for si, span in enumerate(spans):
        chunks.append({"doc_id": str(row["doc_id"]), "span_id": si, "span": span})

import pandas as pd
df_spans = pd.DataFrame(chunks)
print("Total spans:", len(df_spans), "from documents:", len(clean))
df_spans.head(3)


Total spans: 2794 from documents: 341


Unnamed: 0,doc_id,span_id,span
0,Complete home repair with 350 projects and 23...,0,►BLACKS DECKER'A .. .. - _ DAVENPORT PUBLIC LI...
1,Complete home repair with 350 projects and 23...,1,"Illustrators: Elroy Balgaard, Patricia Goar, E..."
2,Complete home repair with 350 projects and 23...,2,President/CEO: Ken Fund Publisher: Bryan Trand...


## 4) Entity extraction (spaCy)

In [8]:

import spacy
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import sys, subprocess
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=False)
    nlp = spacy.load("en_core_web_sm")

def extract_entities(text: str):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

extract_entities(df_spans["span"].iloc[0]) if len(df_spans) else []


[('LIBRARY', 'PERSON'),
 ('32^', 'CARDINAL'),
 ('MAIN STREET DAVENPORT', 'ORG'),
 ('Creative Publishing', 'ORG'),
 ('MINNEAPOLIS', 'GPE'),
 ('MINNESOTA', 'ORG'),
 ('Tim Himsel Executive', 'ORG'),
 ('Bryan Trandem', 'PERSON'),
 ('Jerri Farris', 'PERSON'),
 ('Michelle Skudlarek Assisting Project Managers', 'PERSON'),
 ('Jeanette Moss McCurdy', 'PERSON'),
 ('Tracy Stanley', 'PERSON'),
 ('Thomas G. Lemmer', 'PERSON'),
 ('Brett Martin', 'PERSON'),
 ('Karen Ruth Technical', 'PERSON'),
 ('Timothy Bro', 'PERSON'),
 ('Robert Weaver Copy', 'PERSON'),
 ('Janice Cauley', 'PERSON'),
 ('Tracy Stanley', 'PERSON'),
 ('Dave Schelitzche Art', 'PERSON'),
 ('Jon Simpson Technical', 'PERSON')]

## 5) Build graph

In [11]:
import json, networkx as nx
from tqdm import tqdm

GraphRAG = None
try:
    from modules.graph_rag import GraphRAG as _GraphRAG
    GraphRAG = _GraphRAG
    print("Using modules.graph_rag.GraphRAG")
except Exception as e:
    print("modules.graph_rag not found, using inline GraphRAG.", e)
    from sentence_transformers import SentenceTransformer
    class GraphRAG:
        def __init__(self, model_name: str, graph_store_path: str, neighbor_hops: int = 2):
            self.model = SentenceTransformer(model_name)
            self.graph = nx.MultiDiGraph()
            self.graph_store_path = graph_store_path
            self.hops = neighbor_hops
        def add_entity(self, node_id: str, label: str, ntype: str, doc_id: str, span: str):
            if node_id not in self.graph:
                self.graph.add_node(node_id, id=node_id, label=label, type=ntype, evidence=[])
            ev = list(self.graph.nodes[node_id].get("evidence", [])) # Explicitly cast to list
            ev.append({"doc_id": doc_id, "span": span})
            self.graph.nodes[node_id]["evidence"] = ev
        def add_relation(self, src_id: str, dst_id: str, rtype: str, doc_id: str, span: str):
            self.graph.add_edge(src_id, dst_id, type=rtype, evidence={"doc_id": doc_id, "span": span})
        def save_graph(self) -> None:
            nodes = [dict(**self.graph.nodes[n]) for n in self.graph.nodes]
            edges = [dict(src=u, dst=v, **d) for u, v, d in self.graph.edges(data=True)]
            with open(self.graph_store_path, "w", encoding="utf-8") as f:
                json.dump({"nodes": nodes, "edges": edges}, f, ensure_ascii=False, indent=2)

gr = GraphRAG(EMBED_MODEL, graph_store_path=GRAPH_STORE, neighbor_hops=NEIGHBOR_HOPS)

for _, row in tqdm(df_spans.iterrows(), total=len(df_spans)):
    doc_id = row["doc_id"]
    span = row["span"]
    ents = extract_entities(span)
    ids = []
    for e_text, e_type in ents:
        nid = f"{e_text.lower()}::{e_type}"
        gr.add_entity(nid, e_text, e_type, doc_id, span)
        ids.append(nid)
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            gr.add_relation(ids[i], ids[j], "CO_OCCURS", doc_id, span)

gr.save_graph()
print("Saved graph to:", GRAPH_STORE)

modules.graph_rag not found, using inline GraphRAG. No module named 'modules'


100%|██████████| 2794/2794 [00:51<00:00, 53.89it/s]


Saved graph to: fromcsv_outputs/graph_store.json


## 6) Update run config

In [12]:

cfg_path = Path("rag_graph_run_config.json")
if cfg_path.exists():
    cfg = json.loads(cfg_path.read_text())
else:
    cfg = {}

persist = cfg.get("persist_paths", {})
persist["graph_store"] = GRAPH_STORE
cfg["persist_paths"] = persist

graph_cfg = cfg.get("graph", {})
graph_cfg["neighbor_hops"] = NEIGHBOR_HOPS
cfg["graph"] = graph_cfg

cfg_path.write_text(json.dumps(cfg, indent=2))
print("Updated rag_graph_run_config.json -> persist_paths.graph_store =", GRAPH_STORE)


Updated rag_graph_run_config.json -> persist_paths.graph_store = fromcsv_outputs/graph_store.json


## 7) Sanity query

In [13]:

try:
    out = gr.answer_with_graph("Which dataset was used to evaluate Method A?")
    out
except Exception as e:
    print("If using inline GraphRAG, 'answer_with_graph' may not exist. Use modules.graph_rag for full feature.")
    print("Error:", e)


If using inline GraphRAG, 'answer_with_graph' may not exist. Use modules.graph_rag for full feature.
Error: 'GraphRAG' object has no attribute 'answer_with_graph'
