In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!git clone https://github.com/swaptikchowdhury/indicator-rag.git
%cd indicator-rag

fatal: destination path 'indicator-rag' already exists and is not an empty directory.
/content/indicator-rag


In [3]:
#Quick GPU check
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [4]:
!pip install -q pymupdf
!pip install -q sentence-transformers faiss-cpu
!pip install -q transformers accelerate bitsandbytes sentencepiece
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes

Found existing installation: bitsandbytes 0.49.0
Uninstalling bitsandbytes-0.49.0:
  Successfully uninstalled bitsandbytes-0.49.0
Collecting bitsandbytes
  Using cached bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Using cached bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


In [5]:
import os, json, re, hashlib, time, textwrap, faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
from tqdm import tqdm
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

#### Setup for trying the ingesting the pdf --> embedding -->FAISS (experimental)

In [7]:
# Ingest all pdfs --> create chunks on Google Drive

# paths (Drive)
DATA_ROOT = "/content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data"
PDF_DIR = os.path.join(DATA_ROOT, "pdfs")
WORK_DIR = os.path.join(DATA_ROOT, "work")
os.makedirs(WORK_DIR, exist_ok=True)

CHUNKS_PATH = os.path.join(WORK_DIR, "chunks.jsonl")

# helpers
def clean_text(s: str) -> str:
    s = s.replace("\x00", " ") # Removes junk Character
    s = re.sub(r"\s+", " ", s).strip() # collapses multiple spaces into one & trim edges
    return s

# Gives a deterministic id based on filename, # number and chunk index
# same PDF + page + chunk → same ID every run. Crucial for citations, and debugging later (“why did this chunk influence weight X?”)

def stable_id(*parts: str) -> str:
    h = hashlib.sha1("||".join(parts).encode("utf-8")).hexdigest()
    return h[:16]

def chunk_text(text: str, max_chars: int = 1400, overlap: int = 200) -> list[str]:
    '''
    Slices text into ~ 1400 character chunks

    '''
    if not text:
        return []
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(n, start + max_chars)
        chunk = text[start:end].strip() #take a slice & clean whitespace
        if chunk:
            chunks.append(chunk)
        start = end - overlap #move forward, but keep overlap
        if start < 0:
            start = 0
        if end == n:
            break
    return chunks

# main
pdf_files = [f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf")]
print(f"Found {len(pdf_files)} PDFs in {PDF_DIR}")

written = 0
with open(CHUNKS_PATH, "w", encoding="utf-8") as out:
    for fname in tqdm(pdf_files, desc="PDFs"):
        pdf_path = os.path.join(PDF_DIR, fname)
        doc = fitz.open(pdf_path) #open pdf
        for pageno in range(len(doc)): # Loop over pages
            page_text = clean_text(doc[pageno].get_text("text")) # extract text. Pulls raw text and clean it
            for j, chunk in enumerate(chunk_text(page_text)):
                rec = {
                    "chunk_id": stable_id(fname, str(pageno), str(j)),
                    "doc": fname,
                    "page": pageno,
                    "chunk_index": j,
                    "text": chunk,
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                written += 1
        doc.close()

print("Wrote chunks:", written)
print("Saved to:", CHUNKS_PATH)


Found 21 PDFs in /content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/pdfs


PDFs: 100%|██████████| 21/21 [00:03<00:00,  6.04it/s]

Wrote chunks: 1376
Saved to: /content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/work/chunks.jsonl





In [8]:
# Building Embeddings + FAISS Index for retrieval

DATA_ROOT = "/content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data"
WORK_DIR = os.path.join(DATA_ROOT, "work")

CHUNKS_PATH = os.path.join(WORK_DIR, "chunks.jsonl")
INDEX_PATH = os.path.join(WORK_DIR, "faiss.index")
META_PATH = os.path.join(WORK_DIR, "meta.jsonl")

# Embedding model (open-source, fast, good baseline)
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# 1) Load chunk texts + metadata
texts = []
meta = []

with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        texts.append(rec["text"]) # List of Chunk Texts
        meta.append({ # This is chunk metadata
            "chunk_id": rec["chunk_id"],
            "doc": rec["doc"],
            "page": rec["page"],
            "chunk_index": rec["chunk_index"],
        })

print("Total chunks:", len(texts))
#Note: The texts[i] corresponds to meta[i] and will correspond to the vector stored at position i in FAISS.
#FAISS doesnt store metadata internally. As such we need to store metadata separately
#when FAISS says “top hit = vector #1234,” you look up meta[1234] to know where it came from

# 2) Embed
model = SentenceTransformer(EMB_MODEL_NAME)
emb = model.encode(
    texts,
    batch_size=64, #process 64 chunks at once (speed vs memory)
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True #makes each vector have length 1 because then cosine similarity between vectors = dot product (inner product) & lets us use a fast FAISS index type
).astype("float32") #FAISS expects float32 for best compatibility and memory efficiency

# 3) Build FAISS (Facebook AI similarity Search) index (cosine similarity via inner product on normalized vectors)
#FAISS stores all vectors and is thevector index library
dim = emb.shape[1] #dim is the embedding dimension (e.g., 384 for MiniLM).
index = faiss.IndexFlatIP(dim) #"flat” index: no compression, no clustering (simplest). IP: inner product similarity
index.add(emb) #adds every chunk vector to the index.

# 4) Save index + metadata
faiss.write_index(index, INDEX_PATH)

with open(META_PATH, "w", encoding="utf-8") as f:
    for m in meta:
        f.write(json.dumps(m, ensure_ascii=False) + "\n")

print("Embedding model:", EMB_MODEL_NAME)
print("FAISS index size:", index.ntotal)
print("Saved index:", INDEX_PATH)
print("Saved metadata:", META_PATH)


Total chunks: 1376


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Embedding model: sentence-transformers/all-MiniLM-L6-v2
FAISS index size: 1376
Saved index: /content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/work/faiss.index
Saved metadata: /content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/work/meta.jsonl


In [None]:
# Running a TRIAL retrieval query to see if it works
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# 1) Load FAISS index in to the memory
index = faiss.read_index(INDEX_PATH)

# 2) Load metadata (aligned with FAISS vector positions).Open meta.jsonl,Read it line by line, Convert each line (JSON) into a Python dict, & Append it to a list called meta
meta = []
with open(META_PATH, "r", encoding="utf-8") as f:
    for line in f:
        meta.append(json.loads(line))

# 3) Build a fast lookup from chunk_id -> text (for display)
chunk_text_by_id = {}
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        chunk_text_by_id[rec["chunk_id"]] = rec["text"]

# 4) Embed your query
model = SentenceTransformer(EMB_MODEL_NAME)

QUERY = "community impact indicator: health care and social assistance employment share definition"
qvec = model.encode([QUERY], normalize_embeddings=True).astype("float32")

# 5) Search
k = 5
scores, ids = index.search(qvec, k) #Among all stored chunk vectors, which k are closest to this query vector?ids[0] → indices of top-k vectors,scores[0] → similarity scores (higher = closer)

print("Query:", QUERY)
print("\nTop results:\n")

for rank, (idx, score) in enumerate(zip(ids[0], scores[0]), start=1):
    m = meta[idx]
    cid = m["chunk_id"]
    text = chunk_text_by_id.get(cid, "")
    preview = text[:500] + ("..." if len(text) > 500 else "")
    print(f"{rank}) score={score:.4f}  doc={m['doc']}  page={m['page']}  chunk_id={cid}")
    print("   preview:", preview)
    print()


#### PROMPT DEVELOPMENT AND FINALISING THE LLM PIPELINE

> Add blockquote



In [9]:
#Function for creating chunks for using in prompts

DATA_ROOT = "/content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data"
WORK_DIR = os.path.join(DATA_ROOT, "work")

CHUNKS_PATH = os.path.join(WORK_DIR, "chunks.jsonl")
INDEX_PATH = os.path.join(WORK_DIR, "faiss.index")
META_PATH = os.path.join(WORK_DIR, "meta.jsonl")

# embedding model (must match the one used to build the FAISS index)
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# load index + metadata + chunk text
index = faiss.read_index(INDEX_PATH)

meta = []
with open(META_PATH, "r", encoding="utf-8") as f:
    for line in f:
        meta.append(json.loads(line))

chunk_text_by_id = {}
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        chunk_text_by_id[rec["chunk_id"]] = rec["text"]

embedder = SentenceTransformer(EMB_MODEL_NAME)

def retrieve_chunks(query: str, k: int = 5):
    """Returns list of dicts: chunk_id, doc, page, chunk_index, score, text."""
    qvec = embedder.encode([query], normalize_embeddings=True).astype("float32")
    scores, ids = index.search(qvec, k)

    out = []
    for score, idx in zip(scores[0], ids[0]):
        m = meta[idx]
        cid = m["chunk_id"]
        out.append({
            "chunk_id": cid,
            "doc": m["doc"],
            "page": m["page"],
            "chunk_index": m["chunk_index"],
            "score": float(score),
            "text": chunk_text_by_id.get(cid, ""),
        })
    return out

print("Loaded:", {"chunks": len(chunk_text_by_id), "meta": len(meta), "index_ntotal": index.ntotal})


Loaded: {'chunks': 1376, 'meta': 1376, 'index_ntotal': 1376}


In [None]:
#test_query = "Community Impact: Availability/strength of local social, civic, and institutional support"
#hits = retrieve_chunks(test_query, k=5)
#[(h["score"], h["doc"], h["page"], h["chunk_id"]) for h in hits]


In [10]:
#Minimal cache (JSONL) + fingerprint key + skip-if-done
#creates an append-only cache file in Drive and lets you check “did we already run this exact item × justice with these retrieved chunks?”

os.makedirs(WORK_DIR, exist_ok=True)

CACHE_PATH = os.path.join(WORK_DIR, "runs_cache.jsonl")
MODEL_NAME = "YOUR_MODEL_NAME_HERE"   # fill later
PROMPT_VERSION = "v1"
RETRIEVAL_K = 50

def fingerprint(item_type, item_name, justice_type, retrieved_chunk_ids, decoding):
    payload = {
        "model": MODEL_NAME,
        "prompt_version": PROMPT_VERSION,
        "retrieval_k": RETRIEVAL_K,
        "item_type": item_type,
        "item_name": item_name,
        "justice_type": justice_type,
        "retrieved_chunk_ids": retrieved_chunk_ids,  # ordered list
        "decoding": decoding,  # dict: temperature, max_new_tokens, etc.
    }
    s = json.dumps(payload, sort_keys=True, ensure_ascii=True)
    return hashlib.sha1(s.encode("utf-8")).hexdigest(), payload

def load_cached_keys():
    keys = set()
    if not os.path.exists(CACHE_PATH):
        return keys
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            keys.add(rec["run_id"])
    return keys

def append_cache_record(run_id, payload, result):
    rec = {
        "run_id": run_id,
        "ts_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "payload": payload,
        "result": result,  # your flat output record later
    }
    with open(CACHE_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

cached_keys = load_cached_keys()
print("Cache path:", CACHE_PATH)
print("Cached runs:", len(cached_keys))


Cache path: /content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/work/runs_cache.jsonl
Cached runs: 1


In [11]:
INDICATOR_XLSX = "/content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/data/9-14 Metrics_Def_LLM.xlsx"

df_ind = pd.read_excel(INDICATOR_XLSX)

# basic clean
df_ind["Indicator"] = df_ind["Indicator"].astype(str).str.strip()
df_ind["Definition"] = df_ind["Definition"].astype(str).str.strip()

indicator_defs = dict(zip(df_ind["Indicator"], df_ind["Definition"]))

print("Loaded indicator defs:", len(indicator_defs))
print("Sample:", list(indicator_defs.items())[:3])


Loaded indicator defs: 16
Sample: [('Total Employment  By Powerplant', 'Employees in powerplant'), ('Location Quotient', 'The location quotient is the ratio of the local share of an industry’s employment (or other measure) to the national share of that same industry. Its being used as a metric for how important that powerplant job is in that county where the NG powerplant is'), ('Relative Compensation', 'Relative compensation is defined as the ratio of the median income of employees working in power plants to the per-capita personal income at the county or state level, expressed in dollars, which indicates how earnings in the power sector compare to the general income levels of the surrounding population.')]


In [12]:
# justice definitions (authoritative)
justice_defs = {
  "Utilitarianism": "An ethical theory that acts as a proxy for different types of 'individual preferences. Utilitarianism is usually defined as maximizing total welfare as defined by the goal. In this context, it will achieving the plant closures with minimum direct cost overall",
  "Sufficientarianism": "An ethical theory that is a proxy for different types of 'individual preferences.Under sufficientarianism, priority is given to ensuring that basic or decent levels of needs are met by providing a threshold of goods and services. In this context, ethical priority is placed on giving the greatest weight to the concerns of the worst off",
  "Egalitarianism": "An ethical theory that acts as a proxy for different types of 'individual preferences.Egalitarianism minimizes differences in obtaining resources and making sure that everyone receives the same quantity. In this context, each community bears even/proportionate costs and benefits of intervention",
}

# metric definitions (authoritative)
metric_defs = {
  "Economic Impact": "Effects on jobs, income, local business activity, and fiscal/tax base linked to plant operation/closure.Associated Indicators are Total Employment  By Powerplant, Location Quotient and Relative Compensation.",
  "Environmental Impacts": "Effects on air pollutants and greenhouse gases and related local environmental quality.Associated Indicators are SO2 Emissions,NOx Emission and CO2 Emissions.",
  "Community Impact": "Availability/strength of local social, civic, and institutional support. Associated indicators are Health care and social assistance and Religious, GrantMaking, Civic, Professional, and similar org.",
  "Affordability": "Costs borne by households and the system. Associated indicators are Energy Burden, Average Annual Energy Cost,Levelised Cost of Energy (LCOE).",
  "Powerplant Characteristics": "Plant technical/operational attributes. Associated indicators are Capacity, Age of Plant and Heat Rate.",
  "Living Standard": "Socioeconomic disadvantage and well-being status. Associated indicators are Disadvantage community and Area Deprivation Index.",
}



In [13]:
ALLOWED_WEIGHTS = ["High","Medium","Low","None"]
ALLOWED_PRIORITIES = [1, 2]

def build_prompt(
    item_type: str,           # "metric" or "indicator"
    item_name: str,
    justice_type: str,        # one of justice_defs keys
    retrieved_chunks: list,   # list of dicts from retrieve_chunks(...), length K
    max_reason_words: int = 60,
):
    if justice_type not in justice_defs:
        raise ValueError(f"Unknown justice_type: {justice_type}")
    if item_type not in ("metric", "indicator"):
        raise ValueError("item_type must be 'metric' or 'indicator'")

    if item_type == "metric":
        if item_name not in metric_defs:
            raise ValueError(f"Metric not found in metric_defs: {item_name}")
        item_definition = metric_defs[item_name]
    else:
        if item_name not in indicator_defs:
            raise ValueError(f"Indicator not found in indicator_defs: {item_name}")
        item_definition = indicator_defs[item_name]

    header = f"""
You are an expert evidence-bound annotator with strong reasoning capabilities.

DEFINITION LOCK:
- Use ONLY the exact definitions provided below.
- If a chunk contains different terminology, ignore it and use the provided definitions only.
- Justice type in this run: {justice_type}
- Allowed weight labels: {json.dumps(ALLOWED_WEIGHTS)}
- Priority levels allowed: {json.dumps(ALLOWED_PRIORITIES)}

IMPORTANT CONTEXT:
- You are evaluating IMPORTANCE for DECISION-MAKING about NATURAL GAS POWER PLANT PHASEOUT.
- Importance refers to how strongly this item should influence phaseout decisions under the given justice definition.

Authoritative justice definition:
{justice_type}: {justice_defs[justice_type]}

Item to evaluate:
- item_type: {item_type}
- item_name: {item_name}
- item_definition (authoritative): {item_definition}

Retrieved evidence (K={len(retrieved_chunks)}):
""".strip()

    parts = [textwrap.dedent(header)]

    for ch in retrieved_chunks:
        parts.append("--- CHUNK BEGIN ---")
        parts.append(f"chunk_id: {ch['chunk_id']}")
        parts.append("text:")
        parts.append(ch["text"])
        parts.append("--- CHUNK END ---")

    task = f"""
---
TASK:

WEIGHT MEANINGS (Importance scale):
- High: The item should strongly influence phaseout decisions under this justice definition; excluding it would likely change decisions or substantially weaken justice alignment.
- Medium: The item should meaningfully influence decisions, but is not decisive on its own; excluding it would moderately affect decisions or justice alignment.
- Low: The item is marginal for decision-making under this justice definition; excluding it would have little effect on decisions or justice alignment.
- None: The item is not relevant to decision-making under this justice definition in this context.

Assign exactly one weight for the item under {justice_type}:
- weight ∈ {json.dumps(ALLOWED_WEIGHTS)}
- priority ∈ {json.dumps(ALLOWED_PRIORITIES)}
- reasoning (≤ {max_reason_words} words)

EVIDENCE & PRIORITY RULES (ONLY TWO MODES):

Priority 1 — EVIDENCE-LED (Chunk-based):
- You may use priority = 1 ONLY if the retrieved evidence contains explicit textual support for the item_definition.
- Explicit textual support means the chunk text literally mentions a concrete element named in the item_definition
  (e.g., "health care", "social assistance", "civic organizations", "religious organizations",
  "institutional support", or clearly equivalent phrasing).
- Generic discussion of indicators, frameworks, validation, resilience theory, or methodology DOES NOT qualify.
- For priority = 1:
  - Your reasoning MUST be based ONLY on what is stated in the cited chunks.
  - Do NOT introduce general theory or assumptions beyond the chunk text.
  - cited_chunks MUST contain BETWEEN 3 AND 5 chunks that meet the above criterion.
  - If you cannot identify at least 3 such chunks, priority = 1 is NOT allowed.

Priority 2 — THEORY-LED (General reasoning):
- Use priority = 2 ONLY if NO retrieved chunk provides explicit textual support for the item_definition.
- For priority = 2:
  - cited_chunks MUST be an empty list [].
  - Your reasoning MUST rely on the justice definition and general normative reasoning,
    NOT on the retrieved evidence.

CITATION RULES (MANDATORY):
- If priority = 1:
  - cited_chunks MUST contain BETWEEN 3 AND 5 objects.
  - Each object MUST have EXACT keys: "chunk_id", "chunk_text".
  - chunk_text MUST be copied verbatim from the retrieved evidence.
- If priority = 2:
  - cited_chunks MUST be [].

CHUNK TEXT COPYING RULE (MANDATORY):
- Copy the entire chunk text exactly as shown between "text:" and "--- CHUNK END ---".
- Do NOT truncate, summarize, or paraphrase.

OUTPUT RULES (CRITICAL):
- Output ONLY a single valid JSON object.
- Do NOT include headings, explanations, labels, or any text outside the JSON.
- Do NOT add any keys beyond those defined in the schema.
- The JSON must be syntactically valid.

OUTPUT JSON SCHEMA (fill with real values):
{{
  "item_type": "{item_type}",
  "item_name": "{item_name}",
  "justice_type": "{justice_type}",
  "weight": "High|Medium|Low|None",
  "priority": 1|2,
  "reasoning": "string",
  "cited_chunks": []
}}

HARD STOP:
- Begin your response with the first character '{{'.
- End your response with the matching '}}'.
- Output NOTHING before or after the JSON object.
""".strip()

    parts.append(textwrap.dedent(task))
    return "\\n".join(parts)

print("FULL tightened build_prompt loaded.")



FULL tightened build_prompt loaded.


Selected Models:

1. Mistral-7B-Instruct-v0.2
Why? Very strong instruction following, Handles structured prompts + JSON output reliably, Good at reasoning over provided evidence, Widely cited in academic work (safe choice), Fast and stable on Colab (4-bit)

2. Gemma-7B-It
Why? Different training philosophy than Mistral, Strong at normative reasoning and abstractions, Often more conservative than Mistral (important contrast), Handles long context reasonably well

3. LLaMA-2-13B-Chat
why? Larger parameter count → different inductive bias, More verbose, sometimes more cautious, Commonly used in academic baselines, Strong general reasoning, weaker instruction rigidity (interesting tradeoff). A size-scaled comparison

In [14]:
MODEL_REGISTRY = {
    "mistral": {
        "name": "mistralai/Mistral-7B-Instruct-v0.2",
        "chat": False
    },
    "gemma": {
        "name": "google/gemma-7b-it",
        "chat": False
    },
    "llama2": {
        "name": "meta-llama/Llama-2-13b-chat-hf",
        "chat": True
    }
}


In [15]:
def load_model(model_key: str):
    cfg = MODEL_REGISTRY[model_key]
    tokenizer = AutoTokenizer.from_pretrained(cfg["name"], use_fast=False)

    model = AutoModelForCausalLM.from_pretrained(
        cfg["name"],
        device_map="auto",
        torch_dtype=torch.float16
    )
    return tokenizer, model

In [16]:
def run_llm(prompt: str, tokenizer, model, max_new_tokens=2000):
    messages = [
        {"role": "user", "content": prompt}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        out = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    generated_tokens = out[0][inputs.shape[1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


In [17]:
MODEL_KEY = "mistral"  # change to "gemma" or "llama2"

tokenizer, model = load_model(MODEL_KEY)

item_type = "metric"
item_name = "Community Impact"
justice_type = "Egalitarianism"

query = f"{item_name}. Definition: {metric_defs[item_name]}"
hits = retrieve_chunks(query, k=5)

prompt = build_prompt(
    item_type=item_type,
    item_name=item_name,
    justice_type=justice_type,
    retrieved_chunks=hits,
)

response = run_llm(prompt, tokenizer, model)
print(response)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 {
"item_type": "metric",
"item_name": "Community Impact",
"justice_type": "Egalitarianism",
"weight": "Medium",
"priority": 2,
"reasoning": "The egalitarian justice definition prioritizes minimizing differences in obtaining resources and ensuring even distribution of costs and benefits. The community impact metric, which measures the availability and strength of local social, civic, and institutional support, aligns with this goal as it contributes to the overall well-being and resilience of communities. However, the retrieved evidence does not provide explicit textual support for the community impact metric within the context of egalitarian justice. Instead, the evidence discusses the challenges and complexities of community resilience measurement and validation, emphasizing the importance of selecting appropriate indicators and addressing issues related to indicator validity and reliability. While this evidence does not directly support the weighting of the community impact metric u

In [26]:
# 0) Minimal model registry (edit to your chosen 3 models)

MODEL_REGISTRY = {
    "mistral": {"name": "mistralai/Mistral-7B-Instruct-v0.3"},
    "gemma":   {"name": "google/gemma-2-9b-it"},
    "llama2":  {"name": "meta-llama/Llama-2-7b-chat-hf"},
}

MODEL_KEY = "mistral"  # set this
MODEL_NAME = MODEL_REGISTRY[MODEL_KEY]["name"]  # keep cache honest

PROMPT_VERSION = "v1"
RETRIEVAL_K = 5  # MUST match what you actually pass into retrieve_chunks


# 1) Model load + run_llm (cleaned)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model(model_key: str):
    cfg = MODEL_REGISTRY[model_key]
    tokenizer = AutoTokenizer.from_pretrained(cfg["name"], use_fast=False)

    # Avoid pad warnings / weird truncation behavior
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        cfg["name"],
        device_map="auto",
        torch_dtype=torch.float16
    )
    model.eval()
    return tokenizer, model

def run_llm(prompt: str, tokenizer, model, max_new_tokens=2000):
    messages = [
        {"role": "user", "content": prompt}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        out = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    generated_tokens = out[0][inputs.shape[1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


# 2) Cache

CACHE_PATH = os.path.join(WORK_DIR, "runs_cache.jsonl")

def fingerprint(item_type, item_name, justice_type, retrieved_chunk_ids, decoding):
    payload = {
        "model": MODEL_NAME,
        "prompt_version": PROMPT_VERSION,
        "retrieval_k": RETRIEVAL_K,
        "item_type": item_type,
        "item_name": item_name,
        "justice_type": justice_type,
        "retrieved_chunk_ids": retrieved_chunk_ids,  # ordered list
        "decoding": decoding,
    }
    s = json.dumps(payload, sort_keys=True, ensure_ascii=True)
    return hashlib.sha1(s.encode("utf-8")).hexdigest(), payload

def load_cached_keys():
    keys = set()
    if not os.path.exists(CACHE_PATH):
        return keys
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            keys.add(rec["run_id"])
    return keys

def append_cache_record(run_id, payload, result):
    rec = {
        "run_id": run_id,
        "ts_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "payload": payload,
        "result": result,
    }
    with open(CACHE_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

cached_keys = load_cached_keys()
print("Cache:", CACHE_PATH, "cached runs:", len(cached_keys))


# 3) Robust JSON extraction

def extract_first_json_object(text: str) -> dict:
    """
    Finds the first {...} block and parses it as JSON.
    Sanitizes invalid control characters that can break json.loads.
    """
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in model output.")
    block = m.group(0).strip()

    # Remove ASCII control chars except \n \r \t (and even those can break if unescaped inside strings)
    # Most robust minimal approach: strip all control chars 0x00-0x1F
    block = re.sub(r"[\x00-\x1F]", "", block)

    return json.loads(block)

def validate_result_schema(d: dict):
    required = ["item_type","item_name","justice_type","weight","priority","reasoning","cited_chunks"]
    missing = [k for k in required if k not in d]
    if missing:
        raise ValueError(f"Missing keys: {missing}")

    if d["weight"] not in ALLOWED_WEIGHTS:
        raise ValueError(f"Invalid weight: {d['weight']}")

    if d["priority"] not in ALLOWED_PRIORITIES:
        raise ValueError(f"Invalid priority: {d['priority']}")

    if not isinstance(d["cited_chunks"], list):
        raise ValueError("cited_chunks must be a list")


# 4) One-item runner

def run_one_item(item_type: str, item_name: str, justice_type: str, tokenizer, model):
    # Retrieval query (simple default)
    if item_type == "metric":
        q = f"{item_name}. Definition: {metric_defs[item_name]}"
    else:
        q = f"{item_name}. Definition: {indicator_defs[item_name]}"

    hits = retrieve_chunks(q, k=RETRIEVAL_K)
    retrieved_chunk_ids = [h["chunk_id"] for h in hits]

    decoding = {"do_sample": False, "max_new_tokens": 2000}
    run_id, payload = fingerprint(item_type, item_name, justice_type, retrieved_chunk_ids, decoding)

    # Skip if cached
    if run_id in cached_keys:
        return {"_cached": True, "run_id": run_id, "item_type": item_type, "item_name": item_name, "justice_type": justice_type}

    prompt = build_prompt(
        item_type=item_type,
        item_name=item_name,
        justice_type=justice_type,
        retrieved_chunks=hits,
    )

    raw = run_llm(prompt, tokenizer, model, max_new_tokens=2000)

    # Parse + validate
    out = extract_first_json_object(raw)
    validate_result_schema(out)

    # Attach minimal provenance (not changing your JSON style; this is outside it)
    record = {
        **out,
        "_cached": False,
        "run_id": run_id,
        "model": MODEL_NAME,
        "prompt_version": PROMPT_VERSION,
        "retrieval_k": RETRIEVAL_K,
        "retrieved_chunk_ids": retrieved_chunk_ids,
    }

    append_cache_record(run_id, payload, record)
    cached_keys.add(run_id)
    return record


# 5) Justice runner: all metrics + indicators -> DataFrame

def run_one_justice(justice_type: str, tokenizer, model):
    records = []

    # metrics
    for m in metric_defs.keys():
        rec = run_one_item("metric", m, justice_type, tokenizer, model)
        records.append(rec)

    # indicators
    for ind in indicator_defs.keys():
        rec = run_one_item("indicator", ind, justice_type, tokenizer, model)
        records.append(rec)

    df = pd.DataFrame(records)

    # helpful flattened columns for analysis
    if "cited_chunks" in df.columns:
        df["cited_chunk_ids"] = df["cited_chunks"].apply(
            lambda xs: [x.get("chunk_id") for x in xs] if isinstance(xs, list) else []
        )

    return df, records

print("Runner loaded.")


Cache: /content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data/work/runs_cache.jsonl cached runs: 20
Runner loaded.


In [27]:
#Run this to confirm the pipeline works end-to-end before you burn GPU time:
tokenizer, model = load_model(MODEL_KEY)

# 1 metric
rec_m = run_one_item("metric", "Community Impact", "Egalitarianism", tokenizer, model)
print("Metric result:", rec_m)




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Metric result: {'_cached': True, 'run_id': 'c233fa8889e0db8aafe5c3effcb2f3753b79669d', 'item_type': 'metric', 'item_name': 'Community Impact', 'justice_type': 'Egalitarianism'}


In [28]:
# 1 indicator (pick one that exists in your Excel)
#some_indicator = list(indicator_defs.keys())[0]
rec_i = run_one_item("indicator", "Location Quotient", "Egalitarianism", tokenizer, model)
print("Indicator result:", rec_i)

Indicator result: {'_cached': True, 'run_id': '7fde0e74b5f9ce7584f252b6194948bb42c140df', 'item_type': 'indicator', 'item_name': 'Location Quotient', 'justice_type': 'Egalitarianism'}


In [30]:
def run_one_justice_to_df(justice_type: str, tokenizer, model):
    """
    Runs ALL metrics and ALL indicators for ONE justice type.
    Returns a pandas DataFrame.
    """

    records = []

    # Metrics
    for metric_name in metric_defs.keys():
        rec = run_one_item(
            item_type="metric",
            item_name=metric_name,
            justice_type=justice_type,
            tokenizer=tokenizer,
            model=model
        )
        records.append(rec)

    # Indicators
    for indicator_name in indicator_defs.keys():
        rec = run_one_item(
            item_type="indicator",
            item_name=indicator_name,
            justice_type=justice_type,
            tokenizer=tokenizer,
            model=model
        )
        records.append(rec)

    df = pd.DataFrame(records)

    # Optional but useful: flatten cited chunk ids
    if "cited_chunks" in df.columns:
        df["cited_chunk_ids"] = df["cited_chunks"].apply(
            lambda xs: [x["chunk_id"] for x in xs] if isinstance(xs, list) else []
        )

    return df

In [31]:
JUSTICE = "Egalitarianism"  # or "Utilitarianism", "Sufficientarianism"

tokenizer, model = load_model(MODEL_KEY)

df_egal = run_one_justice_to_df(JUSTICE, tokenizer, model)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [32]:
df_egal

Unnamed: 0,_cached,run_id,item_type,item_name,justice_type,weight,priority,reasoning,cited_chunks,model,prompt_version,retrieval_k,retrieved_chunk_ids,cited_chunk_ids
0,True,cf07cc1d0b6b685ba153375b944495392164072e,metric,Economic Impact,Egalitarianism,,,,,,,,,[]
1,True,2e6631c7f5d5ab93ac7ce33b7537c4d9198c26a2,metric,Environmental Impacts,Egalitarianism,,,,,,,,,[]
2,True,c233fa8889e0db8aafe5c3effcb2f3753b79669d,metric,Community Impact,Egalitarianism,,,,,,,,,[]
3,True,a76735d7f69eb119a0cdf50ad4a2e8081ca38586,metric,Affordability,Egalitarianism,,,,,,,,,[]
4,True,25fd2f854f874346af011b914b817f638c0a52cb,metric,Powerplant Characteristics,Egalitarianism,,,,,,,,,[]
5,True,63fa62c9514714e9ebe4ba2955d4ed0f74723e07,metric,Living Standard,Egalitarianism,,,,,,,,,[]
6,True,f77ebc616ba785d89eb55a18d40dc5fdc866be19,indicator,Total Employment By Powerplant,Egalitarianism,,,,,,,,,[]
7,True,7fde0e74b5f9ce7584f252b6194948bb42c140df,indicator,Location Quotient,Egalitarianism,,,,,,,,,[]
8,True,236f65edc58052f1c73085bd95bdfc97c01043df,indicator,Relative Compensation,Egalitarianism,,,,,,,,,[]
9,True,c7444fb021a9f6e0d1c92de0a01cfb5df3ded4ce,indicator,SO2 Emissions,Egalitarianism,,,,,,,,,[]


In [34]:
def load_cached_results_to_df(justice_type: str):
    records = []

    if not os.path.exists(CACHE_PATH):
        raise FileNotFoundError(f"Cache file not found: {CACHE_PATH}")

    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            result = rec["result"]

            if result.get("justice_type") == justice_type:
                records.append(result)

    df = pd.DataFrame(records)

    # Optional helper column
    if "cited_chunks" in df.columns:
        df["cited_chunk_ids"] = df["cited_chunks"].apply(
            lambda xs: [x["chunk_id"] for x in xs] if isinstance(xs, list) else []
        )

    return df


In [35]:
DATA_ROOT = "/content/drive/MyDrive/Dissertation/12-21_inidcator_rag_data"
CACHE_PATH = os.path.join(WORK_DIR, "runs_cache.jsonl")
JUSTICE = "Egalitarianism"
df_egal_cache = load_cached_results_to_df(JUSTICE)
df_egal_cache.head()


Unnamed: 0,item_type,item_name,justice_type,weight,priority,reasoning,cited_chunks,_cached,run_id,model,prompt_version,retrieval_k,retrieved_chunk_ids,cited_chunk_ids
0,metric,Community Impact,Egalitarianism,Medium,2,The evidence does not provide explicit support...,[],False,c233fa8889e0db8aafe5c3effcb2f3753b79669d,mistralai/Mistral-7B-Instruct-v0.3,v1,5,"[8c0b38a689c42582, f06bec54673dd65a, 2f6343933...",[]
1,indicator,Location Quotient,Egalitarianism,Medium,2,The Location Quotient measures the importance ...,[],False,7fde0e74b5f9ce7584f252b6194948bb42c140df,mistralai/Mistral-7B-Instruct-v0.3,v1,5,"[0ba5562f440f2a36, 012223331ed86af9, c35eda0cb...",[]
2,metric,Economic Impact,Egalitarianism,High,1,The evidence shows that the economic impact of...,"[{'chunk_id': 'd62904194c2074bf', 'chunk_text'...",False,cf07cc1d0b6b685ba153375b944495392164072e,mistralai/Mistral-7B-Instruct-v0.3,v1,5,"[d62904194c2074bf, e46a98c1303c112e, b7d75c7aa...","[d62904194c2074bf, e46a98c1303c112e, b7d75c7aa..."
3,metric,Environmental Impacts,Egalitarianism,High,1,The evidence shows that emissions from natural...,"[{'chunk_id': '8a4072701ad93b6e', 'chunk_text'...",False,2e6631c7f5d5ab93ac7ce33b7537c4d9198c26a2,mistralai/Mistral-7B-Instruct-v0.3,v1,5,"[8a4072701ad93b6e, fbd5836ee756deb8, 0b4d81605...","[8a4072701ad93b6e, fbd5836ee756deb8]"
4,metric,Affordability,Egalitarianism,High,1,The evidence shows that energy burdens vary ac...,"[{'chunk_id': '81b4f30e8a472e3f', 'chunk_text'...",False,a76735d7f69eb119a0cdf50ad4a2e8081ca38586,mistralai/Mistral-7B-Instruct-v0.3,v1,5,"[81b4f30e8a472e3f, 71e96895f15b2fe6, 077c189e8...","[81b4f30e8a472e3f, 81b4f30e8a472e3f]"


In [36]:


# C) Combine
df_all = pd.concat([df_egal, df_egal_cache], ignore_index=True)

# D) Drop duplicates by run_id, keeping the most complete row
# (sort so rows with weight present come last, then keep last)
df_all["_has_weight"] = df_all["weight"].notna() if "weight" in df_all.columns else False
df_all = df_all.sort_values("_has_weight").drop_duplicates(subset=["run_id"], keep="last")
df_all = df_all.drop(columns=["_has_weight"])

# Optional: tidy index
df_all = df_all.reset_index(drop=True)


In [37]:
df_all

Unnamed: 0,_cached,run_id,item_type,item_name,justice_type,weight,priority,reasoning,cited_chunks,model,prompt_version,retrieval_k,retrieved_chunk_ids,cited_chunk_ids
0,False,cdc009ee3866448e35f68d031be5b3d66ce86e77,indicator,CO2 Emissions,Egalitarianism,High,2.0,The provided evidence discusses carbon emissio...,[],mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[049ce194fa549da4, 801d81ff785fed5f, 1fda32269...",[]
1,False,674f2df396ece58e0c06690e0a2aa8322d799fa3,indicator,Health care and social assistance,Egalitarianism,High,1.0,The evidence shows that the Community Health a...,"[{'chunk_id': 'c7d29c01a7e0a856', 'chunk_text'...",mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[c7d29c01a7e0a856, 4f65d12c90605fbc, a14832aea...","[c7d29c01a7e0a856, 4f65d12c90605fbc]"
2,False,8e9fec2b46729b896713240caa2a01469225701f,indicator,"Religious, GrantMaking, Civic, Professional, a...",Egalitarianism,Medium,2.0,The evidence does not provide explicit textual...,[],mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[c7d29c01a7e0a856, ba78793981dbd98b, 57cdd2787...",[]
3,False,7b459c5858addf723f1934b94feabc5d2a16e29b,indicator,Energy Burden,Egalitarianism,High,2.0,"The energy burden indicator, defined as the pe...",[],mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[42427f576b66bd76, 81b4f30e8a472e3f, 739dcbd94...",[]
4,False,05c79d78df9edfab9868c73910822be308a78102,indicator,Heat Rate,Egalitarianism,High,1.0,The heat rate is a measure of the efficiency o...,"[{'chunk_id': '0414984cf826d5de', 'chunk_text'...",mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[0414984cf826d5de, 56b33383c2609c48, b4f4d7e20...","[0414984cf826d5de, 56b33383c2609c48, b4f4d7e20..."
5,False,be0ef9685ce2fc15204b0d65e9d401423f19d3ab,indicator,Levelised Cost of Energy (LCOE),Egalitarianism,Medium,2.0,The Levelised Cost of Energy (LCOE) is a stand...,[],mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[9b563c706e647ef0, cfef8abc9f7e423a, 51d5ff9d3...",[]
6,False,10a0b15391fface23c9dde1ec8dc165e5c91f806,indicator,Capacity,Egalitarianism,High,1.0,"The evidence shows that capacity factor, which...","[{'chunk_id': '09304cecb2bb207a', 'chunk_text'...",mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[09304cecb2bb207a, c7bcd3a0274dd678, 244971bce...","[09304cecb2bb207a, c7bcd3a0274dd678, 244971bce..."
7,False,878e35d4786b88b55c0eafbf250724ca65ea9444,indicator,Age of Plant,Egalitarianism,High,1.0,The evidence shows that the age of the plant p...,"[{'chunk_id': '23fc721e65726fbc', 'chunk_text'...",mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[23fc721e65726fbc, 42f5bc02b98c455e, 225207715...","[23fc721e65726fbc, 2252077159bb05d0]"
8,False,667bc242a6bfb0fbcbfcaa1b7ce9914ea100d8ea,indicator,NOx Emission,Egalitarianism,High,2.0,The NOx Emission indicator is highly relevant ...,[],mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[326cec27d38e7bff, cdf7c7d1f519ec5d, 5536d1c67...",[]
9,False,0e0fa06c6b347f4d39a98c573802d3eb225fd22a,indicator,Average Annual Energy Cost,Egalitarianism,High,1.0,The evidence shows that the average annual ene...,"[{'chunk_id': '42427f576b66bd76', 'chunk_text'...",mistralai/Mistral-7B-Instruct-v0.3,v1,5.0,"[42427f576b66bd76, e5ef0030e06a683d, 81b4f30e8...","[42427f576b66bd76, 42427f576b66bd76]"
