In [None]:
# Mount Google Drive at /content/drive (Colab only); the later cells read and
# write their files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install openai tqdm nltk rapidfuzz

import os, json, re, random
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from openai import OpenAI
from rapidfuzz import fuzz
# ==== 2️⃣ Inspect your crawled JSONL file ====
import json
import itertools

# ==== 1. Setup ====
# The API key is a secret: take it from the environment, or prompt for it at
# run time, rather than embedding it in a notebook that gets saved and shared.
# NOTE(review): any key that was ever stored in this notebook should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
client = OpenAI()  # picks the key up from OPENAI_API_KEY

# Crawled knowledge base (one JSON object per line) and the output locations.
input_path = "/content/drive/MyDrive/pittsburgh_cmu_knowledge_base.jsonl"
output_verified = "/content/drive/MyDrive/synthetic_qa_verified.jsonl"
log_path = "qa_reject_log.jsonl"

# Check file size and preview first few lines
!ls -lh "$input_path"
print("\n🔍 Previewing first few lines:\n")

with open(input_path, 'r', encoding='utf8') as f:
    for line in itertools.islice(f, 5):
        print(line.strip()[:400], "\n---")  # print first 400 chars per entry

# ==== 3️⃣ Validate structure ====
# Every record must be a JSON object carrying at least these keys.
required_keys = {"url", "title", "content"}
n_ok, n_bad = 0, 0
bad_examples = []  # one small dict per problematic line (0-based line index)

with open(input_path, 'r', encoding='utf8') as f:
    for i, line in enumerate(f):
        if not line.strip():
            # A blank (e.g. trailing) line is not a record, so it is not an error either.
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            n_bad += 1
            bad_examples.append({"line": i, "error": "Invalid JSON"})
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, number) has no keys to check.
            n_bad += 1
            bad_examples.append({"line": i, "error": "Not a JSON object"})
        elif not required_keys.issubset(data.keys()):
            n_bad += 1
            bad_examples.append({"line": i, "missing": sorted(required_keys - data.keys())})
        else:
            n_ok += 1
# Open the file and read the first line (same explicit encoding as the other reads,
# so non-ASCII page text decodes identically regardless of the platform default)
with open(input_path, "r", encoding="utf8") as f:
    first_line = f.readline()

# Parse the line as JSON; an empty file or a corrupt first row must not abort the report
try:
    data = json.loads(first_line)
except json.JSONDecodeError:
    data = None

if isinstance(data, dict):
    # Print all keys
    print("Keys in this row:")
    print(list(data.keys()))

    # (Optional) inspect one key-value pair
    for k, v in data.items():
        print(f"{k}: {str(v)[:100]}...")  # preview first 100 chars
else:
    print("⚠️ First line is empty or not a JSON object; skipping key preview.")

# Summary of the structure validation counters
print(f"\n✅ Valid entries: {n_ok}")
print(f"⚠️  Problematic entries: {n_bad}")

if n_bad:
    print("\nSample issues:")
    for b in bad_examples[:5]:
        print(b)


-rw------- 1 root root 7.4M Oct 11 05:44 /content/drive/MyDrive/pittsburgh_cmu_knowledge_base.jsonl

🔍 Previewing first few lines:

{"url": "https://www.cmu.edu/about/", "title": "About Carnegie Mellon | Carnegie Mellon University", "content": "# About Carnegie Mellon\n\n## Don’t Envision the Future, Invent It Every Day\n\nAt Carnegie Mellon University, academic and research excellence isn't just what we strive for — it's the uncompromising standard. Our entrepreneurial spirit means no pursuit is too complex, and no question i 
---
{"url": "https://www.cmu.edu/about/traditions", "title": "Traditions | Carnegie Mellon University", "content": "# Traditions\n\n## Connected to the Past, Shaping the Future\n\nAt Carnegie Mellon University (CMU), traditions are more than just events — they're a celebration of our bold and collaborative community. Some commemorate the Scottish heritage of our founder, Andrew Carnegie, while others  
---
{"url": "https://www.cmu.edu/about/vision-mission-values

In [None]:
# -*- coding: utf-8 -*-
# =====================================================
# ✅ Pittsburgh / CMU QA Generator – Fixed Version
# =====================================================

import os, json, re, random
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from openai import OpenAI
from rapidfuzz import fuzz

# ==== 1️⃣ Setup ====
# The API key is a secret: take it from the environment, or prompt for it at
# run time, rather than embedding it in a notebook that gets saved and shared.
# NOTE(review): any key that was ever stored in this notebook should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
client = OpenAI()  # picks the key up from OPENAI_API_KEY

input_path = "/content/drive/MyDrive/pittsburgh_cmu_knowledge_base.jsonl"
output_verified = "/content/drive/MyDrive/synthetic_qa_verified.jsonl"
log_path = "/content/drive/MyDrive/qa_reject_log.jsonl"

# ✅ Make sure output files exist
for p in [output_verified, log_path]:
    # Append mode creates a missing file and never truncates an existing one.
    with open(p, "a", encoding="utf8"):
        pass

# ==== 2️⃣ Utilities ====
def clean_text(t):
    """Normalize crawled page text before chunking.

    Removes the standalone boilerplate words "cookie", "subscribe", "login",
    "privacy" and "footer" (case-insensitive), then collapses every run of
    whitespace to a single space and trims both ends.

    Args:
        t: Raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned, single-spaced string.
    """
    if not t:
        return ""
    # Word boundaries keep real words such as "cookies" or "subscribers" intact
    # instead of leaving stray fragments ("s", "rs") in the text.
    t = re.sub(r"(?i)\b(?:cookie|subscribe|login|privacy|footer)\b", "", t)
    # Collapse whitespace after the removal so deleted words leave no double spaces.
    return re.sub(r"\s+", " ", t).strip()

def chunk_text(text, max_len=250, tokenizer=None):
    """Split text into chunks of whole sentences, roughly ``max_len`` words each.

    Sentences are appended to the current chunk until adding the next one would
    exceed ``max_len`` words; the chunk is then emitted and the next chunk starts
    with the last two words of the previous one as overlap. A single sentence
    longer than ``max_len`` is kept whole rather than split.

    Args:
        text: The text to chunk.
        max_len: Word budget per chunk.
        tokenizer: Optional callable mapping a string to a list of sentences.
            Defaults to ``sent_tokenize``.

    Returns:
        A list of non-empty chunk strings (empty list for empty input).
    """
    split_sentences = tokenizer if tokenizer is not None else sent_tokenize
    chunks, cur = [], []  # cur holds the words of the chunk being built
    for sentence in split_sentences(text):
        words = sentence.split()
        # `cur and ...` prevents emitting an empty chunk when the very first
        # sentence is already longer than max_len.
        if cur and len(cur) + len(words) > max_len:
            chunks.append(" ".join(cur))
            # NOTE(review): the overlap is two *words*, not two sentences — confirm intent.
            cur = cur[-2:]
        cur.extend(words)
    if cur:
        chunks.append(" ".join(cur))
    return chunks

def parse_jsonl_block(raw_text):
    """Extract every JSON object embedded in model output.

    Scans for each ``{`` and lets the JSON decoder determine where the object
    ends, so braces inside string values and nested objects are handled.
    Surrounding prose or markdown fences are ignored, and fragments that do not
    decode are skipped.

    Args:
        raw_text: The raw model response; ``None`` or ``""`` yields ``[]``.

    Returns:
        A list of dicts in the order they appear in ``raw_text``.
    """
    results = []
    if not raw_text:
        return results

    decoder = json.JSONDecoder()
    pos = 0
    while True:
        start = raw_text.find("{", pos)
        if start == -1:
            break
        try:
            obj, end = decoder.raw_decode(raw_text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; resume the scan just after it.
            pos = start + 1
            continue
        if isinstance(obj, dict):
            results.append(obj)
        pos = end
    return results

# ==== 3️⃣ Load and preprocess documents ====
# sent_tokenize needs NLTK's Punkt data and raises LookupError without it; a
# blanket `except` around the loop below would hide that and load zero docs.
# Newer NLTK releases look for "punkt_tab", older ones for "punkt".
import nltk
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# Tunables for this stage.
# NOTE(review): the upper bound drops long pages even though chunk_text could
# split them — confirm that is intended.
MIN_DOC_CHARS, MAX_DOC_CHARS = 100, 4000
SAMPLE_SIZE = 500   # adjust for time/budget
SEED = 42           # fixed seed so the sampled subset is reproducible

docs = []
n_skipped = 0
with open(input_path, "r", encoding="utf8") as f:
    for line in f:
        if not line.strip():
            continue
        # Only malformed input is skipped; real errors (e.g. tokenizer failures) surface.
        try:
            d = json.loads(line)
        except json.JSONDecodeError:
            n_skipped += 1
            continue
        if not isinstance(d, dict) or not isinstance(d.get("content"), str):
            n_skipped += 1
            continue
        text = clean_text(d["content"])
        if MIN_DOC_CHARS < len(text) < MAX_DOC_CHARS:
            for c in chunk_text(text):
                docs.append({"url": d.get("url"), "title": d.get("title"), "text": c})

if n_skipped:
    print(f"⚠️ Skipped {n_skipped} malformed lines.")

# A private Random instance keeps the global `random` state untouched.
random.Random(SEED).shuffle(docs)
sampled = docs[:SAMPLE_SIZE]
print(f"📄 Loaded {len(docs)} docs. Sampling {len(sampled)} for generation.")

# ==== 4️⃣ Generate QA pairs ====
def generate_qa(text, model="gpt-5", temperature=0.5):
    """Ask the chat model for extractive QA pairs about one text chunk.

    Args:
        text: The passage the QA pairs must be answerable from.
        model: Chat-completions model name.
        temperature: Sampling temperature; pass ``None`` to omit the parameter
            from the request and use the API default.

    Returns:
        The raw response text (expected to hold one JSON object per line), or
        ``None`` if the request raised; the error is printed, not re-raised.
    """
    prompt = f"""
You are a meticulous human annotator creating high-quality, factual question–answer (QA) pairs
for a test dataset about Pittsburgh and Carnegie Mellon University.

Given the text below, extract 2–3 diverse, accurate QA pairs that can be verified *verbatim*
from the text itself.

Annotation requirements:
- Each question must be short, specific, factual, and directly answerable from the passage.
- Use factual forms: who / what / when / where / how.
- Each answer must be copied *exactly* from the text, not paraphrased.
- If a question has multiple valid answers, list all of them separated by a semicolon (;).
- Cover diverse fact types when possible (people, places, organizations, events, dates, etc.).
- If the text includes multiple time frames, entities, or organizations, make at least one question about each.
- Include at least one question about subtle or less obvious details if present (for coverage of edge cases).
- Prioritize questions relevant to Pittsburgh or Carnegie Mellon University when applicable.
- Avoid vague, opinion-based, or unanswerable questions.
- Do not reference “the text above” or “the passage.”
- Keep answers under 20 words when possible.
- Do NOT guess, summarize, or infer; only extract facts explicitly stated in the text.
- If a fact is not explicitly stated in the text, skip it — do NOT assume or invent.
- If fewer than two valid QA pairs can be found, output only those that meet all requirements.
- Maintain consistent style for quality.
- Output in JSONL format, one line per QA pair:
  {{"question": "...", "answer": "..."}}

Text:
{text}
"""
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }
    # NOTE(review): some model families may only accept the default temperature;
    # pass temperature=None to leave it out — confirm against the API reference
    # for the chosen model.
    if temperature is not None:
        request["temperature"] = temperature
    try:
        res = client.chat.completions.create(**request)
        return res.choices[0].message.content
    except Exception as e:
        # Best effort: one failed request must not abort the whole generation run.
        print("Error during generation:", e)
        return None

# ==== 5️⃣ Verification helper ====
def is_answer_in_text(answer, text):
    """Return True if ``answer`` appears in ``text`` (exact or fuzzy).

    The comparison is case-insensitive. An exact substring match is tried first;
    otherwise the answer is accepted when ``fuzz.partial_ratio`` exceeds 85.

    Args:
        answer: Candidate answer. Non-string values (e.g. a JSON number or list
            emitted by the model) and blank strings are rejected.
        text: Source passage the answer must come from.

    Returns:
        bool: whether the answer is considered supported by the text.
    """
    if not isinstance(answer, str) or not isinstance(text, str):
        return False
    # Strip first so a whitespace-only answer cannot "match" any space in the text.
    answer_low, text_low = answer.strip().lower(), text.lower()
    if not answer_low or not text_low:
        return False
    if answer_low in text_low:
        return True
    return fuzz.partial_ratio(answer_low, text_low) > 85

# ==== 6️⃣ Main loop ====
verified_count, total_generated = 0, 0


def _append_line(path, line):
    """Append `line` plus a newline to the UTF-8 text file at `path`."""
    with open(path, "a", encoding="utf8") as sink:
        sink.write(line + "\n")


if sampled:
    for doc in tqdm(sampled, desc="Generating QA pairs"):
        raw_output = generate_qa(doc["text"])
        if not raw_output:
            # Generation failed or returned nothing for this chunk.
            continue

        qa_pairs = parse_jsonl_block(raw_output)
        for pair in qa_pairs:
            total_generated += 1
            supported = is_answer_in_text(pair.get("answer", ""), doc["text"])
            if not supported:
                # Keep the rejected pair with a short excerpt for later inspection.
                rejection = {"rejected": pair, "source_excerpt": doc["text"][:400]}
                _append_line(log_path, json.dumps(rejection))
                continue

            # Record where the pair came from, then persist it immediately.
            pair["source_url"] = doc["url"]
            pair["source_title"] = doc["title"]
            _append_line(output_verified, json.dumps(pair, ensure_ascii=False))
            verified_count += 1
else:
    print("⚠️ No documents loaded. Check your input path or filtering conditions.")

print(f"✅ Verified {verified_count} of {total_generated} QA pairs.")

# ==== 7️⃣ Post-processing cleanup ====
# Length-filter the verified pairs and write them out as a single JSON array.
if not os.path.exists(output_verified) or os.stat(output_verified).st_size == 0:
    print("⚠️ No verified file found — possibly no valid QA pairs generated.")
else:
    # Context managers guarantee both files are closed (and the output flushed).
    with open(output_verified, "r", encoding="utf8") as f:
        qa_pairs = [json.loads(l) for l in f if l.strip()]

    filtered = [
        qa for qa in qa_pairs
        # Records lacking a string question/answer are dropped instead of raising.
        if isinstance(qa.get("question"), str) and isinstance(qa.get("answer"), str)
        and 5 < len(qa["question"]) < 120 and 1 < len(qa["answer"]) < 60
    ]
    with open("/content/drive/MyDrive/filtered_qa.json", "w", encoding="utf8") as f:
        json.dump(filtered, f, indent=2)
    print(f"✨ Final cleaned set: {len(filtered)} verified & length-filtered QA pairs.")


📄 Loaded 0 docs. Sampling 0 for generation.
⚠️ No documents loaded. Check your input path or filtering conditions.
✅ Verified 0 of 0 QA pairs.
⚠️ No verified file found — possibly no valid QA pairs generated.


In [None]:
import json

# Load your verified + filtered data
with open("/content/drive/MyDrive/filtered_qa.json", encoding="utf-8") as f:
    qa_pairs = json.load(f)

# Create numbered mapping: line i of questions.txt pairs with key str(i) of the answers
questions = []
ref_answers = {}
for i, qa in enumerate(qa_pairs, start=1):
    # Collapse embedded newlines/whitespace so every question occupies exactly
    # one line; otherwise the line numbers would drift from the answer keys.
    questions.append(" ".join(qa["question"].split()))
    ref_answers[str(i)] = qa["answer"]

# Save to files (explicit UTF-8: questions may contain non-ASCII characters)
with open("/content/drive/MyDrive/questions.txt", "w", encoding="utf-8") as fq:
    fq.write("\n".join(questions))

with open("/content/drive/MyDrive/reference_answers.json", "w", encoding="utf-8") as fa:
    json.dump(ref_answers, fa, indent=2)

print(f"✅ Saved {len(questions)} training questions and answers in assignment format.")
