In [1]:
import json
import uuid

# Load both JSON files.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open("engage_bellingham_guestbook_comments2.json", "r", encoding="utf-8") as f:
    comments = json.load(f)

with open("engage_bellingham_questions2.json", "r", encoding="utf-8") as f:
    questions = json.load(f)

# Normalize to embedding format
def normalize(entry, entry_type):
    """Convert one raw record into the flat shape used for embedding.

    Args:
        entry: Mapping for a single comment or question. It must contain a
            "content" key; "author", "timestamp", "project_title" and
            "source_url" are optional and come back as None when missing.
        entry_type: Label stored in the metadata and used as the id prefix
            (the notebook passes "comment" or "question").

    Returns:
        A dict with keys "id", "text" and "metadata". The id is
        "<entry_type>-" followed by the first 8 hex characters of a fresh
        random UUID, so it differs on every call.

    Raises:
        KeyError: If ``entry`` has no "content" key.
    """
    optional_fields = ("author", "timestamp", "project_title", "source_url")
    short_token = uuid.uuid4().hex[:8]

    metadata = {"type": entry_type}
    for field in optional_fields:
        metadata[field] = entry.get(field)

    return {
        "id": f"{entry_type}-{short_token}",
        "text": entry["content"],
        "metadata": metadata,
    }

# Normalise every comment first, then every question, into one flat list
combined = [
    normalize(entry, entry_type)
    for entry_type, entries in (("comment", comments), ("question", questions))
    for entry in entries
]

# Persist the merged records
with open("engage_bellingham_combined.json", "w") as out_file:
    json.dump(combined, out_file, indent=2)

print(f"✅ Merged {len(combined)} records into engage_bellingham_combined.json")

✅ Merged 465 records into engage_bellingham_combined.json


In [None]:
import os
import json
import numpy as np
from openai import OpenAI
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from dotenv import load_dotenv
load_dotenv()


# Initialize OpenAI client.
# No arguments are passed, so the client has to find its configuration on its
# own — presumably from the environment populated by load_dotenv(); verify
# against the local .env setup.

client = OpenAI()

# Load the merged comment/question records. Later code in this cell reads
# item["text"], item["id"] and item["metadata"] from every record.
with open("engage_bellingham_combined.json", "r") as f:
    data = json.load(f)

# Step 1: Embed all texts
def embed_texts(texts, model="text-embedding-3-small", batch_size=100):
    """Embed ``texts`` in consecutive batches and return the vectors as a list.

    Args:
        texts: Sequence of strings to embed.
        model: Embedding model name passed to ``client.embeddings.create``.
        batch_size: Number of texts sent per request.

    Returns:
        A list holding the ``embedding`` of every item returned by the API,
        batch after batch. Within a batch the items are sorted by their
        ``index`` attribute so the output follows the input order.
    """
    vectors = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Embedding"):
        chunk = texts[start:start + batch_size]
        reply = client.embeddings.create(input=chunk, model=model)
        # Restore input order before collecting the vectors
        for record in sorted(reply.data, key=lambda rec: rec.index):
            vectors.append(record.embedding)
    return vectors

texts = [record["text"] for record in data]
embeddings = embed_texts(texts)

# Step 2: Pairwise cosine similarity between all embeddings
embedding_matrix = np.array(embeddings)
similarity = cosine_similarity(embedding_matrix)

# Step 3: Reverse-nearest-neighbour (R-NN) scores.
# Every item "votes" for its 10 most similar other items: 10 points for the
# closest neighbour down to 1 point for the 10th. An item's score is the sum
# of the points it receives from everyone else.
reverse_scores = dict.fromkeys(range(len(data)), 0)
appearances = {position: [] for position in range(len(data))}

for voter in range(len(data)):
    by_similarity = np.argsort(similarity[voter])[::-1]  # most similar first
    neighbours = [idx for idx in by_similarity if idx != voter][:10]  # drop self

    for place, idx in enumerate(neighbours, start=1):
        reverse_scores[idx] += 11 - place  # place 1 -> 10 points, place 10 -> 1
        appearances[idx].append((voter, place))  # (who voted, 1-based rank)

# Step 4: Order items from highest to lowest score
ranked = [
    {
        "id": record["id"],
        "score": reverse_scores[position],
        "text": record["text"],
        "appearances": appearances[position],
        "meta": record["metadata"],
    }
    for position, record in enumerate(data)
]
ranked.sort(key=lambda row: row["score"], reverse=True)

# Step 5: Save ranked results
with open("engage_bellingham_rnn_ranked.json", "w") as out_file:
    json.dump(ranked, out_file, indent=2)

print(f"✅ Finished! Top item: {ranked[0]['id']} with score {ranked[0]['score']}")


Embedding: 100%|██████████| 5/5 [00:09<00:00,  1.98s/it]

✅ Finished! Top item: comment-431f8233 with score 383





In [4]:
import os
import json
import numpy as np
from openai import OpenAI
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
load_dotenv()

# Initialize OpenAI client (constructed without arguments; credentials are
# presumably supplied through the environment loaded by load_dotenv() —
# TODO confirm).
client = OpenAI()

# Load your combined comments + questions JSON.
# Each record is later accessed through its "text", "id" and "metadata" keys.
with open("engage_bellingham_combined.json", "r") as f:
    data = json.load(f)

# Step 1: Embed all texts
def embed_texts(texts, model="text-embedding-3-small", batch_size=100):
    """Request embeddings for ``texts`` batch by batch.

    Args:
        texts: Sequence of strings to embed.
        model: Embedding model name forwarded to the API call.
        batch_size: How many texts go into a single request.

    Returns:
        A list of embeddings collected across all batches; each batch's
        results are ordered by their ``index`` attribute before being added.
    """
    collected = []
    offsets = range(0, len(texts), batch_size)
    for offset in tqdm(offsets, desc="Embedding"):
        result = client.embeddings.create(
            input=texts[offset:offset + batch_size],
            model=model,
        )
        in_order = sorted(result.data, key=lambda row: row.index)
        collected += [row.embedding for row in in_order]
    return collected

texts = [record["text"] for record in data]
embeddings = embed_texts(texts)

# Step 2: Pairwise cosine similarity between all embeddings
embedding_matrix = np.array(embeddings)
similarity = cosine_similarity(embedding_matrix)

# Step 3: R-NN scores over the *full* ranking.
# Each item ranks every other item by similarity; the item placed at 1-based
# position p receives N - p points (N - 1 for the closest, 1 for the farthest).
N = len(data)
reverse_scores = dict.fromkeys(range(N), 0.0)
appearances = {position: [] for position in range(N)}

for voter in range(N):
    by_similarity = np.argsort(similarity[voter])[::-1]  # most similar first
    others = (idx for idx in by_similarity if idx != voter)  # exclude self

    for place, idx in enumerate(others, start=1):
        reverse_scores[idx] += N - place
        appearances[idx].append((voter, place))  # (who ranked it, 1-based rank)

# Step 4: Order items from highest to lowest score
ranked = [
    {
        "id": record["id"],
        "score": int(reverse_scores[position]),
        "text": record["text"],
        "appearances": appearances[position],
        "meta": record["metadata"],
    }
    for position, record in enumerate(data)
]
ranked.sort(key=lambda row: row["score"], reverse=True)

# Step 5: Save ranked results
with open("engage_bellingham_rnn_ranked.json", "w") as out_file:
    json.dump(ranked, out_file, indent=2)

print(f"✅ Finished! Top item: {ranked[0]['id']} with score {ranked[0]['score']}")


Embedding: 100%|██████████| 5/5 [00:07<00:00,  1.48s/it]


✅ Finished! Top item: comment-a3266620 with score 192343


In [6]:
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a .env file, then build the OpenAI client with no
# explicit arguments (credentials presumably come from that environment —
# TODO confirm).
load_dotenv()
client = OpenAI()

# File names: INPUT_FILE is read just below; OUTPUT_TREE receives the JSON
# tree and OUTPUT_FINAL the markdown summary at the end of this cell.
INPUT_FILE = "engage_bellingham_rnn_ranked.json"
OUTPUT_TREE = "engage_bellingham_narrative_tree.json"
OUTPUT_FINAL = "engage_bellingham_final_narrative.md"

# MODEL is the chat model used by summarize_texts(); CHUNK_SIZE is how many
# nodes build_tree() groups into a single summarization call.
MODEL = "gpt-4.1-mini"
CHUNK_SIZE = 20

# Load ranked comments (each item is read below via its "text", "id", "meta"
# and "score" keys)
with open(INPUT_FILE, "r") as f:
    ranked = json.load(f)

# Adaptive prompt for GPT summarization
def build_prompt(texts):
    """Assemble the summarization prompt for a group of comments.

    Args:
        texts: Sequence of comment (or lower-level summary) strings.

    Returns:
        The prompt as a single string, stripped of leading and trailing
        whitespace. It asks for at most 5 sentences when there are 10 or more
        texts and 3 otherwise, with an 80-word cap, followed by the texts as
        a "- " bulleted list.
    """
    comment_count = len(texts)
    max_sentences = 3 if comment_count < 10 else 5
    max_words = 80
    bullet_list = "\n".join(f"- {comment}" for comment in texts)

    prompt_lines = [
        "You are summarizing community comments for city council.",
        "",
        f"Below are {comment_count} public comments. Write a fair and concise summary of what these people are collectively saying.",
        "",
        "- Be neutral and capture shared hopes, concerns, and any disagreements.",
        f"- Limit your answer to {max_sentences} short sentences (max {max_words} words).",
        "- If the comments are short or similar, a shorter summary is fine.",
        "- Return only the summary paragraph — no titles, no bullet points.",
        "",
        "Comments:",
        bullet_list,
    ]
    return "\n".join(prompt_lines).strip()

# Call OpenAI for summarization
def summarize_texts(texts):
    """Summarize a group of texts with a single chat-completion request.

    Args:
        texts: Sequence of strings; they are turned into a prompt by
            ``build_prompt``.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    user_message = {"role": "user", "content": build_prompt(texts)}
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[user_message],
        temperature=0.3,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Recursively summarize and build tree
def build_tree(nodes, level=0):
    """Collapse ``nodes`` into a single root node by repeated summarization.

    Nodes are grouped ``CHUNK_SIZE`` at a time; each group is summarized with
    ``summarize_texts`` and becomes one parent node. The parents are then
    processed the same way until only one node is left.

    Args:
        nodes: Non-empty list of dicts. A leaf carries "text" (and usually
            "id" / "meta"); a summary node carries "summary", "source_ids",
            "level" and "children".
        level: Level recorded on the nodes created by this pass.

    Returns:
        The root node. If ``nodes`` holds a single summary node it is returned
        unchanged; a single leaf is wrapped as a dict with "text",
        "source_ids", "meta" and "level".

    Raises:
        ValueError: If ``nodes`` is empty, or if several nodes must be merged
            while ``CHUNK_SIZE`` is below 2. Previously both situations
            recursed without ever shrinking the node list.
    """
    if len(nodes) == 0:
        raise ValueError("build_tree() needs at least one node to summarize")

    if len(nodes) == 1:
        node = nodes[0]
        if "summary" in node:
            return node  # already a summary node
        return {
            "text": node["text"],
            "source_ids": [node["id"]],
            "meta": node.get("meta", {}),
            "level": level
        }

    # A group size below 2 can never reduce the number of nodes, so stop
    # before spending API calls on a loop that cannot terminate.
    if CHUNK_SIZE < 2:
        raise ValueError("CHUNK_SIZE must be at least 2 to merge nodes")

    next_level_nodes = []
    for start in range(0, len(nodes), CHUNK_SIZE):
        group = nodes[start:start + CHUNK_SIZE]
        # Leaves expose "text"; nodes from an earlier pass expose "summary"
        texts = [n["text"] if "text" in n else n["summary"] for n in group]

        # Carry the ids of all original records covered by this group upward
        source_ids = []
        for n in group:
            if "source_ids" in n:
                source_ids.extend(n["source_ids"])
            elif "id" in n:
                source_ids.append(n["id"])

        summary = summarize_texts(texts)

        next_level_nodes.append({
            "summary": summary,
            "source_ids": source_ids,
            "level": level,
            "children": group
        })

    return build_tree(next_level_nodes, level + 1)

# Initialize leaves from RNN-ranked comments
# Leaves for the tree: one per ranked item, in the order stored in `ranked`
leaves = [
    {
        "text": item["text"],
        "id": item["id"],
        "meta": item["meta"],
        "score": item["score"]
    }
    for item in ranked
]

# Build full tree
print("🔁 Building recursive narrative tree...")
tree = build_tree(leaves)

# Save full tree
with open(OUTPUT_TREE, "w") as f:
    json.dump(tree, f, indent=2)

# Save final summary.
# The root is a summary node unless there was only a single leaf, in which
# case its own text is used.
final_summary = tree.get("summary", tree.get("text", ""))
# The summary is model-generated prose written verbatim, so it may contain
# non-ASCII characters; UTF-8 is set explicitly so the write does not depend
# on (or fail under) the platform's default encoding.
with open(OUTPUT_FINAL, "w", encoding="utf-8") as f:
    f.write("# Final Narrative Summary\n\n")
    f.write(final_summary)
    f.write(f"\n\n---\n_This summary represents the collective voice of {len(ranked)} community members._\n")

print("✅ Done!")
print(f"📝 Final summary saved to: {OUTPUT_FINAL}")
print(f"🌳 Full tree saved to: {OUTPUT_TREE}")


🔁 Building recursive narrative tree...
✅ Done!
📝 Final summary saved to: engage_bellingham_final_narrative.md
🌳 Full tree saved to: engage_bellingham_narrative_tree.json


In [None]:
# 📦 Required: !pip install openai tqdm scikit-learn python-dotenv
import os
import json
import numpy as np
from openai import OpenAI
from tqdm import tqdm
from sklearn.cluster import DBSCAN
from dotenv import load_dotenv

# --- Load environment & OpenAI client ---
# The client is built without arguments; credentials are presumably read from
# the environment that load_dotenv() populates — TODO confirm.
load_dotenv()
client = OpenAI()

# --- Load comment/question data ---
with open("engage_bellingham_combined.json", "r") as f:
    data = json.load(f)

# Plain list of the record texts. NOTE(review): the extraction loop in this
# cell iterates over `data` directly, so `texts` is not used here.
texts = [item["text"] for item in data]

# --- Step 1: Extract atomic suggestions using an OpenAI chat model (default: gpt-4.1-mini) with structured output ---
def extract_suggestions_structured(text, model="gpt-4.1-mini"):
    """Ask the chat model for the suggestions contained in one comment.

    The request uses a strict JSON-schema response format describing an object
    with a single "suggestions" array of strings.

    Args:
        text: The citizen comment or question to analyse.
        model: Chat model name passed to the API.

    Returns:
        The value of the "suggestions" key of the parsed JSON reply.
    """
    instruction = (
        "Extract distinct, actionable suggestions from the following citizen comment or question. "
        "Return as a JSON array of strings.\n\n"
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that extracts clear, concise suggestions."},
        {"role": "user", "content": f"{instruction}{text}"},
    ]

    # Shape the reply must follow: {"suggestions": ["...", ...]} and nothing else
    output_schema = {
        "type": "object",
        "properties": {
            "suggestions": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["suggestions"],
        "additionalProperties": False,
    }
    structured_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "suggestion_extraction",
            "schema": output_schema,
            "strict": True,
        },
    }

    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        response_format=structured_format,
        temperature=0.2,
    )
    payload = json.loads(completion.choices[0].message.content)
    return payload["suggestions"]

# --- Step 2: Collect all extracted suggestions ---
# One model call per record; the per-record results are flattened into one list.
all_suggestions = []
for record in tqdm(data, desc="Extracting Suggestions"):
    all_suggestions += extract_suggestions_structured(record["text"])



Extracting Suggestions: 100%|██████████| 465/465 [11:55<00:00,  1.54s/it]
Embedding Suggestions: 100%|██████████| 18/18 [00:18<00:00,  1.00s/it]


✅ DBSCAN complete. 41 suggestion clusters saved to 'engage_bellingham_suggestion_clusters.json'.
🟨 Noise suggestions (unclustered): 1645


In [None]:
# --- Step 3: Embed suggestions ---
def embed_texts(texts, model="text-embedding-3-small", batch_size=100):
    """Embed ``texts`` in batches and return the result as a NumPy array.

    Args:
        texts: Sequence of strings to embed.
        model: Embedding model name forwarded to the API call.
        batch_size: Number of texts sent per request.

    Returns:
        ``np.array`` built from the collected embeddings; each batch's results
        are ordered by their ``index`` attribute before being added.
    """
    gathered = []
    for first in tqdm(range(0, len(texts), batch_size), desc="Embedding Suggestions"):
        answer = client.embeddings.create(
            input=texts[first:first + batch_size],
            model=model,
        )
        # Put the batch back into request order before collecting
        for entry in sorted(answer.data, key=lambda item: item.index):
            gathered.append(entry.embedding)
    return np.array(gathered)

# Embed every extracted suggestion. embed_texts as defined in this cell
# returns a NumPy array, so embedding_matrix is an array rather than a list.
embedding_matrix = embed_texts(all_suggestions)



In [44]:
# --- Step 4: Run DBSCAN on suggestion embeddings ---
dbscan = DBSCAN(eps=0.27, min_samples=2, metric='cosine')
labels = dbscan.fit_predict(embedding_matrix)

# --- Step 5: Organize results into clusters ---
# Keys are the cluster labels as strings; label -1 marks noise and is left out.
clusters = {}
for position, label in enumerate(labels):
    if label != -1:
        clusters.setdefault(str(label), []).append(
            {"suggestion": all_suggestions[position]}
        )

# --- Step 6: Save output ---
with open("engage_bellingham_suggestion_clusters.json", "w") as out_file:
    json.dump(clusters, out_file, indent=2)

noise_count = sum(1 for label in labels if label == -1)
print(f"✅ DBSCAN complete. {len(clusters)} suggestion clusters saved to 'engage_bellingham_suggestion_clusters.json'.")
print(f"🟨 Noise suggestions (unclustered): {noise_count}")


✅ DBSCAN complete. 156 suggestion clusters saved to 'engage_bellingham_suggestion_clusters.json'.
🟨 Noise suggestions (unclustered): 1213


In [37]:
# Show only the clusters that contain more than 10 suggestions
[members for members in clusters.values() if len(members) > 10]

[[{'suggestion': 'Rearrange parked cars to improve line of sight for pedestrians crossing Holly Street.'},
  {'suggestion': 'Improve safety and clarity of bike lanes and parking areas on Holly Street.'},
  {'suggestion': 'Consider returning Holly Street to 3 lanes of vehicle traffic based on community feedback.'},
  {'suggestion': "Listen to citizens' concerns regarding the Holly St changes."},
  {'suggestion': 'Allow traffic to flow freely on Holly Street to reduce downtown congestion.'},
  {'suggestion': 'Address the traffic and safety issues on Holly Street.'},
  {'suggestion': 'Consider the impact of non-car street use on traffic flow, particularly on busy streets like Holly, and adjust policies accordingly.'},
  {'suggestion': 'Reevaluate the reduction of vehicle lanes on Holly Street to avoid significant decrease in vehicle capacity.'},
  {'suggestion': 'Improve bike lane design on Holly Street to ensure safety and comfort for bicyclists.'},
  {'suggestion': 'Assess the impact of

In [30]:
# Display the whole mapping of cluster label -> list of {"suggestion": ...}
# entries. NOTE(review): this prints every cluster, which can be very long.
clusters

{'0': [{'suggestion': 'Develop and communicate a clear plan to address the homeless crisis in Bellingham.'},
  {'suggestion': 'Include specific strategies in The Bellingham Plan to help homeless individuals find housing and support services.'}],
 '1': [{'suggestion': 'Support infill development within the City'},
  {'suggestion': 'Support infill development within the City'}],
 '2': [{'suggestion': 'Invest in capital improvements for infrastructure'},
  {'suggestion': 'Invest in capital improvements for infrastructure'}],
 '3': [{'suggestion': 'Open up Urban Growth Area Reserves for new communities'},
  {'suggestion': 'Open up Urban Growth Area Reserves for new communities'}],
 '4': [{'suggestion': 'Increase availability of low barrier and no barrier housing and shelters.'},
  {'suggestion': 'Provide more low/no-barrier shelters and daytime shelters for unhoused people.'}],
 '5': [{'suggestion': 'Implement rent control to keep housing affordable.'},
  {'suggestion': 'Implement rent con

In [40]:
# Persist the flat list of extracted suggestions under a single "suggestions" key
dic = dict(suggestions=all_suggestions)
with open("raw_suggestions.json", "w") as out_file:
    json.dump(dic, out_file, indent=2)