In [30]:
import pandas as pd
# Collapse repeated stop names into a single row each, using the mean of the
# coordinates, and write the result to stops_deduped.csv (no index column).
(
    pd.read_csv('stops.csv')
    [['stop_name', 'stop_lat', 'stop_lon']]
    .groupby('stop_name')
    .agg({'stop_lat': 'mean', 'stop_lon': 'mean'})
    .reset_index()
    .to_csv('stops_deduped.csv', index=False)
)

In [1]:
import json
import pandas as pd
from difflib import get_close_matches
from openai import OpenAI
import os, json
from dotenv import load_dotenv
import json

# Load environment settings (python-dotenv) so OPENAI_API_KEY can be read below.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# posts.json is a JSON object whose "posts" key holds the list of post records.
with open("posts.json", encoding="utf-8") as f:
    posts = json.load(f)

df = pd.DataFrame(posts["posts"])

# Stop lookup table with stop_name / stop_lat / stop_lon columns, used by find_stop.
stops = pd.read_csv("stops_deduped.csv")

def find_stop(query: str, stops_df: pd.DataFrame, cutoff: float = 0.4):
    """Fuzzy-match a free-text location against the stop names in ``stops_df``.

    Matching is case-insensitive, so a lower-cased or upper-cased location
    still finds its properly capitalised stop.

    Args:
        query: Location text to look up. Anything that is not a non-blank
            string (``None``, ``""``, numbers, lists, ...) yields ``None``.
        stops_df: Frame with ``stop_name``, ``stop_lat`` and ``stop_lon``
            columns. Rows whose ``stop_name`` is not a string (e.g. NaN) are
            ignored.
        cutoff: Minimum difflib similarity ratio (0..1) for a name to count
            as a match.

    Returns:
        ``{"matched_name", "lat", "lon"}`` taken from the first row carrying
        the best-matching name, or ``None`` when nothing reaches ``cutoff``.
    """
    # difflib raises TypeError on non-string input, so reject it up front.
    if not isinstance(query, str) or not query.strip():
        return None

    # Case-folded name -> position of the first row with that name.
    first_row_by_key = {}
    for position, name in enumerate(stops_df["stop_name"].tolist()):
        if isinstance(name, str):
            first_row_by_key.setdefault(name.casefold(), position)

    match = get_close_matches(
        query.strip().casefold(), list(first_row_by_key), n=1, cutoff=cutoff
    )
    if not match:
        return None

    stop = stops_df.iloc[first_row_by_key[match[0]]]
    return {
        "matched_name": stop["stop_name"],
        "lat": stop["stop_lat"],
        "lon": stop["stop_lon"]
    }

In [8]:
def generate_prompt(text: str) -> str:
    """Build the instruction sent to the model for one transport alert.

    The prompt asks for a JSON-only reply with ``description``, ``category``
    and ``loc`` fields and ends with the alert text itself.

    Args:
        text: The alert text to embed after ``Text: ``.

    Returns:
        The full multi-line prompt string.
    """
    instruction_lines = (
        "Analyze the transport alert below and return only valid JSON with:",
        '- "description": under 5 words summarizing what happened in Polish,',
        '- "category": "incident" (accident, crash, obstruction) or "traffic" (delay, congestion),',
        '- "loc": most likely cracovian stop name (stop, street, or area).',
        "If info missing, use null.",
        f"Text: {text}",
    )
    # Every line after the first is indented by four spaces in the emitted prompt.
    return "\n    ".join(instruction_lines)

In [None]:
def get_incident_from_post(client, post: dict):
    """Turn one tagged post into a structured incident record via the LLM.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the OpenAI
            client created in this notebook).
        post: Post record; ``post["text"]`` and ``post["id"]`` are read.

    Returns:
        A dict with ``post_id``, ``description``, ``category`` (always
        ``"incident"`` or ``"traffic"``), ``lat`` and ``lon``; or ``None``
        when the post lacks the ``#utrudnienia_pkk`` tag or the model reply
        is not a JSON object.
    """
    text = post["text"].lower()

    # Filter first: untagged posts need neither a prompt nor an API call.
    if "#utrudnienia_pkk" not in text:
        return None

    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": generate_prompt(text)}]
    )
    result = response.choices[0].message.content

    try:
        parsed = json.loads(result)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None rather than text.
        return None
    if not isinstance(parsed, dict):
        # Valid JSON that is not an object (list, string, number) has no fields to read.
        return None

    # NOTE(review): 'Dworzec tunel' is the fallback used when the reply omits
    # "loc" entirely; an explicit null still means "no location".
    loc = parsed.get("loc", 'Dworzec tunel')
    stop_found = find_stop(loc, stops) if isinstance(loc, str) else None

    # The prompt tells the model to use null for missing info, so a null
    # category is treated like an absent one instead of crashing on .lower().
    raw_category = parsed.get("category", "incident")
    if raw_category is None:
        raw_category = "incident"
    category = raw_category.lower() if isinstance(raw_category, str) else ""

    return {
        "post_id": post["id"],
        "description": parsed.get("description", ""),
        "category": category if category in ["incident", "traffic"] else "traffic",
        "lat": stop_found["lat"] if stop_found else None,
        "lon": stop_found["lon"] if stop_found else None,
    }

In [12]:
# Run the extraction over the loaded posts, collecting one record per tagged post.
results = []
for post in posts['posts']:
    text = post["text"].lower()
    print(f"\n📝 Processing post #{post['id']}...\n{text}\n")

    # Only posts carrying the #utrudnienia_pkk tag are sent to the model.
    if "#utrudnienia_pkk" not in text:
        continue

    prompt = f"""
    Analyze the transport alert below and return only valid JSON with:
    - "description": under 5 words summarizing what happened in Polish,
    - "category": "incident" (accident, crash, obstruction) or "traffic" (delay, congestion),
    - "loc": most likely cracovian stop name (stop, street, or area).
    If info missing, use null.
    Text: {text}
    """

    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}]
    )

    result = response.choices[0].message.content
    try:
        parsed = json.loads(result)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a reply whose content is None rather than text.
        print("❌ Failed to parse JSON:", e)
    else:
        if isinstance(parsed, dict):
            print("✅ Parsed JSON:", parsed)
            loc = parsed.get("loc", 'Dworzec tunel')
            stop_found = find_stop(loc, stops) if isinstance(loc, str) else None

            # The prompt allows null fields; treat a null category like a
            # missing one instead of crashing on .lower().
            raw_category = parsed.get("category", "incident")
            if raw_category is None:
                raw_category = "incident"
            category = raw_category.lower() if isinstance(raw_category, str) else ""

            results.append({
                "post_id": post["id"],
                "description": parsed.get("description", ""),
                "category": category if category in ["incident", "traffic"] else "traffic",
                "stop_name": stop_found["matched_name"] if stop_found else None,
                "latitude": stop_found["lat"] if stop_found else None,
                "longitude": stop_found["lon"] if stop_found else None,
            })
        else:
            print("❌ Failed to parse JSON:", f"expected an object, got {type(parsed).__name__}")
    print("🔍 Result:\n", result)
    # NOTE(review): this break stops after the first tagged post, which looks
    # like a debugging leftover — remove it to process every post.
    break


📝 Processing post #0...
#utrudnienia_pkk (20:40) potrącenie pieszego przy miśnieńskiej. brak dojazdu do pętli mistrzejowice. linia 14 kierowana do os. piastów.

✅ Parsed JSON: {'description': 'Potrącenie pieszego', 'category': 'incident', 'loc': 'ul. Miśnieńska'}
🔍 Result:
 {
  "description": "Potrącenie pieszego",
  "category": "incident",
  "loc": "ul. Miśnieńska"
}


In [7]:
# Write the collected results to parsed_posts.json as indented UTF-8 JSON,
# keeping non-ASCII characters (Polish diacritics) unescaped.
with open("parsed_posts.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps({"results": results}, ensure_ascii=False, indent=2))