In [2]:
!pip install -r requirements.txt

Obtaining three from git+https://github.com/codeforamerica/three.git@67b4a4b233a57aa7995d01f6b0f69c2e85aea6c0#egg=three (from -r requirements.txt (line 14))
  Skipping because already up-to-date.
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Collecting appdirs==1.4.3 (from -r requirements.txt (line 1))
  Using cached appdirs-1.4.3-py2.py3-none-any.whl.metadata (8.8 kB)
Collecting funcsigs==1.0.2 (from -r requirements.txt (line 2))
  Using cached funcsigs-1.0.2-py2.py3-none-any.whl.metadata (14 kB)
Collecting lxml>=4.9.3 (from -r requirements.tx

In [4]:
import re
import json
import os
import requests
import urllib
from tqdm import tqdm
from three import Three
from datetime import datetime, timedelta
from collections import defaultdict
from urllib.request import urlretrieve
from urllib.parse import urlparse

## Data Collection

In [None]:
# Base URL of the FixMyStreet Open311 v2 API; assigned to the FMS client's `endpoint`.
FMS_BASE_URL = "https://www.fixmystreet.com/open311/v2/"
# Open311 parameters; the FMS client reads `jurisdiction_id` from here.
FMS_BASE_PARAMETERS = {
    "jurisdiction_id": "fixmystreet",
}
# Root folder for downloaded reports; images and label files are stored in one
# sub-folder per category / service code.
DATA_FOLDER = "../data/1. Original"

# Category names for which setup_folders() pre-creates a sub-folder of DATA_FOLDER.
# Characters that are not valid in file names (e.g. "/") are replaced by spaces
# before the folder is created.
CATEGORIES = [
    "Abandoned vehicles",
    "Blocked drainage gullies",
    "Bus stops",
    "Car parking",
    "Dog fouling",
    "Flyposting",
    "Flytipping",
    "Graffiti",
    "Parks/landscapes",
    "Pavements/footpaths",
    "Potholes",
    "Public toilets",
    "Roads/highways",
    "Road traffic signs",
    "Rubbish (refuse and recycling)",
    "Street cleaning",
    "Street lighting",
    "Street nameplates",
    "Traffic lights",
    "Trees",
]

In [None]:
class FMS(Three):
    """`Three` Open311 client configured for the FixMyStreet endpoint."""

    def __init__(self):
        super().__init__()
        # These assignments run after the parent constructor, so they take
        # precedence over whatever it configured.
        self.endpoint = FMS_BASE_URL
        self.format = "xml"
        self.jurisdiction = FMS_BASE_PARAMETERS['jurisdiction_id']


def make_a_category_folder(category):
    """Create the folder for `category` inside DATA_FOLDER if it is missing.

    Parameters:
        category (str): Folder name relative to DATA_FOLDER. The callers in
            this notebook replace characters that are invalid in file names
            before passing it in.

    Raises:
        FileExistsError: If the target path exists but is not a directory.
    """
    folder_path = os.path.join(DATA_FOLDER, category)
    # exist_ok=True replaces the separate exists() check, so there is no window
    # between "check" and "create" in which another process can create the folder.
    os.makedirs(folder_path, exist_ok=True)


def setup_folders():
    """Create DATA_FOLDER and one sub-folder per entry of CATEGORIES.

    Folders that already exist are left untouched. Characters that are not
    allowed in file names are replaced by a space in each category name.
    """
    data_folder_missing = not os.path.exists(DATA_FOLDER)
    if data_folder_missing:
        os.makedirs(DATA_FOLDER)

    for raw_name in CATEGORIES:
        safe_name = re.sub(r'[<>:"/\\|?*]', ' ', raw_name)
        make_a_category_folder(safe_name)


In [None]:
# OSM tag keys kept by get_osm_tags_from_openstreetmap() when it is called with
# filter=True; every other key is dropped from the stored tags.
RELEVANT_KEYS = {
    'name', 'place', 'amenity', 'landuse', 'leisure', 'building',
    'highway', 'natural', 'shop', 'tourism', 'man_made', 'railway'
}

def filter_tags(tags: list[dict], useful_keys: set[str]) -> list[dict]:
    """
    Reduce each tag dictionary to the keys listed in `useful_keys`.

    Dictionaries that have no useful key left after filtering are dropped, so
    the result can be shorter than the input. The input is not modified.

    Parameters:
        tags (list[dict]): Tag dictionaries (key-value pairs) to filter.
        useful_keys (set[str]): Tag keys to keep.

    Returns:
        list[dict]: New, non-empty dictionaries in the original order.
    """
    kept: list[dict] = []
    for tag in tags:
        trimmed = {}
        for key, value in tag.items():
            if key in useful_keys:
                trimmed[key] = value
        # Skip entries where nothing useful remained.
        if trimmed:
            kept.append(trimmed)
    return kept


def get_osm_tags_from_openstreetmap(lat, lon, radius=15, filter=True, timeout=30):
    """
    Collect OpenStreetMap tags around a coordinate through the Overpass API.

    Two queries are posted to the public Overpass interpreter:
      * "nearby": nodes, ways and relations selected with `around:{radius}`;
      * "enclosing": ways and relations derived from `is_in({lat},{lon})`.
    The `tags` dictionary of every returned element that has one is collected.

    Parameters:
        lat (int | float): Latitude, must be a finite number in [-90, 90].
        lon (int | float): Longitude, must be a finite number in [-180, 180].
        radius (int | float): Value inserted into the Overpass `around` filter
            (metres according to the Overpass QL documentation).
        filter (bool): When true, keep only the keys in RELEVANT_KEYS (via
            filter_tags). The name shadows the builtin but is kept for callers.
        timeout (int | float): Client-side timeout in seconds for each HTTP
            request, so a stalled connection cannot block the notebook forever.

    Returns:
        dict: {"nearby": list[dict], "enclosing": list[dict]}. Both lists are
        empty when the coordinates are invalid or when either request fails
        (a message is printed in both cases).
    """
    overpass_url = "https://overpass-api.de/api/interpreter"
    tags = {"nearby": [], "enclosing": []}

    def is_number(value):
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # The range comparisons also reject NaN (every comparison with NaN is False)
    # and infinities, so no request is sent for coordinates that cannot exist.
    if not (is_number(lat) and is_number(lon)
            and -90 <= lat <= 90 and -180 <= lon <= 180):
        print("Invalid latitude or longitude values.")
        return tags

    # Define bounding box for out+geom (slightly expanded around the point)
    lat_min = lat - 0.0015
    lat_max = lat + 0.0015
    lon_min = lon - 0.0015
    lon_max = lon + 0.0015
    bbox = f"{lat_min},{lon_min},{lat_max},{lon_max}"

    # Nearby query
    nearby_query = f"""
    [timeout:10][out:json];
    (
      node(around:{radius},{lat},{lon});
      way(around:{radius},{lat},{lon});
    );
    out tags geom({bbox});
    relation(around:{radius},{lat},{lon});
    out geom({bbox});
    """

    # Enclosing query
    enclosing_query = f"""
    [timeout:10][out:json];
    is_in({lat},{lon})->.a;
    way(pivot.a);
    out tags bb;
    out ids geom({bbox});
    relation(pivot.a);
    out tags bb;
    """

    def run_query(query):
        # Without an explicit timeout, requests waits indefinitely for the server.
        response = requests.post(overpass_url, data={"data": query}, timeout=timeout)
        response.raise_for_status()
        return response.json()

    try:
        nearby_data = run_query(nearby_query)
        enclosing_data = run_query(enclosing_query)
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions that do not
        # wrap JSON decoding errors in a RequestException subclass.
        print("Error querying Overpass API:", e)
        return tags

    # Collect the tags of every element that carries any.
    tags["nearby"] = [
        element["tags"]
        for element in nearby_data.get("elements", [])
        if "tags" in element
    ]
    tags["enclosing"] = [
        element["tags"]
        for element in enclosing_data.get("elements", [])
        if "tags" in element
    ]

    if filter:
        tags["nearby"] = filter_tags(tags["nearby"], RELEVANT_KEYS)
        tags["enclosing"] = filter_tags(tags["enclosing"], RELEVANT_KEYS)

    return tags


In [10]:
def grab_report_data(report):
    """Download the photo of a single FMS report and write its label file.

    Reports without a `media_url` are ignored. For the others, the image is
    saved to DATA_FOLDER/<service code>/<file name from the URL>, and a JSON
    file with the same base name is written next to it. The JSON file holds the
    report description, the OSM tags for the report location, and the raw
    `lat` / `long` values.

    Parameters:
        report (dict): One report entry. Reads the keys 'media_url',
            'service_code', 'description', 'lat' and 'long'. The dictionary is
            not modified.

    Raises:
        KeyError: If a report with a photo lacks one of the other keys.
        ValueError: If 'lat' or 'long' cannot be converted to float.
        OSError / urllib.error.URLError: If the download or a file write fails.
    """
    media_url = report.get('media_url')
    if not media_url:
        return

    file_name = os.path.split(urlparse(media_url).path)[1]
    if not file_name:
        # A URL path ending in "/" yields no file name to store the image under.
        print(f"Skipping report without a file name in media_url: {media_url}")
        return

    # Sanitise into a local variable so the caller's dict is left untouched.
    service_code = re.sub(r'[<>:"/\\|?*]', ' ', report['service_code'])
    print(service_code)

    # Make sure the category folder exists before downloading, instead of
    # waiting for the download to fail and then fetching the image a second time.
    category_folder = os.path.join(DATA_FOLDER, service_code)
    if not os.path.isdir(category_folder):
        print("New service code found, because we're special")
        make_a_category_folder(service_code)

    img_file_path = os.path.join(category_folder, file_name)
    urlretrieve(media_url, img_file_path)

    # Label file: same base name as the image, with a .json extension.
    label_file_name = file_name.rsplit('.', maxsplit=1)[0] + ".json"
    labels_file_path = os.path.join(category_folder, label_file_name)
    print(labels_file_path)

    tags = get_osm_tags_from_openstreetmap(float(report["lat"]), float(report["long"]))

    label_payload = {
        "description": report['description'],
        "tags": tags,
        "lat": report['lat'],
        "long": report['long'],
    }
    with open(labels_file_path, 'w', encoding="utf-8") as f:
        json.dump(label_payload, f, ensure_ascii=False, indent=2)


In [None]:
# Create the initial folders (skip if already done)
setup_folders()

In [None]:
fms_client = FMS()

# Start/End date and how many days to grab at once
start_date = datetime(2023, 1, 1)
end_date = datetime.today()
delta = timedelta(days=7)

# Query the API one `delta`-sized window at a time. `start_date` is the moving
# cursor and ends up at or past `end_date` once the loop has finished.
while start_date < end_date:
    next_date = start_date + delta
    date_range = [start_date.strftime("%m-%d-%Y"), next_date.strftime("%m-%d-%Y")]
    print(f"Fetching between: {date_range[0]} to {date_range[1]}")

    try:
        response = fms_client.requests(between=date_range, count=1000)
        # `or []` also covers a 'request' key that is present but empty/None.
        reports = response.get('request') or []
    except Exception as e:
        print(f"Error for date range {date_range}: {e}")
    else:
        # NOTE(review): an XML response containing a single <request> may be
        # converted to one dict instead of a list of dicts - confirm against
        # the `three` XML conversion. Wrapping it keeps the loop below valid.
        if isinstance(reports, dict):
            reports = [reports]
        if not reports:
            print("No reports found in this range.")
        for report in reports:
            # Isolate failures per report so that one broken entry (dead image
            # link, missing field) does not discard the rest of the window.
            try:
                grab_report_data(report)
            except Exception as e:
                print(f"Error for report in date range {date_range}: {e}")

    start_date = next_date

## Data Cleaning

The raw data was extremely messy, so it was sorted manually to ensure the labels are accurate.

In [3]:
def build_label_vectors_with_images(data_root):
    """
    Build a dictionary mapping report_id to its labels, severity and image path.

    Every ``*.json`` file found below `data_root` is one report; its id is the
    file name without the ``.json`` extension. The folders between `data_root`
    and the file are its labels, top-level folder first. A report id that
    occurs in several folders accumulates the labels of all of them, without
    duplicates and in first-seen order. Files located directly in `data_root`
    get no labels.

    Parameters:
        data_root (str): Root folder of the sorted dataset.

    Returns:
        defaultdict: report_id -> {"labels": list[str], "severity": str,
        "image_path": str | None}. "severity" is always set to "Low".
        "image_path" is the normalised path of a sibling file named
        ``<report_id>.jpeg``, ``.jpg`` or ``.png`` (checked in that order), or
        None when no such file exists. The result is empty if `data_root`
        does not exist.
    """
    image_extensions = ('.jpeg', '.jpg', '.png')
    label_map = defaultdict(lambda: {"labels": [], "severity": "", "image_path": None})

    for root, dirs, files in os.walk(data_root):
        # Sort in place so the traversal (and therefore the label order of
        # reports found in several folders) does not depend on the filesystem.
        dirs.sort()

        # Splitting the relative path already yields the hierarchy in
        # top-down order. Sorting the parts by their substring position in the
        # path could reorder them when one folder name is contained in another.
        rel_path = os.path.relpath(root, data_root)
        categories = [] if rel_path == os.curdir else rel_path.split(os.sep)

        for file in sorted(files):
            if not file.endswith(".json"):
                continue
            report_id = os.path.splitext(file)[0]

            # Look for an image with the same base name next to the JSON file.
            image_path = None
            for ext in image_extensions:
                potential_image_path = os.path.normpath(os.path.join(root, report_id + ext))
                if os.path.exists(potential_image_path):
                    image_path = potential_image_path
                    break

            entry = label_map[report_id]
            # dict.fromkeys removes duplicates while preserving order.
            entry["labels"] = list(dict.fromkeys(entry["labels"] + categories))
            entry["severity"] = "Low"  # Default severity
            if image_path is not None:
                # Keep an image found earlier when this folder has none.
                entry["image_path"] = image_path

    return label_map

# Example usage
# NOTE(review): this path starts with "./data", whereas DATA_FOLDER and
# folder_path in this notebook start with "../data" - confirm which working
# directory is intended. os.walk yields nothing for a missing directory, so a
# wrong path silently produces an empty label_vectors.json.
data_root = "./data/2. Sorted"
label_vectors = build_label_vectors_with_images(data_root)

# Save to JSON (written to the current working directory)
with open("label_vectors.json", "w", encoding="utf-8") as f:
    json.dump(label_vectors, f, indent=2)

In [None]:
# Root of the manually sorted dataset; joined with a relative file path when a
# label file is written by hand further down.
folder_path = "../data/2. Sorted"

In [13]:
# Sanity check: which report carries the largest number of labels?

with open("label_vectors.json", "r", encoding="utf-8") as f:
    data = json.load(f)

max_count, path_name = 0, ""
for report_id, entry in data.items():
    label_count = len(entry["labels"])
    # Strict comparison: on a tie the first report encountered is kept.
    if label_count > max_count:
        max_count, path_name = label_count, report_id

print(max_count, path_name)

6 7284551.0.full


In [55]:
# Manually (re)write the label file of one hand-picked report.
file_path = os.path.join(folder_path, "./Smoking/Other Public Areas/singapore-singapore-people-smoking-in-front-of-a-shopping-center-P24BYK.json")
lat = 1.2881984975229688
long = 103.84640096944811
description = "I hate smoke. Please enforce the no smoking law."

# Round only when both coordinates are set. The previous `lat or long` test
# also ran when just one of them was set, and float(None) then raised.
if lat is not None and long is not None:
    lat = round(float(lat), 6)
    long = round(float(long), 6)

# With a missing coordinate the lookup prints a message and returns empty tags.
tags = get_osm_tags_from_openstreetmap(lat, long)

# Mode 'w' replaces any existing label file at this path.
with open(file_path, 'w', encoding="utf-8") as f:
    json.dump({"description": description, "tags": tags, "lat": lat, "long": long}, f, ensure_ascii=False, indent=2)