In [1]:
from glob import glob
import pandas as pd
import numpy as np
import os
# Every path used below is relative to the project root, so switch the working
# directory there first. The root can be overridden with the URBAN_CONTROL_ROOT
# environment variable so the notebook also runs outside the original author's
# account; when the variable is unset the original location is used.
os.chdir(os.environ.get("URBAN_CONTROL_ROOT", "/home/gridsan/qwang/urban-control/"))


In [26]:
# Run configuration.
#   prompt_version: selects the prompt CSV folder and which dataset-building
#                   cell below is active (2 -> land-use specific control tiles,
#                   0/1/3 -> "base" control tiles).
#   output_id:      file-name stem of the CSVs written to ./data/train/.
#   prompt_paths:   every prompt CSV of the selected version.
prompt_version = 1
output_id = f"20250416_v{prompt_version}"
# glob() returns matches in a filesystem-dependent order; sorting keeps the row
# order of the generated CSVs stable from run to run and machine to machine.
prompt_paths = sorted(glob(f"./data/prompts/prompts_v{prompt_version}/*.csv"))


In [None]:
# Zoom level that appears in every "<zoom>+<x_offset>+<y_offset>" tile
# directory name assembled by the dataset-building cells.
zoom_level = 16

# Index of the PNG tiles that actually exist on disk. The patterns are relative
# and carry no leading "./", so the returned strings have the same form as the
# image_column / conditioning_image_column values they are compared against.
image_paths = glob(
    "data/satellite_tiles/**/*.png",
    recursive=True,
)
condition_image_paths = glob(
    "data/satellite_tiles_control_*/**/*.png",
    recursive=True,
)


In [25]:
if prompt_version == 2:
    # Build the training table for prompt version 2.
    #
    # For every prompt CSV: filter rows, derive the satellite-tile path and the
    # land-use specific control-tile path, and keep only the columns needed for
    # training. The concatenated table is then restricted to rows whose two
    # images exist on disk, written to ./data/train/<output_id>.csv and split
    # at random into <output_id>_train.csv / <output_id>_validation.csv.

    # Share of rows routed to the validation file.
    validation_fraction = 0.025
    # Seeded generator so the LA subsample and the train/validation split are
    # reproducible across kernel restarts.
    rng = np.random.default_rng(20250416)

    if not prompt_paths:
        # Fail with a clear message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(f"No prompt CSVs found for prompts_v{prompt_version}")

    datasets = []

    # Sorted so the row order (and, with the seeded generator, the split) does
    # not depend on the filesystem's listing order.
    for prompt_path in sorted(prompt_paths):
        # File names start with "<city>_<x_offset>_<y_offset>_"; the remaining
        # underscore-separated parts are not used here.
        name_parts = os.path.basename(prompt_path).split('_')
        city_name = name_parts[0]
        # Offsets are normalised to int once, so the image path, the control
        # path and the exported columns all use the same representation.
        x_offset = int(name_parts[1])
        y_offset = int(name_parts[2])
        offset_dir = f"{city_name}/{zoom_level}+{x_offset}+{y_offset}/"

        df = pd.read_csv(prompt_path)
        # Keep rows whose forest / farmland value is missing or at most 0.5.
        df = df[(df['area_m2_forest'] <= 0.5) | (df['area_m2_forest'].isna())]
        df = df[(df['area_m2_farmland'].isna()) | (df['area_m2_farmland'] <= 0.5)]
        if city_name == 'la':
            # For LA, rows with a residential value above 0.8 are kept only
            # when their random draw exceeds 0.5 (roughly half of them).
            # NOTE(review): rows with a missing residential value fail the
            # "<= 0.8" test and are therefore subsampled as well - confirm
            # that this is intended.
            keep = (df['area_m2_residential'] <= 0.8) | (rng.random(len(df)) > 0.5)
            df = df[keep]

        # "<x>/<y>.png", shared by the image and the control-image path.
        tile_file = df['x'].astype(int).astype(str) + '/' + df['y'].astype(int).astype(str) + '.png'
        # Rows without a primary land use fall back to the "base" control set.
        primary_landuse = df['primary_landuse'].fillna("base")

        # assign() returns a new frame, so no column is ever written onto a
        # filtered slice of the CSV data.
        dataset_df = df.assign(
            primary_landuse=primary_landuse,
            image_column="data/satellite_tiles/" + offset_dir + tile_file,
            conditioning_image_column=(
                "data/satellite_tiles_control_" + primary_landuse.astype(str) + "/" + offset_dir + tile_file
            ),
            caption=df['land_use_description'],
            city_name=city_name,
            x_offset=x_offset,
            y_offset=y_offset,
        )
        datasets.append(dataset_df[['city_name', 'x', 'y', 'x_offset', 'y_offset',
                                    'image_column', 'conditioning_image_column', 'caption']])

    output_df = pd.concat(datasets)
    # Drop rows whose satellite tile or control tile is not present on disk.
    output_df = output_df[output_df['image_column'].isin(image_paths)]
    output_df = output_df[output_df['conditioning_image_column'].isin(condition_image_paths)]

    print(output_df.groupby(["city_name"])['image_column'].count())
    os.makedirs('./data/train', exist_ok=True)
    output_df.to_csv(f'./data/train/{output_id}.csv', index=False)

    # The 'random' column decides the split and is kept in both split files.
    output_df = output_df.assign(random=rng.random(len(output_df)))
    train_df = output_df[output_df['random'] >= validation_fraction]
    train_df.to_csv(f'./data/train/{output_id}_train.csv', index=False)
    validation_df = output_df[output_df['random'] < validation_fraction]
    validation_df.to_csv(f'./data/train/{output_id}_validation.csv', index=False)
    print(len(train_df), "training samples.")
    print(len(validation_df), "validation samples.")
    print("All descriptions with images", len(output_df))


city_name
chicago    34340
dallas     26915
la         32319
Name: image_column, dtype: int64
91185 training samples.
2389 validation samples.
All descriptions with images 93574


In [27]:
if prompt_version in (0, 1, 3):
    # Build the training table for prompt versions 0, 1 and 3.
    #
    # For every prompt CSV: filter rows, derive the satellite-tile path and the
    # "base" control-tile path, and keep only the columns needed for training.
    # The concatenated table is then restricted to rows whose two images exist
    # on disk, written to ./data/train/<output_id>.csv and split at random into
    # <output_id>_train.csv / <output_id>_validation.csv.

    # Share of rows routed to the validation file.
    validation_fraction = 0.025
    # Seeded generator so the LA subsample and the train/validation split are
    # reproducible across kernel restarts.
    rng = np.random.default_rng(20250416)

    if not prompt_paths:
        # Fail with a clear message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(f"No prompt CSVs found for prompts_v{prompt_version}")

    datasets = []

    # Sorted so the row order (and, with the seeded generator, the split) does
    # not depend on the filesystem's listing order.
    for prompt_path in sorted(prompt_paths):
        # File names start with "<city>_<x_offset>_<y_offset>_".
        name_parts = os.path.basename(prompt_path).split('_')
        city_name = name_parts[0]
        x_offset = int(name_parts[1])
        y_offset = int(name_parts[2])
        offset_dir = f"{city_name}/{zoom_level}+{x_offset}+{y_offset}/"
        image_path_prefix = f"data/satellite_tiles/{offset_dir}"
        condition_path_prefix = f"data/satellite_tiles_control_base/{offset_dir}"

        df = pd.read_csv(prompt_path)
        # Each filter is applied only when its column is present in this CSV.
        # Keep rows whose forest / farmland value is missing or at most 0.5.
        if 'area_m2_forest' in df:
            df = df[(df['area_m2_forest'] <= 0.5) | (df['area_m2_forest'].isna())]
        if 'area_m2_farmland' in df:
            df = df[(df['area_m2_farmland'].isna()) | (df['area_m2_farmland'] <= 0.5)]
        if city_name == 'la' and 'area_m2_residential' in df:
            # For LA, rows with a residential value above 0.8 are kept only
            # when their random draw exceeds 0.5 (roughly half of them).
            # NOTE(review): rows with a missing residential value fail the
            # "<= 0.8" test and are therefore subsampled as well - confirm
            # that this is intended.
            keep = (df['area_m2_residential'] <= 0.8) | (rng.random(len(df)) > 0.5)
            df = df[keep]

        # Harmonise column names across prompt versions; rename() also returns
        # a new frame, so the assignments below do not write onto a slice.
        df = df.rename(columns={'xtile': 'x', 'ytile': 'y', 'final_description': 'land_use_description'})
        if prompt_version == 0 and (x_offset != 0 or y_offset != 0):
            # Version-0 CSVs with a non-zero offset hold the tile indices as
            # strings; only their first five characters are kept.
            # NOTE(review): presumably this strips a suffix from a 5-digit
            # tile index - confirm against the v0 prompt files.
            df['x'] = df['x'].str[:5].astype(int)
            df['y'] = df['y'].str[:5].astype(int)

        # "<x>/<y>.png", shared by the image and the control-image path.
        tile_file = df['x'].astype(str) + '/' + df['y'].astype(str) + '.png'
        dataset_df = df.assign(
            image_column=image_path_prefix + tile_file,
            conditioning_image_column=condition_path_prefix + tile_file,
            caption=df['land_use_description'],
            city_name=city_name,
            x_offset=x_offset,
            y_offset=y_offset,
        )
        datasets.append(dataset_df[['city_name', 'x', 'y', 'x_offset', 'y_offset',
                                    'image_column', 'conditioning_image_column', 'caption']])

    output_df = pd.concat(datasets)
    print("All descriptions", len(output_df))

    # Drop rows whose satellite tile or control tile is not present on disk.
    output_df = output_df[output_df['image_column'].isin(image_paths)]
    output_df = output_df[output_df['conditioning_image_column'].isin(condition_image_paths)]
    os.makedirs('./data/train', exist_ok=True)
    output_df.to_csv(f'./data/train/{output_id}.csv', index=False)
    print("All descriptions with images", len(output_df))

    # The 'random' column decides the split and is kept in both split files.
    output_df = output_df.assign(random=rng.random(len(output_df)))

    train_df = output_df[output_df['random'] >= validation_fraction]
    train_df.to_csv(f'./data/train/{output_id}_train.csv', index=False)
    validation_df = output_df[output_df['random'] < validation_fraction]
    validation_df.to_csv(f'./data/train/{output_id}_validation.csv', index=False)
    print(len(train_df), "training samples.")
    print(len(validation_df), "validation samples.")


All descriptions 138940
All descriptions with images 82140
80036 training samples.
2104 validation samples.


In [48]:
# Show the validation split produced by the dataset-building cell that ran
# above (pandas truncates the rendering to the first and last rows).
validation_df

Unnamed: 0,city_name,x,y,x_offset,y_offset,image_column,conditioning_image_column,caption,random
24,la,11166,26137,0,0,data/satellite_tiles/la/16+0+0/11166/26137.png,data/satellite_tiles_control_base/la/16+0+0/11...,Satellite image in the city in la. Landuse inc...,0.016123
63,la,11168,26134,0,0,data/satellite_tiles/la/16+0+0/11168/26134.png,data/satellite_tiles_control_base/la/16+0+0/11...,Satellite image in the city in la.landuse incl...,0.020610
158,la,11172,26142,0,0,data/satellite_tiles/la/16+0+0/11172/26142.png,data/satellite_tiles_control_base/la/16+0+0/11...,Satellite image in the city in la. Landuse inc...,0.019180
168,la,11173,26124,0,0,data/satellite_tiles/la/16+0+0/11173/26124.png,data/satellite_tiles_control_base/la/16+0+0/11...,Satellite image in the city in la. Landuse inc...,0.012933
183,la,11173,26144,0,0,data/satellite_tiles/la/16+0+0/11173/26144.png,data/satellite_tiles_control_base/la/16+0+0/11...,Satellite image in the city in la.landuse incl...,0.005416
...,...,...,...,...,...,...,...,...,...
17533,chicago,16824,24440,5,5,data/satellite_tiles/chicago/16+5+5/16824/2444...,data/satellite_tiles_control_base/chicago/16+5...,Satellite image in a village in chicago. Landu...,0.018081
17599,chicago,16826,24421,5,5,data/satellite_tiles/chicago/16+5+5/16826/2442...,data/satellite_tiles_control_base/chicago/16+5...,"City: chicago. Landuse include: 45% forest , r...",0.014267
17619,chicago,16827,24386,5,5,data/satellite_tiles/chicago/16+5+5/16827/2438...,data/satellite_tiles_control_base/chicago/16+5...,Satellite image in the city in chicago. Landus...,0.010053
17680,chicago,16828,24441,5,5,data/satellite_tiles/chicago/16+5+5/16828/2444...,data/satellite_tiles_control_base/chicago/16+5...,Satellite image in a village in chicago. Landu...,0.009372
