## Short description (v3)

In [3]:
import pandas as pd
import random
from collections import defaultdict
from glob import glob

import os
os.chdir("/home/gridsan/qwang/urban-control/")

landuse_paths = glob("./data/landuse_overlay/*_filtered.csv")
prompt_path = "./data/prompts/prompts_v3/{filename}_w_description.csv"

for landuse_path in landuse_paths:
    city_name = landuse_path.split("landuse_overlay/")[1].split('_')[0]
    df = pd.read_csv(landuse_path)

    description_threshold = 0.04
    tot_building_footprint_threshold = [(0.3, 'high'), (0.15, 'medium'), (0.03, 'low')]

    # Land use columns (excluding settlement types)
    land_use_cols = [
        'area_m2_commercial',
        'area_m2_farmland',
        'area_m2_forest',
        'area_m2_industrial',
        'parking',
        'area_m2_recreational',
        'area_m2_residential',
        'water'
    ]

    land_use_cols_names = {
        'area_m2_commercial': 'commercial',
        'area_m2_farmland': 'farmland',
        'area_m2_forest': 'forest',
        'area_m2_industrial': 'industrial',
        'parking': 'parking',
        'area_m2_recreational': 'recreational',
        'area_m2_residential': 'residential',
        'water': 'water'
    }

    building_type_cols = [
        'apartments',
        'house',
        #'res_unkonwn',
        'townhouse'
    ]

    # Add building type names mapping
    building_type_names = {
        'apartments': 'apartment complexes',
        'house': 'single-family homes',
        #'res_unkonwn': 'unclassified residences',  # Note: typo in column name?
        'townhouse': 'townhouses'
    }

    # Settlement type columns
    settlement_cols = [
        'city', 'farm', 'island', 'town', 'village'
    ]

    # Phrase variations for richer descriptions
    PHRASE_VARIANTS = {
        'primary_landuse': [
            "Landuse include: {pct:.0f}% {name}"
        ],
        'secondary_landuse': [
            ", {names}"
        ],
        'building_footprint': [
            ". Buildings {pct:.0f}% ",
        ],
        'building_density': [
            ". {density} building density. ",
        ],
        'primary_settlement': [
            "Satellite image in a {type} in {city_name}.",
        ],
        'secondary_settlement': [
            ", {types} "
        ],
        'no_landuse': [
            "The terrain shows minimal developed land uses",
            "This plot appears largely undeveloped",
            "Natural features dominate this area",
            "Few man-made structures are evident here"
        ],
        'no_settlement': [
            "City: {city_name}"
            # "No formal settlements are designated here",
            # "The area lacks defined community boundaries",
            # "This zone appears unincorporated",
            # "No clear residential patterns emerge"
        ],
        'primary_building': [
            "Residential type is mainly {type}"
        ],
        # 'primary_building': [
        #     "The residential buildings are mainly {type} ({pct:.0f}%)",
        #     "Housing consists primarily of {type} ({pct:.0f}%)",
        #     "{type} structures ({pct:.0f}%) dominate the residential areas",
        #     "You'll find mostly {type} ({pct:.0f}%) here"
        # ],
        'secondary_building': [
            ", with {types}",
        ],
        'no_building': [
            "No significant residential buildings are present",
        ]
    }

    # Contextual descriptions for specific combinations
    CONTEXTUAL_DESCRIPTIONS = {
        ('industrial', 'city'): [
            "This urban industrial sector features manufacturing and warehouses",
            "Factory buildings dominate this city district",
            "An industrial park within the city limits"
        ],
        ('residential', 'city'): [
            "A bustling urban neighborhood with housing developments",
            "Residential blocks characterize this city area",
            "Residential buildings fill this urban zone"
        ],
        ('farmland', 'farm'): [
            "Agricultural fields stretch across this rural property",
            "Crop cultivation dominates this farming area",
            "This productive farmland shows organized planting patterns"
        ],
        ('forest', 'water'): [
            "A wooded area near water features creates a natural habitat",
            "Forest meets water body in this ecosystem",
            "Trees cluster near bodies of water in this preserve"
        ]
    }

    def get_context_description(land_use, settlement):
        """Return a random canned sentence for a (land use, settlement) pair.

        CONTEXTUAL_DESCRIPTIONS is scanned in insertion order; an entry
        matches when its land-use key is a substring of ``land_use.lower()``
        and its settlement key is a substring of ``settlement.lower()``.

        Args:
            land_use: Land-use text to match against.
            settlement: Settlement text to match against.

        Returns:
            One sentence picked with ``random.choice`` from the first matching
            entry, or None when no entry matches.
        """
        # Lazy generator: stops at the first matching entry, like an early return.
        matching_pools = (
            sentences
            for (lu_key, st_key), sentences in CONTEXTUAL_DESCRIPTIONS.items()
            if lu_key in land_use.lower() and st_key in settlement.lower()
        )
        chosen_pool = next(matching_pools, None)
        if chosen_pool is None:
            return None
        return random.choice(chosen_pool)

    def round_to_5(percentage):
        """Convert a fraction into a percentage snapped to a multiple of 5.

        The fraction is divided by 0.05, rounded with the built-in ``round``
        and multiplied by 5, e.g. 0.12 -> 10 and 0.33 -> 35.
        """
        steps_of_five = round(percentage / 0.05)
        return steps_of_five * 5

    def describe_row(row):
        """Build the text prompt for one row of the land-use table.

        Up to three parts are generated, in this order, and joined into one
        string:

        1. the settlement type within ``city_name`` (or only the city name
           when no settlement share exceeds ``description_threshold``),
        2. the land-use mix, optionally followed by a building-density label,
        3. the residential building types (only when the row has residential
           area and at least one positive building-type column).

        Args:
            row: One row of ``df`` as passed by ``DataFrame.apply(axis=1)``.
                It must contain the columns listed in ``settlement_cols``,
                ``land_use_cols`` and ``building_type_cols``.
                ``tot_building_footprint`` and ``building_density`` are
                optional: text that depends on them is skipped when they are
                missing or NaN.

        Returns:
            str: The description, with every part terminated by a period.
        """

        def positive_values(columns):
            # (column, value) pairs whose value is present and strictly positive.
            return [
                (col, row[col])
                for col in columns
                if not pd.isna(row[col]) and row[col] > 0
            ]

        def by_value_desc(pairs):
            # Largest value first; sorted() is stable, so ties keep column order.
            return sorted(pairs, key=lambda pair: -pair[1])

        # Optional columns: .get() yields None when the column is absent and
        # pd.notna(None) is False, so "missing" and "NaN" share one check.
        footprint = row.get('tot_building_footprint')
        has_footprint = pd.notna(footprint) and footprint > 0
        density = row.get('building_density')

        parts = []

        # --- Settlement type ---
        # The CONTEXTUAL_DESCRIPTIONS lookup (get_context_description) is not
        # applied here; only the standard phrases are used.
        settlements = by_value_desc(positive_values(settlement_cols))
        if settlements and settlements[0][1] > description_threshold:
            primary_type = settlements[0][0]
            primary_phrase = random.choice(PHRASE_VARIANTS['primary_settlement'])
            settle_desc = [primary_phrase.format(type=primary_type, city_name=city_name)]

            # Other settlement types are only mentioned from a 0.35 share upwards.
            secondary = [
                f"{st_type} "
                for st_type, st_pct in settlements[1:]
                if st_pct >= 0.35
            ]
            if secondary:
                secondary_phrase = random.choice(PHRASE_VARIANTS['secondary_settlement'])
                settle_desc.append(secondary_phrase.format(types=', '.join(secondary)))

            parts.append(" ".join(settle_desc))
        else:
            parts.append(random.choice(PHRASE_VARIANTS['no_settlement']).format(city_name=city_name))

        # --- Land use ---
        land_uses = by_value_desc(
            (land_use_cols_names[col], val)
            for col, val in positive_values(land_use_cols)
        )
        if land_uses and land_uses[0][1] > description_threshold:
            primary_name, primary_val = land_uses[0]
            primary_phrase = random.choice(PHRASE_VARIANTS['primary_landuse'])
            land_desc = [primary_phrase.format(name=primary_name, pct=round_to_5(primary_val))]

            secondary = [
                f"{name} ({round_to_5(val):.0f}%)"
                for name, val in land_uses[1:]
                if val >= description_threshold
            ]
            if secondary:
                secondary_phrase = random.choice(PHRASE_VARIANTS['secondary_landuse'])
                land_desc.append(secondary_phrase.format(names=', '.join(secondary)))

            # Density label: the thresholds are listed from highest to lowest,
            # so the first one reached is the label to use.
            if has_footprint and pd.notna(density):
                density_label = next(
                    (label for floor, label in tot_building_footprint_threshold
                     if density >= floor),
                    '',
                )
                if density_label:
                    density_phrase = random.choice(PHRASE_VARIANTS['building_density'])
                    land_desc.append(density_phrase.format(density=density_label))

            parts.append(" ".join(land_desc))
        else:
            parts.append(random.choice(PHRASE_VARIANTS['no_landuse']))

        # --- Residential building types ---
        buildings = by_value_desc(positive_values(building_type_cols))
        residential_area = row['area_m2_residential']
        if buildings and not pd.isna(residential_area) and residential_area > 0:
            primary_bldg = buildings[0][0]
            primary_phrase = random.choice(PHRASE_VARIANTS['primary_building'])
            bldg_desc = [primary_phrase.format(type=building_type_names[primary_bldg])]

            # A secondary type is listed when its share of the total building
            # footprint reaches the threshold.  Without a usable footprint the
            # share cannot be computed, so no secondary types are listed.
            secondary = []
            if has_footprint:
                secondary = [
                    building_type_names[bldg]
                    for bldg, area in buildings[1:]
                    if area / footprint >= description_threshold
                ]
            if secondary:
                secondary_phrase = random.choice(PHRASE_VARIANTS['secondary_building'])
                bldg_desc.append(secondary_phrase.format(types=', '.join(secondary)))

            parts.append(" ".join(bldg_desc))

        # --- Drop empty parts and make sure each one ends with punctuation ---
        sentences = []
        for part in parts:
            part = str(part).strip()
            if not part:
                continue
            if not part.endswith(('.', '!', '?')):
                part += '.'
            sentences.append(part)

        # --- Randomly choose between two joining styles ---
        if random.random() > 0.5 and len(sentences) > 1:
            # Run-on style: later parts start in lower case.  A space separates
            # them so they are not glued onto the previous part's period.
            tail = [s[0].lower() + s[1:] for s in sentences[1:]]
            description = " ".join([sentences[0]] + tail)
        else:
            description = " ".join(sentences)

        # Phrase templates may carry their own period; collapse doubled ones.
        return description.replace("..", ".")


    # Generate one description per row and tag every row with its city.
    df['land_use_description'] = df.apply(describe_row, axis=1)
    df['city_name'] = city_name

    # Name the output after the input file, e.g.
    # ".../la_0_5_filtered.csv" -> "la_0_5_filtered_w_description.csv".
    filename = os.path.splitext(os.path.basename(landuse_path))[0]
    output_path = prompt_path.format(filename=filename)

    # to_csv does not create missing directories, so make sure the target exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"\nDataFrame with descriptions saved to: {output_path}")


DataFrame with descriptions saved to: ./data/prompts/prompts_v3/la_0_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/dallas_0_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/dallas_5_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/la_3_7_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/la_7_3_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/la_5_0_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/dallas_3_7_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/la_5_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/dallas_7_3_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v3/dallas_5_0_filtered_w_descri

# V2

In [4]:
import pandas as pd
import random
from collections import defaultdict
from glob import glob
import os

# Name of the machine the notebook runs on; it selects the paths below.
compute = 'SuperCloud'

# Fallback values for an unrecognised machine: with no input paths the
# per-file loop has nothing to iterate over.
landuse_paths, prompt_path = [], []

if compute not in ('SuperCloud', 'kxu_mac'):
    print("WARNING: no path configuration")
elif compute == 'kxu_mac':
    landuse_paths = glob("./data/LANDUSE_CALCS/*_filtered.csv")
    prompt_path = "./data/LANDUSE_CALCS_DESCRIPTION/{filename}_w_description.csv"
else:
    # SuperCloud: the relative paths below are resolved from the project root.
    os.chdir("/home/gridsan/qwang/urban-control/")

    landuse_paths = glob("./data/landuse_overlay/*_filtered.csv")
    prompt_path = "./data/prompts/prompts_v2/{filename}_w_description.csv"

# Land use columns (excluding settlement types)
land_use_cols = [
    'area_m2_commercial',
    'area_m2_farmland',
    'area_m2_forest',
    'area_m2_industrial',
    'parking',
    'area_m2_recreational',
    'area_m2_residential',
    'water'
]

shaded_area_cols = [
    'area_m2_commercial',
    'area_m2_farmland',
    'area_m2_forest',
    'area_m2_industrial',
    'area_m2_residential'
]

land_use_cols_names = {
    'area_m2_commercial': 'commercial',
    'area_m2_farmland': 'farmland',
    'area_m2_forest': 'forest',
    'area_m2_industrial': 'industrial',
    'parking': 'parking',
    'area_m2_recreational': 'recreational',
    'area_m2_residential': 'residential',
    'water': 'water'
}

building_type_cols = [
    'apartments',
    'house',
    # 'res_unkonwn',
    'townhouse'
]

# Add building type names mapping
building_type_names = {
    'apartments': 'apartment complexes',
    'house': 'single-family homes',
    # 'res_unkonwn': 'unclassified residences',  # Note: typo in column name?
    'townhouse': 'townhouses'
}

# Settlement type columns
settlement_cols = [
    'city', 'farm', 'island', 'town', 'village'
]

# Phrase variations for richer descriptions
PHRASE_VARIANTS = {
    'primary_landuse': [
        "This area is dominated by {name} ({pct:.0f}%)",
        "The landscape is primarily {name} ({pct:.0f}%)",
        "{name} areas ({pct:.0f}%) prevail here",
        "You'll find mostly {name} ({pct:.0f}%) in this zone"
    ],
    'secondary_landuse': [
        ", complemented by {names}",
        ", with pockets of {names}",
        ", alongside some {names}",
        ", interspersed with {names}"
    ],
    'building_footprint': [
        ". Buildings here occupy about {pct:.0f}% of the space",
        ". Buildings cover roughly {pct:.0f}% of the area",
        ". Buildings take up {pct:.0f}% of the land"
    ],
    'building_density': [
        ". Building density is {density} in this area",
        '. This area has a {density} building density'
    ],
    'primary_settlement': [
        "The area shown in the satellite image of {city_name} falls within a {type}",
        "This is a satellite image of {type} in {city_name}",
        "This is a satellite image of {city_name} where a {type} forms the core "
    ],
    'secondary_settlement': [
        ", with some {types} mixed in",
        ", alongside portions of {types}",
        ", blending into {types} areas",
        ", adjacent to {types} zones"
    ],
    'no_landuse': [
        "The terrain shows minimal developed land uses",
        "This plot appears largely undeveloped",
        "Natural features dominate this area",
        "Few man-made structures are evident here"
    ],
    'no_settlement': [
        "This is a satellite image of {city_name}"
        # "No formal settlements are designated here",
        # "The area lacks defined community boundaries",
        # "This zone appears unincorporated",
        # "No clear residential patterns emerge"
    ],
    'primary_building': [
        "The residential buildings are mainly {type}",
        "Housing consists primarily of {type}",
        "{type} structures dominate the residential areas",
        "You'll find mostly {type} here"
    ],
    # 'primary_building': [
    #     "The residential buildings are mainly {type} ({pct:.0f}%)",
    #     "Housing consists primarily of {type} ({pct:.0f}%)",
    #     "{type} structures ({pct:.0f}%) dominate the residential areas",
    #     "You'll find mostly {type} ({pct:.0f}%) here"
    # ],
    'secondary_building': [
        ", with some {types} interspersed",
        ", complemented by {types}",
        ", alongside {types} dwellings",
        ", mixed with {types} residences"
    ],
    'no_building': [
        "No significant residential buildings are present",
        "The area lacks defined residential structures",
        "Few dwellings are evident in this zone"
    ],
    'forest_position': [
        "The forest cluster is located in the {position} of the image as a shaded area",
        "A dense forest patch appears in the {position} of the image as a shaded area",
        "Forested areas concentrate in the {position} of the image as a shaded area"
    ],
    'landuse_position': [
        "The {landuse} area is concentrated in the {position} of the image in shaded {color}",
        "A {landuse} patch appears in the {position} region of the image in shaded {color}",
        "{landuse} areas cluster in the {position} portion of the image in shaded {color}",
        "The main {landuse} zone is located toward the {position} in shaded {color}"
    ],
    'general_shaded_area': [
        "The shaded {color} area represents the {landuse} in the satellite image",
        "Shaded {color} areas indicate {landuse} presence in this location",
        "The {landuse} appears in shaded {color} in the image"
    ]
}

# Add centroid columns mapping for each land use type
landuse_centroid_mapping = {
    'area_m2_forest': ('centroid_position_x_forest', 'centroid_position_y_forest'),
    'area_m2_commercial': ('centroid_position_x_commercial', 'centroid_position_y_commercial'),
    'area_m2_industrial': ('centroid_position_x_industrial', 'centroid_position_y_industrial'),
    'area_m2_residential': ('centroid_position_x_residential', 'centroid_position_y_residential'),
    'area_m2_recreational': ('centroid_position_x_recreational', 'centroid_position_y_recreational'),
    'area_m2_farmland': ('centroid_position_x_farmland', 'centroid_position_y_farmland'),
    'parking': ('centroid_position_x_parking', 'centroid_position_y_parking'),
    'water': ('centroid_position_x_water', 'centroid_position_y_water')
}

landuse_shading_color = {
    'area_m2_residential': 'orange',
    'area_m2_commercial': 'blue',
    'area_m2_forest': 'dark green',
    'area_m2_farmland': 'light green',
    'area_m2_industrial': 'purple'
}

# Contextual descriptions for specific combinations
CONTEXTUAL_DESCRIPTIONS = {
    ('industrial', 'city'): [
        "This urban industrial sector features manufacturing and warehouses",
        "Factory buildings dominate this city district",
        "An industrial park within the city limits"
    ],
    ('residential', 'city'): [
        "A bustling urban neighborhood with housing developments",
        "Residential blocks characterize this city area",
        "Residential buildings fill this urban zone"
    ],
    ('farmland', 'farm'): [
        "Agricultural fields stretch across this rural property",
        "Crop cultivation dominates this farming area",
        "This productive farmland shows organized planting patterns"
    ],
    ('forest', 'water'): [
        "A wooded area near water features creates a natural habitat",
        "Forest meets water body in this ecosystem",
        "Trees cluster near bodies of water in this preserve"
    ]
}


def get_significant_landuses(row, lb=0.1, ub=0.4):
    """List the shaded land-use columns of ``row`` that pass every filter.

    A column from ``shaded_area_cols`` is kept when all of these hold:

    * its value is not NaN and lies in ``[lb, ub]``,
    * the row has a ``concentrated_<name>`` entry that is not NaN and is
      greater than the global ``forest_concentration_threshold``,
    * both centroid columns given by ``landuse_centroid_mapping`` are not NaN.

    NOTE: ``forest_concentration_threshold`` is a global that the per-file
    loop assigns, so it must be set before this function is called.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexed by column name.
        lb: Inclusive lower bound for the land-use value.
        ub: Inclusive upper bound for the land-use value.

    Returns:
        list: The qualifying column names, in ``shaded_area_cols`` order.
    """
    selected = []
    for col in shaded_area_cols:
        x_col, y_col = landuse_centroid_mapping[col]
        flag_col = f"concentrated_{land_use_cols_names[col]}"

        share = row[col]
        if pd.isna(share) or not (lb <= share <= ub):
            continue

        # The concentration flag is optional in the row.
        if flag_col not in row:
            continue
        concentration = row[flag_col]
        if pd.isna(concentration) or not concentration > forest_concentration_threshold:
            continue

        # Both centroid coordinates must be present.
        if pd.isna(row[x_col]) or pd.isna(row[y_col]):
            continue

        selected.append(col)
    return selected

def get_forest_position(x, y):
    """Name the image region that contains the centroid ``(x, y)``.

    Each coordinate is split into three bands at 0.33 and 0.66: ``x`` maps to
    left / central / right and ``y`` to lower / mid / upper.  The middle band
    on both axes is reported as "center"; every other combination is returned
    as "<vertical> <horizontal>", e.g. "upper left".

    Presumably the coordinates are normalised to [0, 1] -- TODO confirm
    against the code that produces the centroid columns.
    """
    def band(value, labels):
        # Pick the label of the band that `value` falls into.
        low, middle, high = labels
        if value < 0.33:
            return low
        return middle if value < 0.66 else high

    horizontal = band(x, ("left", "central", "right"))
    vertical = band(y, ("lower", "mid", "upper"))

    if (horizontal, vertical) == ("central", "mid"):
        return "center"
    return f"{vertical} {horizontal}"


for landuse_path in landuse_paths:
    city_name = ''
    if compute == 'SuperCloud':
        city_name = landuse_path.split("landuse_overlay/")[1].split('_')[0]
    elif compute == 'kxu_mac':
        city_name = landuse_path.split("LANDUSE_CALCS/")[1].split('_')[0]
    df = pd.read_csv(landuse_path)

    description_threshold = 0.04
    forest_concentration_threshold = 0.8
    tot_building_footprint_threshold = [(0.3, 'high'), (0.15, 'medium'), (0.03, 'low')]

    # Build the output table: every input row is kept as-is and, for each
    # land use returned by get_significant_landuses, a copy tagged with that
    # land use is added right after it.
    expanded_rows = []

    for _, row in df.iterrows():
        highlighted_cols = get_significant_landuses(row)

        # The untouched row always comes first.
        expanded_rows.append(row.to_dict())

        for landuse_col in highlighted_cols:
            tagged_row = row.to_dict()

            # Record which land use this copy is about, and its value.
            tagged_row['primary_landuse'] = land_use_cols_names[landuse_col]
            tagged_row['primary_landuse_pct'] = row[landuse_col]

            # Add the position label when both centroid coordinates exist.
            x_col, y_col = landuse_centroid_mapping.get(landuse_col, (None, None))
            centroid_known = (
                bool(x_col and y_col)
                and not pd.isna(row[x_col])
                and not pd.isna(row[y_col])
            )
            if centroid_known:
                tagged_row['landuse_position'] = get_forest_position(row[x_col], row[y_col])

            expanded_rows.append(tagged_row)

    # One DataFrame row per collected dict.
    expanded_df = pd.DataFrame(expanded_rows)

    def get_context_description(land_use, settlement):
        """Return a random canned sentence for a (land use, settlement) pair.

        Entries of CONTEXTUAL_DESCRIPTIONS are tried in insertion order.  An
        entry matches when its land-use key occurs in ``land_use.lower()``
        and its settlement key occurs in ``settlement.lower()``; the first
        match wins.

        Returns:
            A sentence chosen with ``random.choice``, or None if nothing
            matches.
        """
        for pair, options in CONTEXTUAL_DESCRIPTIONS.items():
            lu_key, st_key = pair
            if lu_key not in land_use.lower():
                continue
            if st_key not in settlement.lower():
                continue
            return random.choice(options)
        return None

    def round_to_5(percentage):
        """Turn a fraction into a percentage rounded to a multiple of 5.

        Computed as ``round(percentage / 0.05) * 5`` using the built-in
        ``round``, e.g. 0.12 -> 10 and 0.33 -> 35.
        """
        return round(percentage / 0.05) * 5

    def describe_row(row):
        """Build the text prompt for one row of the expanded land-use table.

        Up to four parts are generated, in this order, and joined into one
        string:

        1. the settlement type within ``city_name`` (or only the city name
           when no settlement share exceeds ``description_threshold``),
        2. the land-use mix, optionally followed by a building-density label,
        3. for rows tagged with ``primary_landuse``: where that land use sits
           in the image and in which shading colour,
        4. the residential building types (only when the row has residential
           area and at least one positive building-type column).

        Args:
            row: One row of ``expanded_df`` as passed by
                ``DataFrame.apply(axis=1)``.  It must contain the columns
                listed in ``settlement_cols``, ``land_use_cols`` and
                ``building_type_cols``.  ``primary_landuse``,
                ``tot_building_footprint`` and ``building_density`` are
                optional: text that depends on them is skipped when they are
                missing or NaN.

        Returns:
            str: The description, with every part terminated by a period.
        """

        def positive_values(columns):
            # (column, value) pairs whose value is present and strictly positive.
            return [
                (col, row[col])
                for col in columns
                if not pd.isna(row[col]) and row[col] > 0
            ]

        def by_value_desc(pairs):
            # Largest value first; sorted() is stable, so ties keep column order.
            return sorted(pairs, key=lambda pair: -pair[1])

        # Optional columns: .get() yields None when the column is absent and
        # pd.notna(None) is False, so "missing" and "NaN" share one check.
        footprint = row.get('tot_building_footprint')
        has_footprint = pd.notna(footprint) and footprint > 0
        density = row.get('building_density')
        # The column does not exist at all when no row of the file was
        # expanded, and is NaN on the untagged rows.
        primary_landuse = row.get('primary_landuse')

        parts = []

        # --- Settlement type ---
        # The CONTEXTUAL_DESCRIPTIONS lookup (get_context_description) is not
        # applied here; only the standard phrases are used.
        settlements = by_value_desc(positive_values(settlement_cols))
        if settlements and settlements[0][1] > description_threshold:
            primary_type = settlements[0][0]
            primary_phrase = random.choice(PHRASE_VARIANTS['primary_settlement'])
            settle_desc = [primary_phrase.format(type=primary_type, city_name=city_name)]

            # Other settlement types are only mentioned from a 0.35 share upwards.
            secondary = [
                f"{st_type} "
                for st_type, st_pct in settlements[1:]
                if st_pct >= 0.35
            ]
            if secondary:
                secondary_phrase = random.choice(PHRASE_VARIANTS['secondary_settlement'])
                settle_desc.append(secondary_phrase.format(types=', '.join(secondary)))

            parts.append(" ".join(settle_desc))
        else:
            parts.append(random.choice(PHRASE_VARIANTS['no_settlement']).format(city_name=city_name))

        # --- Land use ---
        land_uses = by_value_desc(
            (land_use_cols_names[col], val)
            for col, val in positive_values(land_use_cols)
        )
        if land_uses and land_uses[0][1] > description_threshold:
            primary_name, primary_val = land_uses[0]
            primary_phrase = random.choice(PHRASE_VARIANTS['primary_landuse'])
            land_desc = [primary_phrase.format(name=primary_name, pct=round_to_5(primary_val))]

            secondary = [
                f"{name} ({round_to_5(val):.0f}%)"
                for name, val in land_uses[1:]
                if val >= description_threshold
            ]
            if secondary:
                secondary_phrase = random.choice(PHRASE_VARIANTS['secondary_landuse'])
                land_desc.append(secondary_phrase.format(names=', '.join(secondary)))

            # Density label: the thresholds are listed from highest to lowest,
            # so the first one reached is the label to use.
            if has_footprint and pd.notna(density):
                density_label = next(
                    (label for floor, label in tot_building_footprint_threshold
                     if density >= floor),
                    '',
                )
                if density_label:
                    density_phrase = random.choice(PHRASE_VARIANTS['building_density'])
                    land_desc.append(density_phrase.format(density=density_label))

            parts.append(" ".join(land_desc))
        else:
            parts.append(random.choice(PHRASE_VARIANTS['no_landuse']))

        # --- Position of the tagged land use ---
        for landuse_col in shaded_area_cols:
            # Skip land uses without centroid columns.
            if landuse_col not in landuse_centroid_mapping:
                continue

            x_col, y_col = landuse_centroid_mapping[landuse_col]
            landuse_name = land_use_cols_names[landuse_col]

            # Only the land use this row was tagged with gets a position part.
            if landuse_name != primary_landuse:
                continue

            share = row[landuse_col]
            if pd.isna(share) or not (0.1 <= share <= 0.4):
                continue

            color = landuse_shading_color[landuse_col]
            concentrated_col = f"concentrated_{landuse_name}"
            if (concentrated_col in row and
                    not pd.isna(row[concentrated_col]) and
                    row[concentrated_col] > forest_concentration_threshold and
                    not pd.isna(row[x_col]) and
                    not pd.isna(row[y_col])):
                position = get_forest_position(row[x_col], row[y_col])
                position_phrase = random.choice(PHRASE_VARIANTS['landuse_position'])
                parts.append(position_phrase.format(
                    landuse=landuse_name,
                    position=position,
                    color=color
                ))
            else:
                # No usable concentration/centroid data: mention the colour only.
                generic_phrase = random.choice(PHRASE_VARIANTS['general_shaded_area'])
                parts.append(generic_phrase.format(landuse=landuse_name, color=color))

        # --- Residential building types ---
        buildings = by_value_desc(positive_values(building_type_cols))
        residential_area = row['area_m2_residential']
        if buildings and not pd.isna(residential_area) and residential_area > 0:
            primary_bldg = buildings[0][0]
            primary_phrase = random.choice(PHRASE_VARIANTS['primary_building'])
            bldg_desc = [primary_phrase.format(type=building_type_names[primary_bldg])]

            # A secondary type is listed when its share of the total building
            # footprint reaches the threshold.  Without a usable footprint the
            # share cannot be computed, so no secondary types are listed.
            secondary = []
            if has_footprint:
                secondary = [
                    building_type_names[bldg]
                    for bldg, area in buildings[1:]
                    if area / footprint >= description_threshold
                ]
            if secondary:
                secondary_phrase = random.choice(PHRASE_VARIANTS['secondary_building'])
                bldg_desc.append(secondary_phrase.format(types=', '.join(secondary)))

            parts.append(" ".join(bldg_desc))

        # --- Drop empty parts and make sure each one ends with punctuation ---
        sentences = []
        for part in parts:
            part = str(part).strip()
            if not part:
                continue
            if not part.endswith(('.', '!', '?')):
                part += '.'
            sentences.append(part)

        # --- Randomly choose between two joining styles ---
        if random.random() > 0.5 and len(sentences) > 1:
            # Linked style: each later part is introduced by a random
            # connector and therefore starts in lower case.
            connectors = [
                " Additionally, ",
                " Furthermore, ",
                " Meanwhile, ",
                " In terms of settlement, "
            ]
            linked = [
                random.choice(connectors) + s[0].lower() + s[1:]
                for s in sentences[1:]
            ]
            description = sentences[0] + "".join(linked)
        else:
            description = " ".join(sentences)

        # --- Final cleanup ---
        description = description.replace("..", ".")  # collapse doubled periods
        description = description.replace("a city", "the city")
        return description


    # Generate one description per row and tag every row with its city.
    expanded_df['land_use_description'] = expanded_df.apply(describe_row, axis=1)
    expanded_df['city_name'] = city_name

    # Both known machines save the same way; only `prompt_path` differs.
    if compute in ('SuperCloud', 'kxu_mac'):
        # Name the output after the input file, e.g.
        # ".../la_0_5_filtered.csv" -> "la_0_5_filtered_w_description.csv".
        filename = os.path.splitext(os.path.basename(landuse_path))[0]
        output_path = prompt_path.format(filename=filename)

        # to_csv does not create missing directories, so make sure the
        # target exists.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        expanded_df.to_csv(output_path, index=False)
        print(f"\nDataFrame with descriptions saved to: {output_path}")


DataFrame with descriptions saved to: ./data/prompts/prompts_v2/la_0_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/dallas_0_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/dallas_5_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/la_3_7_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/la_7_3_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/la_5_0_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/dallas_3_7_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/la_5_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/dallas_7_3_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/prompts_v2/dallas_5_0_filtered_w_descri

# V1

In [None]:
import pandas as pd
import random
from collections import defaultdict
from glob import glob

import os
os.chdir("/home/gridsan/qwang/urban-control/")

landuse_paths = glob("./data/landuse_overlay/*_filtered.csv")
prompt_path = "./data/prompts/prompts_v1/{filename}_w_description.csv"

# ---------------------------------------------------------------------------
# Configuration shared by every input file.
# None of it depends on the file being processed, so it is defined once here
# instead of being rebuilt on every loop iteration.
# ---------------------------------------------------------------------------

# Minimum value a land-use / settlement entry needs before it is mentioned.
description_threshold = 0.04
# (lower bound, label) pairs in descending order; describe_row compares the
# row's 'building_density' against these and uses the first bound it reaches.
tot_building_footprint_threshold = [(0.3, 'high'), (0.15, 'medium'), (0.03, 'low')]

# Land use columns (excluding settlement types) -> word used in the text.
land_use_cols_names = {
    'area_m2_commercial': 'commercial',
    'area_m2_farmland': 'farmland',
    'area_m2_forest': 'forest',
    'area_m2_industrial': 'industrial',
    'parking': 'parking',
    'area_m2_recreational': 'recreational',
    'area_m2_residential': 'residential',
    'water': 'water',
}
# Same columns, in the same order, as a plain list.
land_use_cols = list(land_use_cols_names)

# Residential building type columns -> phrase used in the text.
# 'res_unkonwn' (sic; possibly a typo in the column name - TODO confirm) is
# deliberately left out.
building_type_names = {
    'apartments': 'apartment complexes',
    'house': 'single-family homes',
    'townhouse': 'townhouses',
}
building_type_cols = list(building_type_names)

# Settlement type columns
settlement_cols = ['city', 'farm', 'island', 'town', 'village']

# Phrase variations for richer descriptions. describe_row picks one entry at
# random per category. "secondary_*", "building_footprint" and
# "building_density" entries carry their own leading punctuation because they
# are appended to a preceding clause.
PHRASE_VARIANTS = {
    'primary_landuse': [
        "This area is dominated by {name} ({pct:.0f}%)",
        "The landscape is primarily {name} ({pct:.0f}%)",
        "{name} areas ({pct:.0f}%) prevail here",
        "You'll find mostly {name} ({pct:.0f}%) in this zone",
    ],
    'secondary_landuse': [
        ", complemented by {names}",
        ", with pockets of {names}",
        ", alongside some {names}",
        ", interspersed with {names}",
    ],
    'building_footprint': [
        ". Buildings here occupy about {pct:.0f}% of the space",
        ". Buildings cover roughly {pct:.0f}% of the area",
        ". Buildings take up {pct:.0f}% of the land",
    ],
    'building_density': [
        ". Building density is {density} in this area",
        ". This area has a {density} building density",
    ],
    'primary_settlement': [
        "The area shown in the satellite image of {city_name} falls within a {type}",
        "This is a satellite image of {type} in {city_name}",
        "This is a satellite image of {city_name} where a {type} forms the core ",
    ],
    'secondary_settlement': [
        ", with some {types} mixed in",
        ", alongside portions of {types}",
        ", blending into {types} areas",
        ", adjacent to {types} zones",
    ],
    'no_landuse': [
        "The terrain shows minimal developed land uses",
        "This plot appears largely undeveloped",
        "Natural features dominate this area",
        "Few man-made structures are evident here",
    ],
    'no_settlement': [
        "This is a satellite image of {city_name}",
    ],
    'primary_building': [
        "The residential buildings are mainly {type}",
        "Housing consists primarily of {type}",
        "{type} structures dominate the residential areas",
        "You'll find mostly {type} here",
    ],
    'secondary_building': [
        ", with some {types} interspersed",
        ", complemented by {types}",
        ", alongside {types} dwellings",
        ", mixed with {types} residences",
    ],
    'no_building': [
        "No significant residential buildings are present",
        "The area lacks defined residential structures",
        "Few dwellings are evident in this zone",
    ],
}

# Contextual descriptions for specific (land use, settlement) combinations,
# looked up by get_context_description.
CONTEXTUAL_DESCRIPTIONS = {
    ('industrial', 'city'): [
        "This urban industrial sector features manufacturing and warehouses",
        "Factory buildings dominate this city district",
        "An industrial park within the city limits",
    ],
    ('residential', 'city'): [
        "A bustling urban neighborhood with housing developments",
        "Residential blocks characterize this city area",
        "Residential buildings fill this urban zone",
    ],
    ('farmland', 'farm'): [
        "Agricultural fields stretch across this rural property",
        "Crop cultivation dominates this farming area",
        "This productive farmland shows organized planting patterns",
    ],
    ('forest', 'water'): [
        "A wooded area near water features creates a natural habitat",
        "Forest meets water body in this ecosystem",
        "Trees cluster near bodies of water in this preserve",
    ],
}

for landuse_path in landuse_paths:
    # The city is the first underscore-separated token of the file name
    # (e.g. "la_0_5_filtered.csv" -> "la"). Using basename avoids depending
    # on a hard-coded directory name or path separator.
    city_name = os.path.basename(landuse_path).split('_')[0]
    df = pd.read_csv(landuse_path)

    def get_context_description(land_use, settlement):
        """Pick a random contextual sentence for a land-use/settlement pair.

        Walks CONTEXTUAL_DESCRIPTIONS in insertion order and uses the first
        entry whose land-use key is a substring of ``land_use`` and whose
        settlement key is a substring of ``settlement`` (both compared in
        lower case). Returns None when no entry matches.
        """
        matching_pools = (
            candidates
            for (lu_key, st_key), candidates in CONTEXTUAL_DESCRIPTIONS.items()
            if lu_key in land_use.lower() and st_key in settlement.lower()
        )
        pool = next(matching_pools, None)
        if pool is None:
            return None
        return random.choice(pool)

    def round_to_5(percentage):
        """Convert a fraction (e.g. 0.33) to a percentage rounded to a multiple of 5 (e.g. 35)."""
        # Number of whole 5%-steps closest to the fraction, scaled back to percent.
        return round(percentage / 0.05) * 5

    def _ranked_positive(row, columns):
        """Return (column, value) pairs for the positive, non-NaN entries of
        ``row`` among ``columns``, ordered from largest to smallest value."""
        found = [
            (col, row[col])
            for col in columns
            if not pd.isna(row[col]) and row[col] > 0
        ]
        return sorted(found, key=lambda pair: -pair[1])

    def _positive_footprint(row):
        """Return the row's 'tot_building_footprint' when it is present,
        non-NaN and positive; otherwise None."""
        footprint = row.get('tot_building_footprint')
        if footprint is None or pd.isna(footprint) or footprint <= 0:
            return None
        return footprint

    def _settlement_sentence(row):
        """Build the opening clause naming the city and its settlement type(s)."""
        ranked = _ranked_positive(row, settlement_cols)
        if not ranked or ranked[0][1] <= description_threshold:
            return random.choice(PHRASE_VARIANTS['no_settlement']).format(city_name=city_name)

        primary_type = ranked[0][0]
        # rstrip: some templates end with a space, which would otherwise sit
        # in front of the comma that starts the secondary clause.
        sentence = random.choice(PHRASE_VARIANTS['primary_settlement']).format(
            type=primary_type, city_name=city_name).rstrip()

        # Other settlement types are only mentioned when they are substantial.
        secondary = [st_type for st_type, share in ranked[1:] if share >= 0.35]
        if secondary:
            sentence += random.choice(PHRASE_VARIANTS['secondary_settlement']).format(
                types=', '.join(secondary))
        return sentence

    def _land_use_sentence(row):
        """Build the land-use clause, optionally followed by a building-density clause."""
        ranked = [
            (land_use_cols_names[col], val)
            for col, val in _ranked_positive(row, land_use_cols)
        ]
        if not ranked or ranked[0][1] <= description_threshold:
            return random.choice(PHRASE_VARIANTS['no_landuse'])

        primary_name, primary_val = ranked[0]
        sentence = random.choice(PHRASE_VARIANTS['primary_landuse']).format(
            name=primary_name, pct=round_to_5(primary_val))

        secondary = [
            f"{name} ({round_to_5(val):.0f}%)"
            for name, val in ranked[1:]
            if val >= description_threshold
        ]
        if secondary:
            # Secondary templates start with ", " so they are appended directly.
            sentence += random.choice(PHRASE_VARIANTS['secondary_landuse']).format(
                names=', '.join(secondary))

        # Density is only reported for rows that actually contain buildings.
        if _positive_footprint(row) is not None:
            density = row['building_density']
            # Thresholds are listed in descending order; take the first one reached.
            label = next(
                (name for floor, name in tot_building_footprint_threshold if density >= floor),
                '',
            )
            if label != '':
                # Density templates start with ". " so they are appended directly.
                sentence += random.choice(PHRASE_VARIANTS['building_density']).format(
                    density=label)
        return sentence

    def _building_sentence(row):
        """Build the residential building-type clause, or return None when the
        row has no building types or no residential area."""
        ranked = _ranked_positive(row, building_type_cols)
        residential = row['area_m2_residential']
        if not ranked or pd.isna(residential) or residential <= 0:
            return None

        primary_type = ranked[0][0]
        sentence = random.choice(PHRASE_VARIANTS['primary_building']).format(
            type=building_type_names[primary_type])

        # A secondary type is significant when its share of the total building
        # footprint reaches the threshold. Without a usable footprint the
        # share is undefined, so no secondary types are listed (this also
        # avoids dividing by zero).
        footprint = _positive_footprint(row)
        secondary = []
        if footprint is not None:
            secondary = [
                building_type_names[bldg]
                for bldg, amount in ranked[1:]
                if amount / footprint >= description_threshold
            ]
        if secondary:
            sentence += random.choice(PHRASE_VARIANTS['secondary_building']).format(
                types=', '.join(secondary))
        return sentence

    def describe_row(row):
        """Compose a randomized natural-language description of one data row.

        The text is assembled from up to three parts, in this order:
        settlement type, land use (plus building density), and residential
        building types. Phrasing is drawn at random from PHRASE_VARIANTS, so
        repeated calls on the same row can return different text.

        Args:
            row: a pandas Series (or mapping with ``.get``) holding the columns
                in ``settlement_cols``, ``land_use_cols``, ``building_type_cols``
                and 'area_m2_residential'. 'tot_building_footprint' is
                optional; 'building_density' is read when the footprint is
                positive. Values are compared against fractional thresholds,
                so they are presumably fractions in [0, 1] - TODO confirm.

        Returns:
            str: the description, each part terminated by a period.
        """
        candidates = [
            _settlement_sentence(row),
            _land_use_sentence(row),
            _building_sentence(row),
        ]

        # Drop missing/blank parts and make sure every part ends a sentence.
        sentences = []
        for text in candidates:
            if not text or not text.strip():
                continue
            text = text.strip()
            if not text.endswith(('.', '!', '?')):
                text += '.'
            sentences.append(text)

        # Randomly choose the connection style.
        if random.random() > 0.5 and len(sentences) > 1:
            # Link the parts with connectors; each following part starts in
            # lower case because it now follows the connector word.
            connectors = [
                " Additionally, ",
                " Furthermore, ",
                " Meanwhile, ",
                " In terms of settlement, "
            ]
            description = sentences[0] + "".join(
                random.choice(connectors) + later[0].lower() + later[1:]
                for later in sentences[1:]
            )
        else:
            # Plain sequence of separate sentences.
            description = " ".join(sentences)

        # Final cleanup.
        description = description.replace("..", ".")  # Fix double periods
        description = description.replace("a city", "the city")
        return description


    # Build one textual description per row and tag every row with its city.
    df['land_use_description'] = df.apply(describe_row, axis=1)
    df['city_name'] = city_name

    # Save to a new CSV named after the input file (without its extension).
    filename = os.path.splitext(os.path.basename(landuse_path))[0]
    output_path = prompt_path.format(filename=filename)
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nDataFrame with descriptions saved to: {output_path}")


DataFrame with descriptions saved to: ./data/prompts/la_0_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/dallas_0_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/dallas_5_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/la_3_7_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/la_7_3_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/la_5_0_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/dallas_3_7_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/la_5_5_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/dallas_7_3_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/dallas_5_0_filtered_w_description.csv

DataFrame with descriptions saved to: ./data/prompts/dallas_0_0_filtered_w_description.csv

DataFra