In [11]:
import os
import geopandas as gpd
from shapely.geometry import Point

In [1]:
# Load the district boundaries and the census-sector vulnerability layer.
# Every later step needs both layers, so a failed read must stop the cell.
try:
    gdf_districts = gpd.read_file("limites_adm_geoportal_distrito_municipal_v2.geojson")
    gdf_vulnerability = gpd.read_file("geoportal_setor_censitario_vuln.geojson")
except Exception as e:
    print(f"Error loading files: {e}")
    # Re-raise rather than call exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down, while a re-raise just halts this cell and keeps the
    # original traceback visible.
    raise

# Reproject both layers to EPSG:4326 so the join compares geometries in one CRS.
gdf_districts_wgs84 = gdf_districts.to_crs(epsg=4326)
gdf_vulnerability_wgs84 = gdf_vulnerability.to_crs(epsg=4326)

# Only the district name and its polygon are needed on the right side of the join.
districts_subset = gdf_districts_wgs84[['nm_distrito_municipal', 'geometry']]

# Left join with predicate="within": every sector row is kept, and a sector
# that is not within any district polygon gets NaN for the district name.
merged_gdf = gpd.sjoin(gdf_vulnerability_wgs84, districts_subset, how="left", predicate="within")
merged_gdf = merged_gdf.rename(columns={'nm_distrito_municipal': 'distrito'})

# Drop the extra index column created by the join
merged_gdf = merged_gdf.drop(columns=['index_right'])

# Keep only the rows where the 'distrito' column is not NaN
gdf_filtered = merged_gdf.dropna(subset=['distrito'])

print("\nSuccessfully added the 'distrito' column and filtered out non-matching rows.")
print("\nColumns of the final GeoDataFrame:")
print(gdf_filtered.columns)

print("\n\nPreview of the first 5 rows of the final data:")
print(gdf_filtered[['cd_identificador', 'qt_populacao', 'distrito', 'geometry']].head())

# Persist the joined and filtered layer. A failed write is reported but does
# not stop the notebook, because gdf_filtered is still available in memory.
output_filename = "vulnerability_with_district_name_filtered.geojson"
try:
    gdf_filtered.to_file(output_filename, driver='GeoJSON')
    print(f"\n\nSuccessfully saved the filtered result to '{output_filename}'")
except Exception as e:
    print(f"\n\nAn error occurred while saving the file: {e}")


Successfully added the 'distrito' column and filtered out non-matching rows.

Columns of the final GeoDataFrame:
Index(['id', 'cd_identificador', 'cd_identificador_domicilio_censitario',
       'cd_identificador_densidade_demografica',
       'cd_identificador_vulnerabilidade_social',
       'cd_identificador_georreferenciamento', 'qt_area_setor_censitario',
       'cd_original_setor_censitario', 'an_censo_demografico',
       'cd_setor_censitario_domicilio', 'qt_domicilio', 'an_censo_domicilio',
       'cd_setor_censitario_vulnerabilidade',
       'cd_indice_vulnerabilidade_social', 'an_vulnerabilidade_social',
       'cd_setor_censitario_densidade', 'qt_populacao', 'qt_area_hectare',
       'qt_habitante_hectare', 'an_densidade_demografica', 'geometry',
       'distrito'],
      dtype='object')


Preview of the first 5 rows of the final data:
  cd_identificador  qt_populacao        distrito  \
2            32355         573.0  RAPOSO TAVARES   
3            32356         156.0  RAPO

In [2]:
# Show the first five joined rows using the notebook's rich table display.
gdf_filtered.head(n=5)

Unnamed: 0,id,cd_identificador,cd_identificador_domicilio_censitario,cd_identificador_densidade_demografica,cd_identificador_vulnerabilidade_social,cd_identificador_georreferenciamento,qt_area_setor_censitario,cd_original_setor_censitario,an_censo_demografico,cd_setor_censitario_domicilio,...,cd_setor_censitario_vulnerabilidade,cd_indice_vulnerabilidade_social,an_vulnerabilidade_social,cd_setor_censitario_densidade,qt_populacao,qt_area_hectare,qt_habitante_hectare,an_densidade_demografica,geometry,distrito
2,setor_censitario.32355,32355,12415.0,6171,12415.0,,1004.4755,355030865000149,2010,355030865000149,...,355030865000149,4.0,2010.0,355030865000149,573.0,0.100683,5691.134318,2010,"POLYGON ((-46.78918 -23.59919, -46.78955 -23.5...",RAPOSO TAVARES
3,setor_censitario.32356,32356,12414.0,6172,12414.0,,2260.1768,355030865000148,2010,355030865000148,...,355030865000148,0.0,2010.0,355030865000148,156.0,0.226534,688.63773,2010,"POLYGON ((-46.78955 -23.59906, -46.78969 -23.5...",RAPOSO TAVARES
5,setor_censitario.24961,24961,12197.0,6265,12197.0,,29553.1866,355030864000088,2010,355030864000088,...,355030864000088,2.0,2010.0,355030864000088,306.0,2.962862,103.278515,2010,"POLYGON ((-46.50763 -23.51855, -46.50827 -23.5...",PONTE RASA
6,setor_censitario.24962,24962,12198.0,6266,12198.0,,50021.4491,355030864000089,2010,355030864000089,...,355030864000089,2.0,2010.0,355030864000089,391.0,5.014908,77.967529,2010,"POLYGON ((-46.50789 -23.51704, -46.50764 -23.5...",PONTE RASA
7,setor_censitario.24963,24963,12179.0,6267,12179.0,,40616.2685,355030864000070,2010,355030864000070,...,355030864000070,2.0,2010.0,355030864000070,432.0,4.071997,106.090455,2010,"POLYGON ((-46.50624 -23.51439, -46.50615 -23.5...",PONTE RASA


In [3]:
# Number of census-sector rows per district, largest first.
gdf_filtered.loc[:, 'distrito'].value_counts()

distrito
GRAJAU             655
JABAQUARA          370
CAPAO REDONDO      367
JARDIM ANGELA      361
JARDIM SAO LUIS    359
                  ... 
CAMBUCI             36
BOM RETIRO          34
BARRA FUNDA         21
JAGUARA             18
PARI                17
Name: count, Length: 96, dtype: int64

In [12]:
def get_district_from_coords(longitude, latitude, gdf):
    """
    Find the first row of ``gdf`` whose geometry contains the given point.

    Args:
        longitude (float): The longitude (x coordinate) of the point.
        latitude (float): The latitude (y coordinate) of the point.
        gdf (GeoDataFrame): Frame with a 'geometry' column to search. The
            coordinates are compared as-is, so they must be expressed in the
            same CRS as the geometries.

    Returns:
        pandas.Series | None: The first matching row in frame order, so
        callers can read any column from it (e.g. ``row['distrito']``).
        ``None`` when ``gdf`` is empty or no geometry contains the point.
    """
    if gdf.empty:
        return None

    point = Point(longitude, latitude)

    # One vectorised containment test over the whole geometry column instead
    # of a Python-level iterrows() loop.
    contains_mask = gdf['geometry'].contains(point).to_numpy()
    if not contains_mask.any():
        return None

    # argmax on a boolean array is the position of the first True. Positional
    # lookup keeps this correct even if index labels are repeated.
    return gdf.iloc[int(contains_mask.argmax())]

In [13]:
# Example coordinate to look up.
example_latitude = -23.53028637943643
example_longitude = -46.612411709376424

# The helper expects longitude first, then latitude.
district = get_district_from_coords(
    longitude=example_longitude,
    latitude=example_latitude,
    gdf=gdf_filtered,
)

# Report only when a containing row was found; the helper gives None otherwise.
if district is not None:
    print(f"The point {example_longitude}, {example_latitude} is: {district['distrito']} - vuln: {district['cd_indice_vulnerabilidade_social']}")

The point -46.612411709376424, -23.53028637943643 is: PARI - vuln: 2.0


### How many samples per district?

In [14]:
# Rows available per district, in descending order of count (the defaults, spelled out).
gdf_filtered['distrito'].value_counts(sort=True, ascending=False)

distrito
GRAJAU             655
JABAQUARA          370
CAPAO REDONDO      367
JARDIM ANGELA      361
JARDIM SAO LUIS    359
                  ... 
CAMBUCI             36
BOM RETIRO          34
BARRA FUNDA         21
JAGUARA             18
PARI                17
Name: count, Length: 96, dtype: int64

In [15]:
# How many districts have more than 100 rows.
gdf_filtered['distrito'].value_counts().gt(100).sum()

np.int64(80)

### Stratified Sampling

In [None]:
# Sampling settings, named so they can be tuned in one place.
MAX_SAMPLES_PER_DISTRICT = 100  # upper bound on rows drawn from each district
RANDOM_SEED = 42                # fixed seed so re-runs draw the same rows

output_dir = "samples"
os.makedirs(output_dir, exist_ok=True)
print(f"Created directory: '{output_dir}'")

# Get the unique distrito names (kept as a notebook-level variable)
distritos = gdf_filtered['distrito'].unique()

total_samples_collected = 0

# One groupby pass splits the frame per district instead of re-filtering the
# whole frame on every iteration. sort=False yields the groups in order of
# first appearance, the same order unique() produces.
for distrito_name, distrito_gdf in gdf_filtered.groupby('distrito', sort=False):
    print(f"\nProcessing distrito: {distrito_name}")

    # Districts with fewer rows than the cap contribute all of their rows.
    n_samples = min(MAX_SAMPLES_PER_DISTRICT, len(distrito_gdf))

    # Draw the rows without replacement using the fixed seed.
    sampled_gdf = distrito_gdf.sample(n=n_samples, random_state=RANDOM_SEED)

    print(f"  - Total rows: {len(distrito_gdf)}")
    print(f"  - Sampled rows: {len(sampled_gdf)}")
    total_samples_collected += len(sampled_gdf)

    # Define the output path for the sample
    sample_output_filename = f"{output_dir}/{distrito_name}.geojson"

    # Save the sample. A failed write is reported and the loop moves on, so
    # one bad file does not abort the remaining districts.
    try:
        sampled_gdf.to_file(sample_output_filename, driver='GeoJSON')
        print(f"  - Successfully saved sample to '{sample_output_filename}'")
    except Exception as e:
        print(f"  - An error occurred while saving the sample file: {e}")

print("\n--- Random sampling process completed ---")
print(f'Total samples = {total_samples_collected}')

Created directory: 'samples'

Processing distrito: RAPOSO TAVARES
  - Total rows: 194
  - Sampled rows: 100
  - Successfully saved sample to 'samples/RAPOSO TAVARES.geojson'

Processing distrito: PONTE RASA
  - Total rows: 142
  - Sampled rows: 100
  - Successfully saved sample to 'samples/PONTE RASA.geojson'

Processing distrito: JARDIM ANGELA
  - Total rows: 361
  - Sampled rows: 100
  - Successfully saved sample to 'samples/JARDIM ANGELA.geojson'

Processing distrito: ERMELINO MATARAZZO
  - Total rows: 156
  - Sampled rows: 100
  - Successfully saved sample to 'samples/ERMELINO MATARAZZO.geojson'

Processing distrito: VILA JACUI
  - Total rows: 208
  - Sampled rows: 100
  - Successfully saved sample to 'samples/VILA JACUI.geojson'

Processing distrito: CIDADE TIRADENTES
  - Total rows: 302
  - Sampled rows: 100
  - Successfully saved sample to 'samples/CIDADE TIRADENTES.geojson'

Processing distrito: ITAQUERA
  - Total rows: 235
  - Sampled rows: 100
  - Successfully saved sample to