In [None]:
import os
from urllib.parse import quote

import sqlalchemy as sa

# Build the connection string from environment variables so real credentials
# never have to be typed into (or saved with) the notebook.
# LAYEREDDB_USER / LAYEREDDB_PASSWORD are read at run time; when they are not
# set, the original placeholders are used so the URI keeps its previous shape.
db_user = os.environ.get("LAYEREDDB_USER", "<USERNAME>")
db_password = os.environ.get("LAYEREDDB_PASSWORD", "<PASSWORD>")

# Percent-encode both parts: characters such as '@', ':' or '/' inside a
# password would otherwise be parsed as URL delimiters. safe="" also encodes
# '/', and spaces become %20 (not '+'), which decodes unambiguously.
db_uri = (
    f"postgresql+psycopg2://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    "@localhost:5433/layereddb"
)

# Create the Engine object, keeping pool_pre_ping for reliability
engine = sa.create_engine(db_uri, pool_pre_ping=True)

# NOTE: create_engine() does not open a connection yet; the first query does.
print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


In [88]:
from sqlalchemy import inspect
# Reflection helper bound to the engine; later cells reuse it to list tables.
inspector = inspect(engine)

# Names of every schema the inspector reports for this database.
schemas = inspector.get_schema_names()

# Heading and list on separate lines (sep="\n" mirrors two consecutive prints).
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


We are working with the 'berlin_source_data' and 'berlin_labels' schemas.

In [89]:
# Ask the inspector for every table that lives in 'berlin_source_data'.
tables_in_schema = inspector.get_table_names(schema='berlin_source_data')

# Heading (preceded by a blank line) followed by the table list on its own line.
print("\nTables in schema 'berlin_source_data':", tables_in_schema, sep="\n")


Tables in schema 'berlin_source_data':
['theaters', 'pools_refactored', 'theaters_backup_neigh_final', 'night_clubs', 'district_level_aggregated', 'bus_tram_stops', 'malls', 'banks', 'doctors', 'social_clubs_activities', 'veterinary_clinics_martin_svitek', 'hospitals_refactored', 'pharmacies', 'supermarkets', 'bike_lanes', 'pools', 'hospitals', 'land_prices', 'test_table_george_smelin', 'venues', 'gyms', 'universities', 'dental_offices', 'post_offices', 'kindergartens', 'sbahn', 'schools', 'short_term_listings', 'districts', 'ubahn', 'long_term_listings', 'veterinary_clinics', 'milieuschutz_protection_zones', 'neighborhoods', 'parks', 'regional_statistics', 'crime_statistics', 'districts_pop_stat', 'playgrounds', 'rent_stats_per_neighborhood']


In [90]:
# Ask the inspector for every table that lives in 'berlin_labels'.
tables_in_schema = inspector.get_table_names(schema='berlin_labels')

# Heading (preceded by a blank line) followed by the table list on its own line.
print("\nTables in schema 'berlin_labels':", tables_in_schema, sep="\n")


Tables in schema 'berlin_labels':
['district_attributes', 'district_labels_new', 'district_features', 'district_labels', 'neighborhood_labels']


Since the 'district_features' table contains all the necessary counts for my labels and the 'district_attributes' table contains the calculated scaling coefficients, I will use these two tables for the labeling process.

In [91]:
import pandas as pd
# Never truncate wide frames when printing: show every column.
pd.set_option('display.max_columns', None)

# Schema-qualified source table and the query that pulls all of its rows.
table_name = 'berlin_labels.district_features'
query = f"SELECT * FROM {table_name}"

# Run the query through the engine; the result set comes back as a DataFrame.
district_features = pd.read_sql(sql=query, con=engine)

# Preview the first five rows.
print(district_features.head(5))

  district_id  bus_tram_stop_count  uban_station_count  sbahn_station_count  \
0    11001001                  222                  32                   45   
1    11002002                  120                  15                   20   
2    11003003                  292                   3                   37   
3    11004004                  264                  23                   37   
4    11005005                  283                   5                    3   

   bank_count  post_office_count  supermarket_count  mall_count  \
0          48                 40                146          13   
1          21                 28                107           4   
2          26                 34                161           8   
3          47                 38                129           4   
4          19                  1                 77           5   

   num_sport_clubs  num_gyms  num_pools  hospital_count  pharmacy_count  \
0               15        63          9        

In [92]:
# Schema-qualified source table and the query that pulls all of its rows.
table_name = 'berlin_labels.district_attributes'
query = f"SELECT * FROM {table_name}"

# Run the query through the engine; the result set comes back as a DataFrame.
district_attributes = pd.read_sql(sql=query, con=engine)

# Preview the first five rows.
print(district_attributes.head(5))

  district_id  area_sq_km  inhabitants  area_coefficient  \
0    11004004   64.662978       343081          0.871208   
1    11002002   20.389118       293454          0.274704   
2    11011011   52.091363       311881          0.701830   
3    11010010   61.782422       291948          0.832398   
4    11001001   39.379173       397134          0.530558   

   population_coefficient  
0                1.061595  
1                0.908034  
2                0.965053  
3                0.903374  
4                1.228851  


In [93]:
# Mean of each healthcare-related column of district_features.
# The six averages are unpacked in the same order as the column names below.
(
    avg_hospitals,
    avg_pharmacys,
    avg_dental_offices,
    avg_primary_adult,
    avg_pediatric,
    avg_specialist,
) = (
    district_features[column_name].mean()
    for column_name in (
        'hospital_count',
        'pharmacy_count',
        'dental_office_count',
        'total_primary_adult_score',
        'total_pediatric_score',
        'specialist_score_total',
    )
)

print(
    avg_hospitals,
    avg_pharmacys,
    avg_dental_offices,
    avg_primary_adult,
    avg_pediatric,
    avg_specialist,
)

20.916666666666668 56.25 65.08333333333333 53.025 11.725 149.52499999999998


I would like to add the label #sunday_pharmacy_access for districts that have at least one pharmacy regularly open on Sundays (excluding the emergency pharmacies that rotate weekly). To do this, I need to filter the pharmacies table.

In [94]:
import pandas as pd
from sqlalchemy import create_engine, text

# LOAD THE 'PHARMACIES' DATAFRAME FROM THE DATABASE

PHARMACY_TABLE = 'berlin_source_data.pharmacies'
query = f"SELECT pharmacy_id, district_id, openinghours FROM {PHARMACY_TABLE}"

# Start from a known "nothing loaded" state. Without this reset, a failed
# query would either crash below with a NameError (fresh kernel) or silently
# filter a stale `pharmacies` frame left over from an earlier run.
pharmacies = None
sunday_pharmacies = None

try:
    pharmacies = pd.read_sql(query, engine)
    print(f"Successfully loaded {len(pharmacies)} pharmacy records.")
except NameError:
    print("Error: 'engine' object is not defined. Please initialize your database connection.")
except Exception as e:
    print(f"An error occurred during database query: {e}")

# 1. Regex that flags opening-hours strings giving hours for Sunday.
#    Alternative 1: a standalone 'su'/'so' token followed (after optional
#                   whitespace) by a digit, e.g. "su 10:00".
#    Alternative 2: a day range ending in 'su'/'so' followed by a digit,
#                   e.g. "mo-su 08:00".
#    A day token with no digit after it (e.g. "su off") does not match.
#    NOTE(review): strings without a 'su'/'so' token (e.g. "24/7") are not
#    matched -- confirm whether such pharmacies should count as Sunday-open.
pattern = r'(?:\b(?:su|so)\s*\d)|(?:[a-z-]{2,}-\s*(?:su|so)\b\s*\d)'


# 2. Apply the filter only when the load succeeded and the column exists.
if pharmacies is not None and 'openinghours' in pharmacies.columns:
    # case=False is mandatory so 'Su'/'SO' match; na=False treats missing
    # opening hours as "no match" instead of propagating NaN into the mask.
    mask = pharmacies['openinghours'].str.contains(pattern, case=False, na=False)

    # 3. Get the DataFrame of pharmacies open on Sunday
    sunday_pharmacies = pharmacies[mask].copy()  # .copy() prevents SettingWithCopyWarning

    # 4. Print only the necessary columns
    columns_to_show = ['pharmacy_id', 'district_id', 'openinghours']

    if all(col in sunday_pharmacies.columns for col in columns_to_show):
        print("\nPharmacies open on Sunday (filtered):")
        print(sunday_pharmacies[columns_to_show].head(10))
    else:
        print(f"Error: Columns {columns_to_show} not found in the filtered result.")
        print("Result with all columns:")
        print(sunday_pharmacies)
else:
    # Reached when the query failed (pharmacies is None) or the column is absent.
    print("\nAborting filter: 'pharmacies' DataFrame was not loaded correctly or 'openinghours' column is missing.")

Successfully loaded 675 pharmacy records.

Pharmacies open on Sunday (filtered):
    pharmacy_id district_id                                       openinghours
151   419545768    11001001                  mo-sa 08:30-22:30; su 10:00-22:30
259   638953355    11011011  mo-fr 08:00-19:00; sa 08:00-13:00; su 08:00-16:00
396  1552051348    11001001                                  mo-su 08:00-24:00
440  2098147261    11006006                  mo-sa 08:30-19:00; su 08:30-14:00
515  3350477701    11001001  mo-fr 08:00-20:00; su 10:00-16:00; sa 09:00-19:00
604  6006371223    11003003  mo-fr 08:30-19:00; sa 08:30-20:00; su 10:00-20:00
663    59806041    11001001                                  mo-su 07:00-21:00
666   337165575    11002002                  mo-sa 08:00-20:00; su 10:00-18:00
668   381240714    11007007  mo-fr 07:00-20:00; sa 09:00-19:00; su 10:00-18:00


In [95]:
# Count distinct pharmacy_id values per district among the Sunday-open pharmacies.
# The result is a Series indexed by district_id.
pharmacy_ids_by_district = sunday_pharmacies.groupby('district_id')['pharmacy_id']
sunday_pharmacies_count = pharmacy_ids_by_district.nunique()
print(sunday_pharmacies_count)

district_id
11001001    4
11002002    1
11003003    1
11006006    1
11007007    1
11011011    1
Name: pharmacy_id, dtype: int64


In [96]:
# Combine the per-district feature counts with the scaling coefficients.
# The join key is district_id; with the default (inner) join only districts
# present in both tables are kept.
analysis_df = district_features.merge(district_attributes, on='district_id')

print("analysis_df created successfully by merging the tables.")

analysis_df created successfully by merging the tables.


In [97]:
# If district_id currently serves as the index, move it back into a regular
# column so the merge below can join on it.
district_id_is_index = analysis_df.index.name == 'district_id'
if district_id_is_index:
    analysis_df = analysis_df.reset_index()

# Left-join the per-district Sunday counts under the name 'sunday_pharmacy_count'.
# Districts without a match come back as NaN; only that new column is filled with 0.
sunday_counts = sunday_pharmacies_count.rename('sunday_pharmacy_count')
analysis_df = pd.merge(analysis_df, sunday_counts, on='district_id', how='left')
analysis_df = analysis_df.fillna({'sunday_pharmacy_count': 0})


# Boolean tag stored directly on the main DataFrame:
# True where the district has at least one Sunday-open pharmacy.
analysis_df['#sunday_pharmacy_access'] = analysis_df['sunday_pharmacy_count'] > 0


# Verification printout, restricted to the tagged districts.
display_df = analysis_df[analysis_df['#sunday_pharmacy_access']].copy()

if display_df.empty:
    print("\nNo districts received the #sunday_pharmacy_access tag.")
else:
    display_df['sunday_pharmacy_tag'] = '#sunday_pharmacy_access'
    print("\n--- Verification of #sunday_pharmacy_access Tag Assignment ---")
    verification_columns = ['district_id', 'sunday_pharmacy_count', 'sunday_pharmacy_tag']
    # The count became float after the NaN fill; show it as an integer.
    print(display_df[verification_columns].astype({'sunday_pharmacy_count': int}))


# Cleanup: the intermediate count column is no longer needed once the tag exists.
analysis_df = analysis_df.drop(columns=['sunday_pharmacy_count'])

print("\n Tag #sunday_pharmacy_access created and integrated as a boolean column.")
print(f"Total districts tagged: {analysis_df['#sunday_pharmacy_access'].sum()}")


--- Verification of #sunday_pharmacy_access Tag Assignment ---
   district_id  sunday_pharmacy_count      sunday_pharmacy_tag
0     11001001                      4  #sunday_pharmacy_access
1     11002002                      1  #sunday_pharmacy_access
2     11003003                      1  #sunday_pharmacy_access
5     11006006                      1  #sunday_pharmacy_access
6     11007007                      1  #sunday_pharmacy_access
10    11011011                      1  #sunday_pharmacy_access

 Tag #sunday_pharmacy_access created and integrated as a boolean column.
Total districts tagged: 6


The independent access tag (#sunday_pharmacy_access) has been successfully assigned. The next block executes the multi-layered tagging logic for the "Amenities & Services" category.

It computes all composite tags (Tier 2 and Tier 3), applies the full hierarchical cleanup, and transforms the resulting boolean assignments into the long (district_id, tag_name) format required for bulk insertion into the `berlin_labels.district_labels_new` table.

In [98]:
# PARAMETERS DEFINITION

# Column names shared by the threshold rules below.
POP_COEFF_COLUMN = 'population_coefficient'
AREA_COEFF_COLUMN = 'area_coefficient'
HOSPITAL_COUNT_COLUMN = 'hospital_count'
PHARMACY_COUNT_COLUMN = 'pharmacy_count'
DENTAL_COUNT_COLUMN = 'dental_office_count'


# APPLY LOGIC FOR ALL INDIVIDUAL (BASE) TAGS

# Every base tag follows the same rule:
#     district value  >  average of that value * district coefficient
# Each entry is (tag, value column, average, coefficient column).
# The three score-based tags and the pharmacy/dental counts scale by the
# population coefficient; hospital density scales by the area coefficient.
base_tag_rules = [
    ('#strong_primary_adult_care', 'total_primary_adult_score', avg_primary_adult, POP_COEFF_COLUMN),
    ('#strong_pediatric_care', 'total_pediatric_score', avg_pediatric, POP_COEFF_COLUMN),
    ('#specialist_hub', 'specialist_score_total', avg_specialist, POP_COEFF_COLUMN),
    ('#high_hospital_density', HOSPITAL_COUNT_COLUMN, avg_hospitals, AREA_COEFF_COLUMN),
    ('#many_pharmacies', PHARMACY_COUNT_COLUMN, avg_pharmacys, POP_COEFF_COLUMN),
    ('#many_dental_clinics', DENTAL_COUNT_COLUMN, avg_dental_offices, POP_COEFF_COLUMN),
]

for tag_name, value_column, baseline, coeff_column in base_tag_rules:
    threshold = baseline * analysis_df[coeff_column]
    analysis_df[tag_name] = analysis_df[value_column] > threshold


# HIERARCHY DEFINITION

# Tier 2 (#core_primary_care_hub) stands in for these two base tags.
tags_to_suppress_by_core_hub = ['#strong_primary_adult_care', '#strong_pediatric_care']

# Tier 3 (#full_spectrum_healthcare) stands in for all of these tags; they are
# also exactly the components that must all be True for it to be assigned.
tags_to_suppress_by_full_spectrum = [
    '#core_primary_care_hub',
    '#specialist_hub',
    '#many_pharmacies',
    '#many_dental_clinics',
    '#high_hospital_density',
]


# CREATE COMPOSITE TAGS

# Composite 1 (Tier 2): both primary-care base tags are True.
analysis_df['#core_primary_care_hub'] = (
    analysis_df[tags_to_suppress_by_core_hub].all(axis=1)
)

# Composite 2 (Tier 3 - TOP TIER): every component tag is True.
analysis_df['#full_spectrum_healthcare'] = (
    analysis_df[tags_to_suppress_by_full_spectrum].all(axis=1)
)


# APPLY HIERARCHICAL CLEANUP (REPLACEMENT)

# Wherever a higher-tier tag is True, switch off the tags it replaces.
# Order matters only for readability here: core hub first, then full spectrum.
for winning_tag, suppressed_tags in (
    ('#core_primary_care_hub', tags_to_suppress_by_core_hub),
    ('#full_spectrum_healthcare', tags_to_suppress_by_full_spectrum),
):
    analysis_df.loc[analysis_df[winning_tag], suppressed_tags] = False


# FINAL VERIFICATION & ETL TRANSFORMATION

all_tags = (
    tags_to_suppress_by_full_spectrum
    + tags_to_suppress_by_core_hub
    + ['#full_spectrum_healthcare', '#sunday_pharmacy_access']
)
# De-duplicate and sort so the tag order is stable.
final_tags_list = sorted(set(all_tags))

print("\n FINAL Amenities & Services Tag Assignment Complete ")
print(f"Total districts with Full Spectrum tag: {analysis_df['#full_spectrum_healthcare'].sum()}")


# ETL FINAL STEP: TRANSFORM TO LONG FORMAT (FOR DB UPLOAD)

# Keep only district_id plus the boolean tag columns.
df_tags_only = analysis_df[['district_id'] + final_tags_list].copy()

# Wide -> long: one row per (district_id, tag) with its boolean value.
df_long = pd.melt(
    df_tags_only,
    id_vars=['district_id'],
    value_vars=final_tags_list,
    var_name='label',
    value_name='is_assigned',
)

# Keep only the rows where the tag is assigned, and only the columns needed
# for the upload; then attach the constant category column.
assigned_rows = df_long['is_assigned'].eq(True)
df_final_upload = df_long.loc[assigned_rows, ['district_id', 'label']].copy()
df_final_upload['category'] = 'Amenities & Services'

print("\n--- FINAL LIST OF ASSIGNED TAGS FOR DB UPLOAD (district_id, tag_name) ---")
print(df_final_upload)



 FINAL Amenities & Services Tag Assignment Complete 
Total districts with Full Spectrum tag: 2

--- FINAL LIST OF ASSIGNED TAGS FOR DB UPLOAD (district_id, tag_name) ---
    district_id                       label              category
2      11003003      #core_primary_care_hub  Amenities & Services
6      11007007      #core_primary_care_hub  Amenities & Services
7      11008008      #core_primary_care_hub  Amenities & Services
15     11004004   #full_spectrum_healthcare  Amenities & Services
17     11006006   #full_spectrum_healthcare  Amenities & Services
24     11001001      #high_hospital_density  Amenities & Services
25     11002002      #high_hospital_density  Amenities & Services
33     11010010      #high_hospital_density  Amenities & Services
34     11011011      #high_hospital_density  Amenities & Services
36     11001001        #many_dental_clinics  Amenities & Services
38     11003003        #many_dental_clinics  Amenities & Services
42     11007007        #many_dental_c

In [99]:
# PARAMETERS FOR DB UPLOAD
TARGET_TABLE = 'district_labels_new'
TARGET_SCHEMA = 'berlin_labels'

# Columns that together identify one uploaded label row.
LABEL_KEY_COLUMNS = ['district_id', 'label', 'category']

try:
    # Check if the 'engine' object is available
    if 'engine' not in locals() and 'engine' not in globals():
        raise NameError("The 'engine' object for database connection is not defined.")

    # A plain append would insert the same rows again every time this cell is
    # re-run. To make the cell safe to re-run, rows whose
    # (district_id, label, category) already exist in the target are skipped.
    # Nothing is deleted or updated.
    rows_to_upload = df_final_upload
    target_exists = sa.inspect(engine).has_table(TARGET_TABLE, schema=TARGET_SCHEMA)

    # engine.begin() runs the read and the insert in one transaction that is
    # committed on success and rolled back if an exception is raised.
    with engine.begin() as connection:
        if target_exists:
            existing_rows = pd.read_sql(
                sa.text(
                    f"SELECT {', '.join(LABEL_KEY_COLUMNS)} "
                    f"FROM {TARGET_SCHEMA}.{TARGET_TABLE}"
                ),
                connection,
            )
            # Compare as strings so a dtype difference between the DataFrame
            # and the table (e.g. integer vs. text district_id) cannot hide a match.
            existing_keys = set(
                existing_rows[LABEL_KEY_COLUMNS]
                .astype(str)
                .itertuples(index=False, name=None)
            )
            candidate_keys = (
                df_final_upload[LABEL_KEY_COLUMNS]
                .astype(str)
                .itertuples(index=False, name=None)
            )
            is_new_row = pd.Series(
                [key not in existing_keys for key in candidate_keys],
                index=df_final_upload.index,
                dtype=bool,
            )
            rows_to_upload = df_final_upload[is_new_row]

        if not rows_to_upload.empty:
            # if_exists='append' adds the new tags without deleting existing data
            rows_to_upload.to_sql(
                name=TARGET_TABLE,
                con=connection,
                schema=TARGET_SCHEMA,
                if_exists='append',
                index=False  # Do not upload the DataFrame index as a column
            )

    print(f" Success! {len(rows_to_upload)} tags successfully uploaded to the database.")
    skipped_count = len(df_final_upload) - len(rows_to_upload)
    if skipped_count:
        print(f" Skipped {skipped_count} tags already present in {TARGET_SCHEMA}.{TARGET_TABLE}.")

except NameError as e:
    print(f"Fatal Error: {e}. Please ensure your SQLAlchemy 'engine' object is initialized.")
except Exception as e:
    print(f"An error occurred during database upload: {e}")

 Success! 25 tags successfully uploaded to the database.
