In [9]:
import getpass
import os

import sqlalchemy as sa

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. The non-secret parts default to the local setup; the
# password has no default and is prompted for when LAYEREDDB_PASSWORD is unset.
# URL.create() takes the parts separately, so a password containing characters
# such as '@' or '/' does not need manual escaping.
db_uri = sa.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=os.environ.get("LAYEREDDB_USER", "yana_yelnikova"),
    password=os.environ.get("LAYEREDDB_PASSWORD")
    or getpass.getpass("Password for the local 'layereddb' database: "),
    host=os.environ.get("LAYEREDDB_HOST", "localhost"),
    port=int(os.environ.get("LAYEREDDB_PORT", "5433")),
    database=os.environ.get("LAYEREDDB_NAME", "layereddb"),
)

# Create the Engine object, keeping pool_pre_ping for reliability
engine = sa.create_engine(db_uri, pool_pre_ping=True)

print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


In [10]:
from sqlalchemy import inspect
# Build a SQLAlchemy inspector bound to the engine
inspector = inspect(engine)

# Ask the inspector for the names of all schemas in the database
schemas = inspector.get_schema_names()

# Heading and list are emitted by one call; sep="\n" keeps each on its own line
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


We will work with two of these schemas: 'berlin_source_data' and 'berlin_labels'.

In [11]:
# List every table that lives in the 'berlin_source_data' schema
tables_in_schema = inspector.get_table_names(schema='berlin_source_data')

# Show the heading (preceded by a blank line) and then the table list
print("\nTables in schema 'berlin_source_data':", tables_in_schema, sep="\n")


Tables in schema 'berlin_source_data':
['theaters', 'pools_refactored', 'district_level_aggregated', 'district_attributes_test', 'bus_tram_stops', 'malls', 'banks', 'social_clubs_activities', 'veterinary_clinics_martin_svitek', 'hospitals_refactored', 'pharmacies', 'supermarkets', 'bike_lanes', 'pools', 'hospitals', 'land_prices', 'test_table_george_smelin', 'gyms', 'universities', 'venues', 'dental_offices', 'post_offices', 'kindergartens', 'sbahn', 'schools', 'short_term_listings', 'districts', 'ubahn', 'long_term_listings', 'veterinary_clinics', 'milieuschutz_protection_zones', 'neighborhoods', 'parks', 'regional_statistics', 'crime_statistics', 'districts_pop_stat', 'playgrounds', 'rent_stats_per_neighborhood']


In [12]:
# List every table that lives in the 'berlin_labels' schema
tables_in_schema = inspector.get_table_names(schema='berlin_labels')

# Show the heading (preceded by a blank line) and then the table list
print("\nTables in schema 'berlin_labels':", tables_in_schema, sep="\n")


Tables in schema 'berlin_labels':
['district_features', 'district_attributes', 'district_labels_new', 'district_features_test', 'district_labels', 'neighborhood_labels']


Since the 'district_features' table contains all the necessary transport stop counts and the 'district_attributes' table contains the calculated scaling coefficients, I will use these two tables for the labeling process.

In [13]:
import pandas as pd
# Schema-qualified name of the table to load
table_name = 'berlin_labels.district_features'

# Select every column and every row of that table
query = "SELECT * FROM " + table_name

# Run the statement through the engine; pandas returns the rows as a DataFrame
district_features = pd.read_sql(sql=query, con=engine)

# Preview the first five rows
print(district_features.head(n=5))

  district_id  bus_tram_stop_count  uban_station_count  bank_count  \
0    11012012                  254                  10          23   
1    11004004                  264                  23          47   
2    11009009                  314                   0          24   
3    11003003                  292                   3          26   
4    11008008                  261                  13          20   

   post_office_count  supermarket_count  mall_count  num_sport_clubs  \
0                 15                 90           6               23   
1                 38                129           4               20   
2                  7                 96           8               47   
3                 34                161           8               19   
4                 18                119           6               14   

   num_gyms  num_pools  
0        14         12  
1        59         15  
2        24         17  
3        83          9  
4        21          

In [14]:
# Schema-qualified name of the table to load
table_name = 'berlin_labels.district_attributes'

# Select every column and every row of that table
query = "SELECT * FROM {}".format(table_name)

# Run the statement through the engine; pandas returns the rows as a DataFrame
district_attributes = pd.read_sql(sql=query, con=engine)

# Preview the first five rows
print(district_attributes.head(n=5))

  district_id  area_sq_km  inhabitants  area_coefficient  \
0    11004004   64.662978       343081          0.871208   
1    11002002   20.389118       293454          0.274704   
2    11011011   52.091363       311881          0.701830   
3    11010010   61.782422       291948          0.832398   
4    11001001   39.379173       397134          0.530558   

   population_coefficient  
0                1.061595  
1                0.908034  
2                0.965053  
3                0.903374  
4                1.228851  


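Next, we calculate the average number of bus/tram stops and U-Bahn stations across all districts. These averages are the baseline for the hub thresholds used in the next step.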

In [15]:
# Mean number of bus/tram stops over all rows of district_features
avg_bus_tram_stops = district_features.loc[:, 'bus_tram_stop_count'].mean()

# Mean number of U-Bahn stations over all rows of district_features
avg_uban_station = district_features.loc[:, 'uban_station_count'].mean()

# Both averages on one line, separated by a single space
print(avg_bus_tram_stops, avg_uban_station, sep=' ')

241.58333333333334 11.0


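Now we can assign the tags #bus_tram_hub, #uban_hub and #public_transport_hub. A district counts as a hub for a transport type when its stop/station count exceeds the average across districts multiplied by the district's area coefficient; a district that is a hub for both types gets #public_transport_hub.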

In [13]:
import numpy as np

# --- Step 1: Merge DataFrames ---

# Join the stop counts with the scaling coefficients on district_id.
# validate='one_to_one' makes the merge raise if district_id is duplicated in
# either table; without it a duplicate key would silently multiply rows and
# therefore produce duplicate tags further down.
analysis_df = pd.merge(
    district_features,
    district_attributes,
    on='district_id',
    validate='one_to_one',
)

# The merge is an inner join, so a district that has no row in
# district_attributes disappears from analysis_df. The averages were computed
# over all rows of district_features, so make such a loss visible.
unmatched_count = len(district_features) - len(analysis_df)
if unmatched_count:
    print(f"⚠️ {unmatched_count} district(s) have no matching attributes and were not tagged.")


# --- Step 2: Calculate "Hub" Status ---

# A district is a bus/tram hub when its stop count exceeds the overall average
# scaled by the district's area coefficient
is_bus_tram_hub = analysis_df['bus_tram_stop_count'] > (avg_bus_tram_stops * analysis_df['area_coefficient'])

# Same rule for U-Bahn stations
is_uban_hub = analysis_df['uban_station_count'] > (avg_uban_station * analysis_df['area_coefficient'])


# --- Step 3: Apply Hierarchy and Assign Tags ---

# Conditions are listed from strictest to weakest. np.select takes the first
# condition that is true for a row, so the combined condition must come first;
# the two single-type conditions then only match rows that are not hubs for both.
conditions = [
    is_bus_tram_hub & is_uban_hub,  # hub for BOTH transport types
    is_bus_tram_hub,                # hub for bus/tram (and not both)
    is_uban_hub                     # hub for U-Bahn (and not both)
]

# Tags in the same order as the conditions above
choices = [
    '#public_transport_hub',
    '#bus_tram_hub',
    '#uban_hub'
]

# Rows matching none of the conditions get the 'Not a Hub' placeholder
analysis_df['transport_tag'] = np.select(conditions, choices, default='Not a Hub')


# --- Step 4: Display the Result ---
print("Results of transport tagging:")
# Display only the districts that received a tag
print(analysis_df.loc[analysis_df['transport_tag'] != 'Not a Hub', ['district_id', 'transport_tag']])

Results of transport tagging:
   district_id          transport_tag
1     11004004  #public_transport_hub
4     11008008  #public_transport_hub
5     11011011          #bus_tram_hub
6     11010010          #bus_tram_hub
9     11001001  #public_transport_hub
10    11002002  #public_transport_hub
11    11007007              #uban_hub


In [14]:
# --- Step 5: Format Data for Final Table ---

# Keep only the districts that received a tag, rename 'transport_tag' to
# 'label', add the fixed category name and order the columns for the final
# table. Built as one chained expression so that re-running the cell always
# starts from analysis_df and nothing is modified in place.
final_tags_df = (
    analysis_df.loc[analysis_df['transport_tag'] != 'Not a Hub']
    .rename(columns={'transport_tag': 'label'})
    .assign(category='Mobility & Accessibility')
    [['district_id', 'category', 'label']]
)


# --- Step 6: Upload to the Database ---

# Append the new tags to the existing SQL table.
# NOTE(review): if_exists='append' adds rows without checking what is already
# stored, so running this cell more than once inserts the same tags again.
try:
    final_tags_df.to_sql(
        'district_labels_new',
        engine,
        schema='berlin_labels',
        if_exists='append', # Use 'append' to add rows without deleting existing data
        index=False
    )
except Exception as e:
    # Report the failure; the "uploaded" summary below is skipped in this case
    print(f"❌ An error occurred during upload: {e}")
else:
    # Only reached when to_sql() finished without raising
    print(f"✅ Successfully uploaded {len(final_tags_df)} transport tags.")

    # Display the data that was just uploaded
    print("\nData uploaded to the database:")
    print(final_tags_df)

✅ Successfully uploaded 7 transport tags.

Data uploaded to the database:
   district_id                  category                  label
1     11004004  Mobility & Accessibility  #public_transport_hub
4     11008008  Mobility & Accessibility  #public_transport_hub
5     11011011  Mobility & Accessibility          #bus_tram_hub
6     11010010  Mobility & Accessibility          #bus_tram_hub
9     11001001  Mobility & Accessibility  #public_transport_hub
10    11002002  Mobility & Accessibility  #public_transport_hub
11    11007007  Mobility & Accessibility              #uban_hub
