In [1]:
import getpass
import os

import sqlalchemy as sa

# Build the connection URL for the local database without storing any
# credentials in the notebook. Values come from environment variables:
#   LAYEREDDB_USER      - database user (prompted for when unset)
#   LAYEREDDB_PASSWORD  - database password (prompted for, hidden, when unset)
#   LAYEREDDB_HOST      - defaults to "localhost"
#   LAYEREDDB_PORT      - defaults to 5433
#   LAYEREDDB_NAME      - defaults to "layereddb"
# NOTE(review): a password used to be hardcoded in this cell; it should be
# rotated, since it is still present in the notebook's history.
# URL.create() escapes special characters in the user name and password,
# which a hand-written connection string does not.
db_uri = sa.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=os.environ.get("LAYEREDDB_USER") or input("Database user: "),
    password=os.environ.get("LAYEREDDB_PASSWORD") or getpass.getpass("Database password: "),
    host=os.environ.get("LAYEREDDB_HOST", "localhost"),
    port=int(os.environ.get("LAYEREDDB_PORT", "5433")),
    database=os.environ.get("LAYEREDDB_NAME", "layereddb"),
)

# Create the Engine object, keeping pool_pre_ping for reliability
engine = sa.create_engine(db_uri, pool_pre_ping=True)

print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


In [3]:
from sqlalchemy import inspect

# Build an Inspector on top of the engine; it is used to query database metadata
inspector = inspect(engine)

# Ask the inspector for the names of all schemas
schemas = inspector.get_schema_names()

# Heading on one line, the list of schema names on the next
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


We will work with two of these schemas: 'berlin_source_data' and 'berlin_labels'.

In [6]:
# Look up every table name that belongs to the 'berlin_source_data' schema
tables_in_schema = inspector.get_table_names(schema='berlin_source_data')

# Show a heading (preceded by a blank line) followed by the table list
print("\nTables in schema 'berlin_source_data':", tables_in_schema, sep="\n")


Tables in schema 'berlin_source_data':
['theaters', 'pools_refactored', 'district_level_aggregated', 'district_attributes_test', 'bus_tram_stops', 'malls', 'banks', 'social_clubs_activities', 'veterinary_clinics_martin_svitek', 'hospitals_refactored', 'pharmacies', 'supermarkets', 'bike_lanes', 'pools', 'hospitals', 'land_prices', 'test_table_george_smelin', 'gyms', 'universities', 'venues', 'dental_offices', 'post_offices', 'kindergartens', 'sbahn', 'schools', 'short_term_listings', 'districts', 'ubahn', 'long_term_listings', 'veterinary_clinics', 'milieuschutz_protection_zones', 'neighborhoods', 'parks', 'regional_statistics', 'crime_statistics', 'districts_pop_stat', 'playgrounds', 'rent_stats_per_neighborhood']


In [7]:
# Look up every table name that belongs to the 'berlin_labels' schema
tables_in_schema = inspector.get_table_names(schema='berlin_labels')

# Show a heading (preceded by a blank line) followed by the table list
print("\nTables in schema 'berlin_labels':", tables_in_schema, sep="\n")


Tables in schema 'berlin_labels':
['district_features', 'district_attributes', 'district_labels_new', 'district_features_test', 'district_labels', 'neighborhood_labels']


The 'district_features' table contains the per-district counts needed here (banks, post offices, supermarkets and malls, alongside the transport stop counts), and the 'district_attributes' table contains the calculated area and population scaling coefficients, so I will use these two tables for the labeling process.

In [8]:
import pandas as pd

# Schema-qualified name of the table to load
table_name = 'berlin_labels.district_features'

# Select every column and row of that table
query = "SELECT * FROM " + table_name

# pd.read_sql runs the query through the engine and returns the rows
# as a Pandas DataFrame
district_features = pd.read_sql(query, engine)

# Preview the first rows
print(district_features.head())

  district_id  bus_tram_stop_count  uban_station_count  bank_count  \
0    11012012                  254                  10          23   
1    11004004                  264                  23          47   
2    11009009                  314                   0          24   
3    11003003                  292                   3          26   
4    11008008                  261                  13          20   

   post_office_count  supermarket_count  mall_count  num_sport_clubs  \
0                 15                 90           6               23   
1                 38                129           4               20   
2                  7                 96           8               47   
3                 34                161           8               19   
4                 18                119           6               14   

   num_gyms  num_pools  
0        14         12  
1        59         15  
2        24         17  
3        83          9  
4        21          

In [9]:
# Schema-qualified name of the table to load
table_name = 'berlin_labels.district_attributes'

# Select every column and row of that table
query = "SELECT * FROM " + table_name

# pd.read_sql runs the query through the engine and returns the rows
# as a Pandas DataFrame
district_attributes = pd.read_sql(query, engine)

# Preview the first rows
print(district_attributes.head())

  district_id  area_sq_km  inhabitants  area_coefficient  \
0    11004004   64.662978       343081          0.871208   
1    11002002   20.389118       293454          0.274704   
2    11011011   52.091363       311881          0.701830   
3    11010010   61.782422       291948          0.832398   
4    11001001   39.379173       397134          0.530558   

   population_coefficient  
0                1.061595  
1                0.908034  
2                0.965053  
3                0.903374  
4                1.228851  


In [14]:
# Mean of each amenity count column over all rows of district_features;
# these averages are the baselines for the hub thresholds computed later
avg_banks, avg_post_offices, avg_supermarkets, avg_malls = (
    district_features[count_column].mean()
    for count_column in (
        'bank_count',
        'post_office_count',
        'supermarket_count',
        'mall_count',
    )
)

print(avg_banks, avg_post_offices, avg_supermarkets, avg_malls)

26.916666666666668 20.333333333333332 112.91666666666667 8.666666666666666


In [16]:
import numpy as np
import pandas as pd

# --- Step 1: Merge DataFrames ---
# Join the amenity counts with the scaling coefficients on the district key.
# Relies on 'district_features' and 'district_attributes' loaded in earlier cells.
analysis_df = district_features.merge(district_attributes, on='district_id')

# --- Step 2: Calculate "Hub" Statuses ---
# A district is a hub for an amenity when its count exceeds the overall average
# (avg_banks etc., computed in an earlier cell) scaled by its area coefficient.
is_bank_hub = analysis_df['bank_count'].gt(avg_banks * analysis_df['area_coefficient'])
is_post_hub = analysis_df['post_office_count'].gt(avg_post_offices * analysis_df['area_coefficient'])
is_supermarket_hub = analysis_df['supermarket_count'].gt(avg_supermarkets * analysis_df['area_coefficient'])
is_mall_hub = analysis_df['mall_count'].gt(avg_malls * analysis_df['area_coefficient'])

# --- Step 3: Calculate Convenience Score (0-3) ---
# Only the three core amenities count towards the score; malls are excluded
analysis_df['convenience_score'] = sum(
    hub_flag.astype(int)
    for hub_flag in (is_bank_hub, is_post_hub, is_supermarket_hub)
)

# --- Step 4: Apply Hierarchy for Convenience Tags ---
# np.select takes the first matching condition, so the order below is the priority
conditions = [
    analysis_df['convenience_score'].eq(3) & is_mall_hub,  # -> #commercial_hotspot
    analysis_df['convenience_score'].eq(3),                # -> #highly_convenient
    analysis_df['convenience_score'].eq(2),                # -> #daily_convenience
]

# Tags in the same order as the conditions
choices = ['#commercial_hotspot', '#highly_convenient', '#daily_convenience']

# Each district gets its single highest-priority tag, or 'Not Tagged'
analysis_df['convenience_tag'] = np.select(conditions, choices, default='Not Tagged')

# --- Step 5: Format the result of Part 1 ---
# Keep tagged districts only and reshape to (district_id, category, label)
convenience_tags_df = (
    analysis_df.loc[
        analysis_df['convenience_tag'].ne('Not Tagged'),
        ['district_id', 'convenience_tag'],
    ]
    .rename(columns={'convenience_tag': 'label'})
    .assign(category='Amenities & Services')
    .loc[:, ['district_id', 'category', 'label']]
)

print("--- Results from Part 1 (Convenience Tags) ---")
print(convenience_tags_df)

--- Results from Part 1 (Convenience Tags) ---
   district_id              category                label
1     11004004  Amenities & Services   #highly_convenient
3     11003003  Amenities & Services   #daily_convenience
4     11008008  Amenities & Services  #commercial_hotspot
5     11011011  Amenities & Services   #daily_convenience
9     11001001  Amenities & Services  #commercial_hotspot
10    11002002  Amenities & Services  #commercial_hotspot
11    11007007  Amenities & Services   #highly_convenient


In [17]:
# --- Step 1: Define the Stricter Threshold for Malls ---
# Area-scaled average mall count, raised by a factor of 1.5
mall_threshold = (avg_malls * analysis_df['area_coefficient']).mul(1.5)

# --- Step 2: Check if a district is a shopping destination ---
is_shopping_destination = analysis_df['mall_count'].gt(mall_threshold)

# --- Step 3: Assign the tag ---
analysis_df['shopping_tag'] = np.where(
    is_shopping_destination,
    '#shopping_destination',
    'Not Tagged',
)

# --- Step 4: Format the result of Part 2 ---
# Keep tagged districts only and reshape to (district_id, category, label)
shopping_tags_df = (
    analysis_df.loc[
        analysis_df['shopping_tag'].ne('Not Tagged'),
        ['district_id', 'shopping_tag'],
    ]
    .rename(columns={'shopping_tag': 'label'})
    .assign(category='Amenities & Services')
    .loc[:, ['district_id', 'category', 'label']]
)

print("\n--- Results from Part 2 (Shopping Tag) ---")
print(shopping_tags_df)


--- Results from Part 2 (Shopping Tag) ---
   district_id              category                  label
5     11011011  Amenities & Services  #shopping_destination
6     11010010  Amenities & Services  #shopping_destination
9     11001001  Amenities & Services  #shopping_destination
10    11002002  Amenities & Services  #shopping_destination


In [18]:
# --- Combine both sets of tags into one final DataFrame ---
final_tags_df = pd.concat([convenience_tags_df, shopping_tags_df], ignore_index=True)

print("\n--- Final Combined Tags for Upload ---")
print(final_tags_df)

# --- Upload to the Database ---
# The target table is appended to, so simply re-running this cell would insert
# the same rows a second time. To keep the cell safe to re-run, rows whose
# (district_id, category, label) combination is already stored are skipped.
key_columns = ['district_id', 'category', 'label']
try:
    # One transaction for the lookup and the insert: committed when the block
    # finishes, rolled back if anything inside it raises.
    with engine.begin() as connection:
        if sa.inspect(connection).has_table('district_labels_new', schema='berlin_labels'):
            existing_tags_df = pd.read_sql(
                sa.text(
                    "SELECT district_id, category, label "
                    "FROM berlin_labels.district_labels_new"
                ),
                connection,
            )
        else:
            # to_sql creates the table on first use, so nothing can be a duplicate yet
            existing_tags_df = pd.DataFrame(columns=key_columns)

        # Compare keys as strings so that a dtype difference between the
        # DataFrame and the database columns cannot hide a match. The right
        # side is de-duplicated, so the left join keeps exactly one row per
        # row of final_tags_df, in the same order.
        match_df = final_tags_df[key_columns].astype(str).merge(
            existing_tags_df[key_columns].astype(str).drop_duplicates(),
            on=key_columns,
            how='left',
            indicator=True,
        )
        is_new = (match_df['_merge'] == 'left_only').to_numpy()
        new_tags_df = final_tags_df.loc[is_new]

        new_tags_df.to_sql(
            'district_labels_new',
            connection,
            schema='berlin_labels',
            if_exists='append',
            index=False
        )

    print(f"\n✅ Successfully uploaded {len(new_tags_df)} amenity tags.")
    skipped_count = len(final_tags_df) - len(new_tags_df)
    if skipped_count:
        print(f"Skipped {skipped_count} tags that were already stored.")
except Exception as e:
    print(f"\n❌ An error occurred during upload: {e}")
    # Re-raise so a failed upload is not mistaken for a finished run
    raise


--- Final Combined Tags for Upload ---
   district_id              category                  label
0     11004004  Amenities & Services     #highly_convenient
1     11003003  Amenities & Services     #daily_convenience
2     11008008  Amenities & Services    #commercial_hotspot
3     11011011  Amenities & Services     #daily_convenience
4     11001001  Amenities & Services    #commercial_hotspot
5     11002002  Amenities & Services    #commercial_hotspot
6     11007007  Amenities & Services     #highly_convenient
7     11011011  Amenities & Services  #shopping_destination
8     11010010  Amenities & Services  #shopping_destination
9     11001001  Amenities & Services  #shopping_destination
10    11002002  Amenities & Services  #shopping_destination

✅ Successfully uploaded 11 amenity tags.
