In [None]:
import os

import sqlalchemy as sa

# Read the credentials from the environment instead of embedding them in the
# notebook, so they never end up in version control or in shared copies.
# Set LAYEREDDB_USER and LAYEREDDB_PASSWORD before starting the kernel.
try:
    db_user = os.environ["LAYEREDDB_USER"]
    db_password = os.environ["LAYEREDDB_PASSWORD"]
except KeyError as missing:
    raise RuntimeError(
        f"Environment variable {missing} is not set; "
        "export LAYEREDDB_USER and LAYEREDDB_PASSWORD before running this notebook."
    ) from None

# URL.create escapes special characters (e.g. '@' or ':' inside a password)
# that would corrupt a hand-assembled connection string. The resulting URL
# object is accepted by create_engine and masks the password when printed.
db_uri = sa.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=db_user,
    password=db_password,
    host="localhost",
    port=5433,
    database="layereddb",
)

# Create the Engine object, keeping pool_pre_ping for reliability
engine = sa.create_engine(db_uri, pool_pre_ping=True)

# NOTE: create_engine is lazy - no connection is opened until the engine is
# first used, so a wrong password only surfaces in the next cell that queries.
print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


In [6]:
from sqlalchemy import inspect

# Build a schema inspector bound to the engine
inspector = inspect(engine)

# Ask the database which schemas it exposes
schemas = inspector.get_schema_names()

# Heading and schema list go out on separate lines through a single print call
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


In [7]:
# Look up every table that lives in the 'berlin_source_data' schema
tables_in_schema = inspector.get_table_names(schema='berlin_source_data')

# Emit the heading (preceded by a blank line) followed by the table list
print("\nTables in schema 'berlin_source_data':", tables_in_schema, sep="\n")


Tables in schema 'berlin_source_data':
['theaters', 'pools_refactored', 'theaters_backup_neigh_final', 'district_level_aggregated', 'district_attributes_test', 'bus_tram_stops', 'malls', 'banks', 'social_clubs_activities', 'veterinary_clinics_martin_svitek', 'hospitals_refactored', 'pharmacies', 'supermarkets', 'bike_lanes', 'pools', 'hospitals', 'land_prices', 'test_table_george_smelin', 'gyms', 'universities', 'venues', 'dental_offices', 'post_offices', 'kindergartens', 'sbahn', 'schools', 'short_term_listings', 'districts', 'ubahn', 'long_term_listings', 'veterinary_clinics', 'milieuschutz_protection_zones', 'neighborhoods', 'parks', 'regional_statistics', 'crime_statistics', 'districts_pop_stat', 'playgrounds', 'rent_stats_per_neighborhood']


We are working with the 'berlin_source_data' and 'berlin_labels' schemas.

In [29]:
# Re-list the tables of the 'berlin_source_data' schema (this repeats the
# listing already produced earlier in the notebook for the same schema)
tables_in_schema = inspector.get_table_names(schema='berlin_source_data')

# Heading first (with a leading blank line), then the list on its own line
print(
    "\nTables in schema 'berlin_source_data':",
    tables_in_schema,
    sep="\n",
)


Tables in schema 'berlin_source_data':
['theaters', 'pools_refactored', 'theaters_backup_neigh_final', 'district_level_aggregated', 'district_attributes_test', 'bus_tram_stops', 'malls', 'banks', 'social_clubs_activities', 'veterinary_clinics_martin_svitek', 'hospitals_refactored', 'pharmacies', 'supermarkets', 'bike_lanes', 'pools', 'hospitals', 'land_prices', 'test_table_george_smelin', 'gyms', 'universities', 'venues', 'dental_offices', 'post_offices', 'kindergartens', 'sbahn', 'schools', 'short_term_listings', 'districts', 'ubahn', 'long_term_listings', 'veterinary_clinics', 'milieuschutz_protection_zones', 'neighborhoods', 'parks', 'regional_statistics', 'crime_statistics', 'districts_pop_stat', 'playgrounds', 'rent_stats_per_neighborhood']


In [30]:
# Fetch the names of all tables stored in the 'berlin_labels' schema
tables_in_schema = inspector.get_table_names(schema='berlin_labels')

# Show a heading (after a blank line) and the table names beneath it
print("\nTables in schema 'berlin_labels':", tables_in_schema, sep="\n")


Tables in schema 'berlin_labels':
['district_attributes', 'district_labels_new', 'district_features', 'district_features_test', 'district_labels', 'neighborhood_labels']


Since the `district_features` table contains `total_crime_cases_latest_year` and the `district_attributes` table contains the number of inhabitants per district, I will use these two tables for the labeling process.

In [31]:
import pandas as pd

# Schema-qualified name of the table to load
table_name = 'berlin_labels.district_features'

# Plain "select everything" statement for that table
query = f"SELECT * FROM {table_name}"

# pandas runs the statement through the SQLAlchemy engine and hands back
# the result set as a DataFrame
district_features = pd.read_sql(sql=query, con=engine)

# Preview the first rows
print(district_features.head())

  district_id  bus_tram_stop_count  uban_station_count  sbahn_station_count  \
0    11012012                  254                  10                   38   
1    11004004                  264                  23                   37   
2    11009009                  314                   0                   49   
3    11003003                  292                   3                   37   
4    11008008                  261                  13                    8   

   bank_count  post_office_count  supermarket_count  mall_count  \
0          23                 15                 90           6   
1          47                 38                129           4   
2          24                  7                 96           8   
3          26                 34                161           8   
4          20                 18                119           6   

   num_sport_clubs  num_gyms  num_pools  hospital_count  pharmacy_count  \
0               23        14         12        

In [32]:
# Schema-qualified name of the table to load
table_name = 'berlin_labels.district_attributes'

# Plain "select everything" statement for that table
query = f"SELECT * FROM {table_name}"

# pandas runs the statement through the SQLAlchemy engine and hands back
# the result set as a DataFrame
district_attributes = pd.read_sql(sql=query, con=engine)

# Preview the first rows
print(district_attributes.head())

  district_id  area_sq_km  inhabitants  area_coefficient  \
0    11004004   64.662978       343081          0.871208   
1    11002002   20.389118       293454          0.274704   
2    11011011   52.091363       311881          0.701830   
3    11010010   61.782422       291948          0.832398   
4    11001001   39.379173       397134          0.530558   

   population_coefficient  
0                1.061595  
1                0.908034  
2                0.965053  
3                0.903374  
4                1.228851  


The #low_crime_rate tag is assigned by first calculating the overall crime rate per 100,000 inhabitants for each district using the latest available year's data. Then, the 25th percentile of these crime rates across all districts is determined. Districts with a crime rate below this 25th percentile threshold receive the tag.

In [33]:
# --- Step 1: Prepare the data ---

# Select only the needed columns from each DataFrame
features_needed = district_features[['district_id', 'total_crime_cases_latest_year']]
attributes_needed = district_attributes[['district_id', 'inhabitants']]

# Merge the crime counts with the population data.
# how='left' keeps every district from district_attributes, even when it has
# no matching row in district_features (its crime count is then NaN).
analysis_df = pd.merge(
    attributes_needed,
    features_needed,
    on='district_id',
    how='left'
)

# --- Step 2: Calculate the crime rate per 100k inhabitants ---
# A rate is only meaningful when the crime count is known and the population
# is positive. Every other row keeps NaN on purpose: filling a missing count
# with 0 (or defaulting the rate to 0.0) would make a district with *unknown*
# crime look like the safest one and would also pull down any percentile
# threshold computed from this column. NaN is skipped by pandas statistics
# such as .quantile() and compares False against any threshold.
has_rate_inputs = (
    analysis_df['total_crime_cases_latest_year'].notna()
    & (analysis_df['inhabitants'] > 0)
)
analysis_df['crime_rate_100k'] = (
    (analysis_df['total_crime_cases_latest_year'] / analysis_df['inhabitants']) * 100000
).where(has_rate_inputs)

# Make data gaps visible instead of silently hiding them
districts_without_rate = analysis_df.loc[~has_rate_inputs, 'district_id'].tolist()
if districts_without_rate:
    print(f"⚠️ No crime rate could be computed for districts: {districts_without_rate}")

# --- Step 3: Display the result ---
print("Crime rate per 100,000 inhabitants for the latest year:")
# Sort by crime rate for better readability (rows without a rate sort last)
print(analysis_df[['district_id', 'total_crime_cases_latest_year', 'inhabitants', 'crime_rate_100k']]
      .round(2)
      .sort_values(by='crime_rate_100k'))

Crime rate per 100,000 inhabitants for the latest year:
   district_id  total_crime_cases_latest_year  inhabitants  crime_rate_100k
4     11001001                          71652       397134         18042.27
9     11006006                         104096       310446         33531.11
11    11009009                         119880       294081         40764.28
6     11003003                         173850       424307         40972.69
8     11005005                         115046       257091         44749.14
10    11007007                         181424       355868         50980.70
7     11012012                         147324       268792         54809.67
5     11008008                         204184       330017         61870.75
0     11004004                         233294       343081         67999.69
2     11011011                         216205       311881         69322.91
1     11002002                         256856       293454         87528.54
3     11010010                  

In [35]:
import numpy as np

# --- Step 1: Calculate the 25th percentile threshold ---
# A quarter of the district crime rates lie below this value
crime_rate_threshold = analysis_df['crime_rate_100k'].quantile(0.25)

print(f"The 25th percentile threshold for crime rate is: {crime_rate_threshold:.2f}")

# --- Step 2: Check which districts meet the criteria ---
# Boolean mask: True for districts strictly below the threshold
is_low_crime = analysis_df['crime_rate_100k'] < crime_rate_threshold

# --- Step 3: Assign the tag ---
# Districts under the threshold get the tag, all others the placeholder text
analysis_df['crime_tag'] = np.where(is_low_crime, '#low_crime_rate', 'Not Tagged')

# --- Step 4: Display the result ---
# The tagged rows are exactly the rows selected by the mask; filter once and reuse
tagged_districts = analysis_df.loc[is_low_crime]

print("\nDistricts tagged with #low_crime_rate:")
print(tagged_districts[['district_id', 'crime_rate_100k', 'crime_tag']].round(2))

# --- Step 5: Prepare data for database upload ---
# Add the category, rename 'crime_tag' to 'label' and keep only the three
# columns of the final table, in their required order
crime_tags_df = (
    tagged_districts
    .assign(category='Community & Lifestyle')
    .rename(columns={'crime_tag': 'label'})
    [['district_id', 'category', 'label']]
)

print("\nDataFrame ready for database upload:")
print(crime_tags_df)

The 25th percentile threshold for crime rate is: 40920.59

Districts tagged with #low_crime_rate:
   district_id  crime_rate_100k        crime_tag
4     11001001         18042.27  #low_crime_rate
9     11006006         33531.11  #low_crime_rate
11    11009009         40764.28  #low_crime_rate

DataFrame ready for database upload:
   district_id               category            label
4     11001001  Community & Lifestyle  #low_crime_rate
9     11006006  Community & Lifestyle  #low_crime_rate
11    11009009  Community & Lifestyle  #low_crime_rate


In [36]:
# --- Step 6: Upload to the Database ---
# A plain append inserts the same tags again on every re-run of this cell.
# To keep the upload idempotent, only (district_id, label) pairs that are not
# yet stored in the target table are appended.
# NOTE(review): tags already stored for districts that no longer qualify are
# left untouched - remove them separately if the table must mirror this run.
target_schema = 'berlin_labels'
target_table = 'district_labels_new'

print("\nAttempting to upload crime tags to the database...")
try:
    # engine.begin() wraps the check and the insert in one transaction that is
    # committed on success and rolled back automatically if anything fails.
    with engine.begin() as connection:
        existing_tags = pd.read_sql(
            sa.text(f"SELECT district_id, label FROM {target_schema}.{target_table}"),
            connection,
        )

        # Compare ids as strings so an int/str dtype mismatch between the
        # DataFrame and the database column cannot hide an existing row.
        existing_keys = set(zip(
            existing_tags['district_id'].astype(str),
            existing_tags['label'].astype(str),
        ))
        candidate_keys = zip(
            crime_tags_df['district_id'].astype(str),
            crime_tags_df['label'].astype(str),
        )
        is_new_tag = pd.Series(
            [key not in existing_keys for key in candidate_keys],
            index=crime_tags_df.index,
            dtype=bool,
        )
        new_crime_tags_df = crime_tags_df.loc[is_new_tag]

        if not new_crime_tags_df.empty:
            new_crime_tags_df.to_sql(
                target_table,
                connection,
                schema=target_schema,
                if_exists='append', # Add new rows
                index=False
            )

    print(f"✅ Successfully uploaded {len(new_crime_tags_df)} #low_crime_rate tags.")
    already_present = len(crime_tags_df) - len(new_crime_tags_df)
    if already_present:
        print(f"ℹ️ Skipped {already_present} tags that were already in the table.")
except Exception as e:
    # The transaction above has already been rolled back at this point.
    print(f"❌ An error occurred during upload: {e}")


Attempting to upload crime tags to the database...
✅ Successfully uploaded 3 #low_crime_rate tags.
