In [63]:
# Load the Berlin regional statistics CSV into a pandas data frame and preview it:

import pandas as pd

df = pd.read_csv(filepath_or_buffer="../sources/statistical_regional_data_berlin.csv")
df

Unnamed: 0,neighbourhood,year,inhabitants,total_area_ha,share_forest_water_agriculture,forest_area_ha,water_area_ha,agriculture_area_ha,population_density_per_ha,number_of_residences,living_space_per_resident_m2
0,Mitte,2012,329969,3947.0,3.60%,–,142.0,–,83.6,185209.0,39.2
1,Friedrichshain-Kreuzberg,2012,259483,2034.0,6.70%,4,132.0,1,127.6,145328.0,39.3
2,Pankow,2012,364794,10307.0,32.60%,1350,159.0,8,35.4,205778.0,41.2
3,Charlottenburg-Wilmersdorf,2012,298567,6472.0,29.50%,1622,281.0,794,46.1,184637.0,48.9
4,Spandau,2012,218935,9187.0,37.00%,1706,897.0,26,23.8,117907.0,40.2
...,...,...,...,...,...,...,...,...,...,...,...
139,Neukölln,2023,330017,4493.0,2.80%,2,86.0,40,73.0,167543.0,35.5
140,Treptow-Köpenick,2023,294081,16773.0,53.00%,6738,2045.0,99,18.0,156747.0,38.5
141,Marzahn-Hellersdorf,2023,291948,6182.0,4.50%,79,126.0,73,47.0,148053.0,36.2
142,Lichtenberg,2023,311881,5212.0,13.60%,55,109.0,547,60.0,166121.0,34.8


In [64]:
# The source file calls the column `neighbourhood`; from here on it is named `district`:

df = df.rename({'neighbourhood': 'district'}, axis='columns')

In [65]:
# The area columns use an en dash ('–') as a placeholder (presumably "no such area");
# swap that placeholder for 0. No explicit numeric cast happens in this cell.

area_columns = ['forest_area_ha', 'water_area_ha', 'agriculture_area_ha']

df[area_columns] = df[area_columns].replace(to_replace='–', value=0)

In [66]:
# Strip the literal percent sign so that only the number remains (the values are still text):

df["share_forest_water_agriculture"] = df["share_forest_water_agriculture"].str.replace(
    '%', '', regex=False
)

In [67]:
# Inspect dtypes and non-null counts to see which columns are still text (object)
# and therefore need a numeric conversion before any arithmetic
# (it's 'share_forest_water_agriculture', 'forest_area_ha' and 'agriculture_area_ha'):

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   district                        144 non-null    object 
 1   year                            144 non-null    int64  
 2   inhabitants                     144 non-null    int64  
 3   total_area_ha                   120 non-null    float64
 4   share_forest_water_agriculture  120 non-null    object 
 5   forest_area_ha                  120 non-null    object 
 6   water_area_ha                   120 non-null    float64
 7   agriculture_area_ha             120 non-null    object 
 8   population_density_per_ha       120 non-null    float64
 9   number_of_residences            120 non-null    float64
 10  living_space_per_resident_m2    120 non-null    float64
dtypes: float64(5), int64(2), object(4)
memory usage: 12.5+ KB


In [68]:
# Filling missing values for 2018 and 2021:
#
# For these two years only the 'inhabitants' figures of the existing rows are kept;
# every other measure is rebuilt as the mean of the previous and the following year
# of the same district.

# List of columns to average (excluding 'inhabitants', 'year' and 'district'):

cols_to_avg = [
    'total_area_ha',
    'share_forest_water_agriculture',
    'forest_area_ha',
    'water_area_ha',
    'agriculture_area_ha',
    'population_density_per_ha',
    'number_of_residences',
    'living_space_per_resident_m2'
]

# Convert 'share_forest_water_agriculture', 'forest_area_ha', 'agriculture_area_ha' to float.
# With errors='coerce' any entry that cannot be parsed becomes NaN instead of raising:

for col in ['share_forest_water_agriculture', 'forest_area_ha', 'agriculture_area_ha']:
    df[col] = pd.to_numeric(df[col], errors='coerce')


def fill_year_from_neighbours(frame: pd.DataFrame, year: int, value_cols: list) -> pd.DataFrame:
    """Rebuild the rows of `year` from the two adjacent years.

    For every district, each column in `value_cols` is set to the mean of its values
    in `year - 1` and `year + 1`. The 'inhabitants' value already stored for `year`
    is kept. Rows of all other years are passed through unchanged.

    Parameters
    ----------
    frame : DataFrame with at least 'district', 'year', 'inhabitants' and `value_cols`.
    year : the year whose rows are replaced.
    value_cols : numeric columns to average.

    Returns
    -------
    A new DataFrame sorted by 'year' and 'district' with a fresh 0..n-1 index.
    `frame` itself is not modified.

    Notes
    -----
    The addition is aligned on district, so a district that is absent from either
    adjacent year ends up with NaN in `value_cols`.
    """
    by_district = frame.set_index('district')
    previous_year = by_district.loc[by_district['year'] == year - 1, value_cols]
    following_year = by_district.loc[by_district['year'] == year + 1, value_cols]
    target_year = by_district.loc[by_district['year'] == year]

    rebuilt = (previous_year + following_year) / 2
    # Index-aligned assignment: each district keeps its own inhabitants figure.
    rebuilt['inhabitants'] = target_year['inhabitants']
    rebuilt['year'] = year
    # Turn 'district' back into a regular column so the frames line up in concat.
    rebuilt = rebuilt.reset_index()

    other_years = frame[frame['year'] != year]
    combined = pd.concat([other_years, rebuilt], ignore_index=True)

    # Sort by year and district:
    return combined.sort_values(by=['year', 'district']).reset_index(drop=True)


# 2018 is rebuilt from 2017/2019 and 2021 from 2020/2022:

for gap_year in (2018, 2021):
    df = fill_year_from_neighbours(df, gap_year, cols_to_avg)


In [70]:
# Cast count-like columns to whole numbers and keep one decimal place for the ratio columns:

# Rounded to 0 decimals and stored with the nullable 'Int64' dtype:
columns_int = [
    'year',
    'inhabitants',
    'total_area_ha',
    'forest_area_ha',
    'water_area_ha',
    'agriculture_area_ha',
    'number_of_residences'
]

# Rounded to one decimal place:
columns_1d = [
    'population_density_per_ha',
    'living_space_per_resident_m2'
]

# Names that are not present in the frame are skipped.
df = df.assign(
    **{col: df[col].round(0).astype('Int64') for col in columns_int if col in df.columns},
    **{col: df[col].round(1) for col in columns_1d if col in df.columns},
)

In [71]:
# Express share_forest_water_agriculture as a fraction of 1 (0.036 instead of 3.6), 3 decimals:

df["share_forest_water_agriculture"] = df["share_forest_water_agriculture"].div(100).round(3)

In [72]:
# Add `district_id` column:
#
# The keys below are the capitalised district names, so this lookup only works
# while `district` still holds those names (i.e. before it is converted to lowercase).

district_map = {
    "Mitte": "01",
    "Friedrichshain-Kreuzberg": "02",
    "Pankow": "03",
    "Charlottenburg-Wilmersdorf": "04",
    "Spandau": "05",
    "Steglitz-Zehlendorf": "06",
    "Tempelhof-Schöneberg": "07",
    "Neukölln": "08",
    "Treptow-Köpenick": "09",
    "Marzahn-Hellersdorf": "10",
    "Lichtenberg": "11",
    "Reinickendorf": "12"
}

district_ids = df["district"].map(district_map)

# A name without an entry in district_map maps to NaN, and .astype(str) would then
# store the literal text 'nan' as the id. Stop with a clear message instead.
unmapped_districts = df.loc[district_ids.isna(), "district"].unique().tolist()
if unmapped_districts:
    raise ValueError(
        f"No district_id defined for district name(s): {unmapped_districts}"
    )

df["district_id"] = district_ids.astype(str)

In [73]:
# Store the district names in lowercase:

df = df.assign(district=df['district'].str.lower())

In [74]:
# Check data frame info (column dtypes and non-null counts) after all transformations:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   district                        144 non-null    object 
 1   year                            144 non-null    Int64  
 2   inhabitants                     144 non-null    Int64  
 3   total_area_ha                   144 non-null    Int64  
 4   share_forest_water_agriculture  144 non-null    float64
 5   forest_area_ha                  144 non-null    Int64  
 6   water_area_ha                   144 non-null    Int64  
 7   agriculture_area_ha             144 non-null    Int64  
 8   population_density_per_ha       144 non-null    float64
 9   number_of_residences            144 non-null    Int64  
 10  living_space_per_resident_m2    144 non-null    float64
 11  district_id                     144 non-null    object 
dtypes: Int64(7), float64(3), object(2)
m

In [75]:
# Save to csv (if needed):

# df.to_csv('regional_statistics.csv')

In [76]:
# Creating table and populating it:
# Import necessary libraries

# NOTE(review): psycopg2 is not referenced by name in the visible cells — presumably it is
# imported to make sure the PostgreSQL driver is installed; confirm before removing.
import psycopg2
from sqlalchemy import create_engine, text
import warnings

# Suppresses all warnings from this point on.
warnings.filterwarnings("ignore")

In [77]:
# Read the database URL from a text file kept outside the notebook, then build the engine:

with open("../../../_db_login/layered_db_url.txt") as url_file:
    DATABASE_URL = url_file.read().strip()

engine = create_engine(DATABASE_URL)

In [78]:
# Create the regional_statistics table (only if it does not exist yet), including its
# CHECK constraints and the foreign key to berlin_data.districts

create_table_sql = text('''
    CREATE TABLE IF NOT EXISTS berlin_data.regional_statistics (
        district_id VARCHAR(2) NOT NULL,
        district VARCHAR(32) NOT NULL,
        year SMALLINT NOT NULL CHECK (year >= 2000 AND year <= EXTRACT(YEAR FROM CURRENT_DATE)),
        inhabitants INT NOT NULL CHECK (inhabitants >= 0),
        total_area_ha INT NOT NULL CHECK (total_area_ha >= 0),
        share_forest_water_agriculture DECIMAL(10, 3)
            CHECK (share_forest_water_agriculture >= 0 AND share_forest_water_agriculture <= 1),
        forest_area_ha INT CHECK (forest_area_ha >= 0),
        water_area_ha INT CHECK (water_area_ha >= 0),
        agriculture_area_ha INT CHECK (agriculture_area_ha >= 0),
        population_density_per_ha DECIMAL(10, 2) NOT NULL CHECK (population_density_per_ha > 0),
        number_of_residences INT NOT NULL CHECK (number_of_residences > 0),
        living_space_per_resident_m2 DECIMAL(10, 2) NOT NULL CHECK (living_space_per_resident_m2 > 0),
        CONSTRAINT district_id_fk
            FOREIGN KEY (district_id)
            REFERENCES berlin_data.districts(district_id)
            ON DELETE RESTRICT
            ON UPDATE CASCADE
    )
    ''')

# engine.begin() commits when the block exits without an error.
with engine.begin() as conn:
    conn.execute(create_table_sql)

In [79]:
# Populate aws berlin_project_db with regional_statistics data
#
# The table definition used in this notebook declares no unique key, so a plain append
# would insert every row again each time this cell is re-run. To keep the load
# repeatable, only rows whose (district_id, year) pair is not stored yet are inserted.
# Rows that already exist are skipped, not updated. Lookup and insert share one
# transaction, which is committed when the block exits without an error.

with engine.begin() as conn:
    stored_keys = pd.read_sql(
        text("SELECT district_id, year FROM berlin_data.regional_statistics"),
        conn,
    )
    already_stored = set(
        zip(stored_keys["district_id"], stored_keys["year"].astype(int))
    )

    # Compare on plain Python ints so the nullable Int64 year column in df and the
    # integer column read from the database produce equal keys.
    is_new = [
        (district_id, int(year)) not in already_stored
        for district_id, year in zip(df["district_id"], df["year"])
    ]

    rows_inserted = df.loc[is_new].to_sql(
        name='regional_statistics',
        con=conn,
        schema='berlin_data',
        if_exists='append',
        index=False
    )

# Show the value returned by to_sql as the cell output.
rows_inserted

144