In [None]:
#Read the csv file
import os
import warnings
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
warnings.filterwarnings("ignore")
# Load the two input tables from the working directory:
#   df     <- ibb_planungsraeume.csv
#   df_bez <- ibb_bezirke_cleaned_temp.csv
df = pd.read_csv(filepath_or_buffer='ibb_planungsraeume.csv')
df_bez = pd.read_csv(filepath_or_buffer='ibb_bezirke_cleaned_temp.csv')
df

In [None]:
# Add a `street_id` column filled with a constant placeholder value.
# The following cell overwrites it with the original `district_id` values.
df = df.assign(street_id=1100103)
df

In [None]:
# Keep the full identifier as `street_id`, then shorten `district_id` by
# dropping its last six characters so only the leading digits remain.
# The shortened value is still a string at this point.
full_identifier = df['district_id']
df = df.assign(
    street_id=full_identifier,
    district_id=full_identifier.astype(str).str.slice(stop=-6),
)
df

In [None]:
# Cast the shortened `district_id` from string to int64 so it can be used
# as the join key against df_bez in the next step.
df = df.astype({'district_id': 'int64'})
df_bez

In [None]:
# Attach the district name from the Bezirke table to every Planungsräume row.
# Only the join key and the name are taken from df_bez; the merge is pandas'
# default inner join, and exact duplicate rows are removed afterwards.
df_bez_subset = df_bez.loc[:, ["district_id", "district"]]
df = df.merge(df_bez_subset, on='district_id').drop_duplicates()
df

In [None]:
# `district_id` keeps its name: renaming it to `neighborhood_id` and straight
# back again is a no-op, so no rename is performed in this cell.
df

In [None]:
# Turn the textual "keine Daten" marker (optionally padded with whitespace)
# in `median_net_rent` into a real missing value.
no_data_pattern = r"^\s*keine Daten\s*$"
median_rent_raw = df["median_net_rent"]
df["median_net_rent"] = median_rent_raw.replace(no_data_pattern, np.nan, regex=True)

In [None]:
# With the text marker replaced by NaN, the column can be cast to float.
df = df.astype({'median_net_rent': 'float64'})

In [None]:
# Replace the "keine Daten" marker with an actual np.nan (not a string) in all
# three statistic columns. The same whitespace-tolerant pattern as for
# `median_net_rent` is used, so a padded marker such as " keine Daten" cannot
# survive and break the later float conversion of the other two columns.
stat_columns = ["median_net_rent", "number_of_cases", "mean_net_rent"]
df[stat_columns] = df[stat_columns].replace(
    to_replace=r"^\s*keine Daten\s*$", value=np.nan, regex=True
)
df


In [None]:
# Rename both rent columns in a single pass by adding a `_per_m2` suffix.
df = df.rename(
    columns={
        'median_net_rent': 'median_net_rent_per_m2',
        'mean_net_rent': 'mean_net_rent_per_m2',
    }
)

In [None]:
# Fix the column dtypes and add `neighborhood` as a string copy of `district`.
# NOTE(review): `neighborhood` duplicates the district name — confirm that is intended.
df = df.assign(neighborhood=df['district'].astype('string')).astype(
    {
        'number_of_cases': 'float64',
        'street_name': 'string',
        'median_net_rent_per_m2': 'float64',
        'mean_net_rent_per_m2': 'float64',
    }
)

In [None]:
# District names in code order: the n-th name (1-based) receives the
# zero-padded two-digit string code "01" .. "12".
district_names_in_order = [
    'mitte',
    'friedrichshain-kreuzberg',
    'pankow',
    'charlottenburg-wilmersdorf',
    'spandau',
    'steglitz-zehlendorf',
    'tempelhof-schöneberg',
    'neukölln',
    'treptow-köpenick',
    'marzahn-hellersdorf',
    'lichtenberg',
    'reinickendorf',
]
district_id_map = {
    name: f"{position:02d}"
    for position, name in enumerate(district_names_in_order, start=1)
}

# Look up each row's code by its district name; names that are not keys of
# the map (the keys are all lowercase) end up as NaN.
df = df.assign(district_id=df['district'].map(district_id_map))
df


In [None]:
# Preview the frame only; nothing is written here. The CSV export (df.to_csv) happens in a later cell.
 
df

In [None]:
# Show the dtype of every column to check the conversions made above.
df.dtypes

In [None]:
#df = df.drop('district_id', axis=1)

In [None]:
# Move `district` to the front; all other columns keep their relative order.
col = 'district'
other_columns = df.columns[df.columns != col]
df = df.loc[:, [col, *other_columns]]
df

In [None]:
# Persist the cleaned frame next to the notebook, without the index column.
cleaned_csv_path = 'ibb_planungsraeume_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
df

In [None]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname
#
# The password is read from the BERLIN_DB_PASSWORD environment variable so that
# no credential is stored in the notebook. Export it before starting the kernel.
# NOTE(review): any password that was ever committed with this notebook should
# be treated as compromised and rotated.
DB_PASSWORD = os.environ.get("BERLIN_DB_PASSWORD")
if not DB_PASSWORD:
    raise RuntimeError(
        "Environment variable BERLIN_DB_PASSWORD is not set; "
        "it must contain the password of the 'postgres' database user."
    )

DATABASE_URL = (
    # quote_plus escapes characters such as '@', ':' or '/' that would
    # otherwise break URL parsing when they occur inside the password.
    f"postgresql+psycopg2://postgres:{quote_plus(DB_PASSWORD)}"
    "@layered-data-warehouse.cdg2ok68acsn.eu-central-1.rds.amazonaws.com:5432/berlin_project_db"
    "?sslmode=require"
)

# Create the engine. No connection is opened here; SQLAlchemy connects lazily
# the first time the engine is actually used.
engine = create_engine(DATABASE_URL)

In [None]:
# Remove any existing berlin_data.rent_stats_per_street table so it can be
# recreated from scratch. engine.begin() commits when the block exits cleanly.
drop_table_stmt = text('DROP TABLE IF EXISTS berlin_data.rent_stats_per_street')
with engine.begin() as conn:
    conn.execute(drop_table_stmt)

In [None]:
# Create berlin_data.rent_stats_per_street if it does not exist yet.
# `districts` references berlin_data.districts(districts): deleting a referenced
# district is blocked (RESTRICT), updates to it cascade into this table.
create_table_sql = '''
    CREATE  TABLE  IF NOT EXISTS  berlin_data.rent_stats_per_street(
        districts VARCHAR(100) NOT NULL,
        street_name VARCHAR(100) NOT NULL,
        median_net_rent_per_m2 DECIMAL(5,2),
        number_of_cases INT,
        mean_net_rent_per_m2 DECIMAL(5,2),
        year SMALLINT NOT NULL,
        street_id INT NOT NULL,
        CONSTRAINT districts_fk FOREIGN KEY (districts) REFERENCES berlin_data.districts(districts) ON DELETE RESTRICT ON UPDATE CASCADE)
    '''
create_table_stmt = text(create_table_sql)
with engine.begin() as conn:
    conn.execute(create_table_stmt)

In [None]:
# No explicit commit is needed: each `with engine.begin() as conn:` block above
# commits on successful exit (and rolls back on error), and `conn` is already
# closed by the time this cell runs.

In [None]:
# Append the cleaned frame to the `rent_stats_per_street` table through `engine`.
# NOTE(review): this comment used to say "NeonDB", but DATABASE_URL points at an
# AWS RDS host — confirm which database is the intended target.
# NOTE(review): the table is dropped/created in schema `berlin_data` above, while
# this call writes to schema `test_berlin_data` — confirm which schema is intended.
# NOTE(review): df carries columns such as `district`, `district_id` and
# `neighborhood`, whereas the DDL above declares `districts` — verify that the
# frame's columns match the target table before appending.

df.to_sql(
    name='rent_stats_per_street',
    con=engine,
    schema='test_berlin_data',
    if_exists='append',
    index=False
)