In [65]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine, text
import warnings

warnings.filterwarnings("ignore")

In [66]:
# 1. Load your DataFrame
df = pd.read_csv("/Users/sums/Downloads/immowelt_longterm_listings.csv", sep=",")  # comma separator

In [67]:
# 2. Connection settings
user_name = "sumi_dhakal"
password = "0n6vTrvHmQl3gt"
host = "localhost"      # because SSM tunnel forwards here
port = "5433"           
database = "layereddb"
schema = "berlin_source_data"


In [68]:
# Create SQLAlchemy engine
engine = create_engine(
    f"postgresql+psycopg2://{user_name}:{password}@{host}:{port}/{database}")

In [69]:

# Remove duplicate columns (keeps the first occurrence)
df = df.loc[:, ~df.columns.duplicated()]

# Optional: check columns
print(df.columns.tolist())



['detail_url', 'raw_info', 'type', 'first_tenant', 'price_euro', 'number_of_rooms', 'surface_m2', 'floor', 'street', 'house_number', 'neighborhood', 'district', 'postal_code', 'city', 'address', 'latitude', 'longitude', 'geometry', 'district_id', 'listing_id']


In [70]:
df.describe

<bound method NDFrame.describe of                                              detail_url  \
0     https://www.immowelt.de/expose/60a4680b-c666-4...   
1     https://www.immowelt.de/expose/48b02494-b932-4...   
2     https://www.immowelt.de/expose/fb32adca-5d67-4...   
3     https://www.immowelt.de/expose/4985f014-94b9-4...   
4     https://www.immowelt.de/expose/2144d17c-ae55-4...   
...                                                 ...   
1162  https://www.immowelt.de/expose/645c9ad8-9a11-4...   
1163  https://www.immowelt.de/expose/055696ce-efd5-4...   
1164  https://www.immowelt.de/expose/b099a0db-45f4-4...   
1165  https://www.immowelt.de/expose/21fc6a2d-d87a-4...   
1166  https://www.immowelt.de/expose/d81a02d9-be37-4...   

                                               raw_info     type first_tenant  \
0     Wohnung zur Miete - Erstbezug 1.840 € 2 Zimmer...  Wohnung          yes   
1     WG-Zimmer zur Miete Wohnen auf Zeit 1.199 € 20...       WG           no   
2     Wohnung 

In [75]:
# 3. Create target table
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {schema}.long_term_listings(
    listing_id VARCHAR PRIMARY KEY,
    detail_url TEXT,
    raw_info TEXT,
    type VARCHAR,
    first_tenant VARCHAR,
    price_euro INTEGER,
    number_of_rooms FLOAT,
    surface_m2 FLOAT,
    floor FLOAT,
    street VARCHAR,
    house_number VARCHAR,
    neighborhood VARCHAR,
    district VARCHAR,
    postal_code INTEGER,
    city VARCHAR,
    address TEXT,
    latitude FLOAT,
    longitude FLOAT,
    geometry TEXT,
    district_id TEXT,
    CONSTRAINT district_id_fk FOREIGN KEY (district_id)
    REFERENCES berlin_source_data.districts(district_id)
    ON DELETE RESTRICT
    ON UPDATE CASCADE
);
"""

with engine.connect() as conn:
    conn.execute(text(create_table_query))
    conn.commit()

In [76]:
# 4. Upload DataFrame to DB
df.to_sql(
    "long_term_listings",
    engine,
    schema=schema,
    if_exists="append",   # append to existing
    index=False
)

print("✅ Listings uploaded successfully!")

# 5. Test query
query = f"""
SELECT listing_id, price_euro, neighborhood, district
FROM {schema}.long_term_listings
LIMIT 5;
"""

with engine.connect() as conn:
    df_check = pd.read_sql(text(query), conn)

print("Sample rows from DB:")
print(df_check)

✅ Listings uploaded successfully!
Sample rows from DB:
           listing_id  price_euro         neighborhood  \
0   WOH_1840_76_14059        1840       Charlottenburg   
1    WG_1199_20_10625        1199       Charlottenburg   
2  WOH_2772_178_14055        2772              Westend   
3    STU_495_43_13627         495  Charlottenburg-Nord   
4   STU_1625_44_10717        1625          Wilmersdorf   

                     district  
0  Charlottenburg-Wilmersdorf  
1  Charlottenburg-Wilmersdorf  
2  Charlottenburg-Wilmersdorf  
3  Charlottenburg-Wilmersdorf  
4  Charlottenburg-Wilmersdorf  
