# PUSH ALL DATA TO DATABASE

In [1]:
"""
Normalise processed CSV files and upload them into PostgreSQL tables.

This notebook will:
1. Load database connection parameters from a .env file.
2. Create a SQLAlchemy engine.
3. For the files listed in FILES: drop the "ZS" columns, convert every
   column after the first KEEP_TEXT ones to numbers, and rewrite the CSV.
4. Iterate over every CSV in the subfolders of ../data/processed
   (each subfolder is named after the delimiter its files use).
5. Read each CSV into a pandas DataFrame, letting pandas infer the dtypes.
6. Replace (or create) the corresponding table in the target schema.
"""

import re
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from dotenv import dotenv_values
from sqlalchemy import create_engine
from sqlalchemy.types import Text

## LOAD CONFIG

In [2]:
# Parse the .env file into a plain mapping of setting name -> string value
config = dotenv_values()

# Pull out the Postgres settings; a missing key raises KeyError naming it.
# Login credentials
pg_user, pg_pass = config['POSTGRES_USER'], config['POSTGRES_PASS']
# Server endpoint
pg_host, pg_port = config['POSTGRES_HOST'], config['POSTGRES_PORT']
# Target database and schema
pg_db, pg_schema = config['POSTGRES_DB'], config['POSTGRES_SCHEMA']

In [3]:
# Build the SQLAlchemy database URL.
# User name and password are percent-encoded first: characters such as
# "@", ":", "/" or "%" inside a credential would otherwise be read as URL
# syntax and produce a wrong (or unparseable) connection string.
# safe="" makes quote() encode "/" as well.
db_url = (
    f"postgresql://{quote(pg_user, safe='')}:{quote(pg_pass, safe='')}"
    f"@{pg_host}:{pg_port}/{pg_db}"
)

# This engine will manage connections & SQL execution
engine = create_engine(db_url)

## PROCESS CSV FILES

In [4]:
# Folder holding the comma-delimited CSVs (the subfolder is literally named ",")
DATA_DIR = Path("../data/processed/,")

# Files inside DATA_DIR whose number columns get normalised
FILES = [
    "_handelsnamen_pkw.csv",
    "_modellreihen.csv",
]

# Number of leading columns left as text; every later column is parsed as a number
KEEP_TEXT = 5

def _to_float(col: pd.Series) -> pd.Series:
    dash_rx = re.compile(r"^[-\u2013\u2014]$")
    col = col.str.strip()
    col = col.mask(col.str.match(dash_rx) | (col == "."), pd.NA)
    col = col.str.replace(r"\s|\.", "", regex=True)
    col = col.str.replace(",", ".", regex=False)
    return pd.to_numeric(col, errors="coerce")

# Normalise each listed file: drop "ZS" columns, parse number columns, and
# write the cleaned frame back to the same path.
for fname in FILES:
    path = DATA_DIR / fname
    if not path.exists():
        print(f"!! {fname} not found")
        continue

    # The cleaned frame is written back to `path` below, so a second run of
    # this cell would otherwise re-parse numbers it already normalised
    # (pandas writes "." as the decimal mark, which _to_float may read as a
    # thousands separator). Snapshot the untouched export once and always
    # read from that snapshot. The ".raw" suffix keeps the snapshot out of
    # any "*.csv" glob.
    raw_path = path.with_name(path.name + ".raw")
    if not raw_path.exists():
        raw_path.write_bytes(path.read_bytes())

    df = pd.read_csv(raw_path, dtype=str)
    df.columns = df.columns.str.strip()

    # Drop columns whose header contains "ZS" next to a whitespace character.
    # .copy() gives an independent frame so the assignments below are safe.
    zs_mask = df.columns.str.contains(r"ZS\s|\sZS", case=False, regex=True)
    if zs_mask.any():
        df = df.loc[:, ~zs_mask].copy()

    # Everything after the first KEEP_TEXT columns is parsed as a number
    numeric_cols = df.columns[KEEP_TEXT:]
    for col in numeric_cols:
        df[col] = _to_float(df[col])

    df.to_csv(path, index=False, encoding="utf-8")
    df.info()

print("\nReady.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517655 entries, 0 to 517654
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Berichtszeitpunkt  517655 non-null  object
 1   Hersteller         517655 non-null  object
 2   Handelsname        517655 non-null  object
 3   Typschlüssel       517655 non-null  object
 4   Bundesland         517655 non-null  object
 5   Anzahl             517655 non-null  int64 
 6   ObjectId           517655 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 27.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21924 entries, 0 to 21923
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Berichtsjahr                  21924 non-null  object 
 1   Berichtsmonat                 21924 non-null  object 
 2   Segment                       21924 non-null  object 
 3 

In [5]:
# Root of the processed data; the upload loop globs its subfolders for CSV files
data_dir = Path("../data/processed/")

In [6]:
# Upload every CSV found one level below data_dir. Each subfolder is named
# after the delimiter its files use (";" or ","). sorted() makes the upload
# order reproducible across runs.
# NOTE(review): the recorded "'Engine' object has no attribute 'cursor'" error
# is raised inside to_sql when pandas does not recognise the SQLAlchemy engine,
# which usually points at incompatible pandas / SQLAlchemy versions — confirm
# the installed versions before changing this code.
for csv_path in sorted(data_dir.glob("*/*.csv")):
    # The parent folder name doubles as the delimiter. Anything other than a
    # known delimiter would be passed to pandas as a regex separator and
    # silently mis-parse the file, so such folders are skipped.
    sep = csv_path.parent.name
    if sep not in (";", ","):
        print(f"!! {csv_path} skipped: folder '{sep}' is not a delimiter")
        continue

    # Table name comes from the file name: lower-cased, "-" replaced by "_"
    table_name = csv_path.stem.lower().replace("-", "_")

    # dtypes are inferred by pandas; malformed lines are reported as warnings
    df = pd.read_csv(
        csv_path,
        sep=sep,
        engine="python",
        on_bad_lines="warn",
        encoding="utf-8",
    )

    # Write into the target schema, replacing the table if it already exists
    df.to_sql(
        name=table_name,
        con=engine,
        schema=pg_schema,
        if_exists="replace",
        index=False,
    )

    print(f"Uploaded: {pg_schema}.{table_name}")

  df.to_sql(


AttributeError: 'Engine' object has no attribute 'cursor'