In [2]:
# 📓 Berlin Schools Transformation Notebook (Step 2)
# For PR branch: `schools-data-transformation`

import pandas as pd
import numpy as np


In [3]:
# 🔹 Load the input CSV file

input_path = r"C:\Users\Maxdesk\Desktop\Webeet\berlin_schools\final\berlin_schools.csv"
df = pd.read_csv(input_path)
df

Unnamed: 0,bsn,school_name,school_type_de,ownership_en,school_category_de,district,neighborhood,postal_code,street,street_number,...,students_f,students_m,teachers_total,teachers_f,teachers_m,startchancen_flag,lon,lat,schoolinductionzones_id,district_id
0,01A04,Berlin-Kolleg,Kolleg,Public,Andere Schule,Mitte,Moabit,10551,Turmstraße,75,...,,,,,,False,13.334,52.527,109.0,1.0
1,01B01,"OSZ Banken, Immobilien und Versicherungen",Oberstufenzentrum,Public,Berufsschule,Mitte,Moabit,10557,Alt-Moabit,10,...,642.0,961.0,76.0,41.0,35.0,False,13.358,52.524,46.0,1.0
2,01B02,Staatliche Technikerschule Berlin,Fachschule,Public,Berufsschule,Mitte,Moabit,10555,Bochumer Straße,8B,...,56.0,424.0,57.0,25.0,32.0,False,13.338,52.523,87.0,1.0
3,01B03,"OSZ Kommunikations-, Informations- und Medient...",Oberstufenzentrum,Public,Berufsschule,Mitte,Gesundbrunnen,13359,Osloer Straße,23,...,310.0,916.0,82.0,40.0,42.0,True,13.380,52.557,118.0,1.0
4,01B04,OSZ Gesundheit I,Oberstufenzentrum,Public,Berufsschule,Mitte,Wedding,13349,Schwyzer Straße,6,...,2564.0,338.0,94.0,65.0,29.0,False,13.353,52.558,110.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
922,06P08,Sancta-Maria-Schule der Hedwigschwestern,"Förderschwerp. ""Lernen""u.""Geistige Entwickl.""",Private,Privatschule,Steglitz-Zehlendorf,Nikolassee,14109,Dreilindenstraße,24,...,,,,,,False,13.194,52.422,361.0,6.0
923,06P21,Internationale Montessorischule Berlin (Grunds...,Grundschule,Private,Privatschule,Steglitz-Zehlendorf,Wannsee,14109,Zum Heckeshorn,38,...,,,,,,False,13.163,52.430,377.0,6.0
924,06Y04,Dreilinden-Gymnasium,Gymnasium,Public,Gymnasium,Steglitz-Zehlendorf,Nikolassee,14109,Dreilindenstraße,49,...,369.0,397.0,72.0,46.0,26.0,False,13.188,52.421,361.0,6.0
925,,,,,,,,0,,,...,203194.0,222876.0,35636.0,24598.0,11038.0,,,,,


In [10]:
# Check what columns could  be converted 

df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 927 entries, 0 to 926
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bsn                      925 non-null    string 
 1   school_name              925 non-null    string 
 2   school_type_de           925 non-null    string 
 3   ownership_en             925 non-null    string 
 4   school_category_de       925 non-null    string 
 5   district                 925 non-null    string 
 6   neighborhood             925 non-null    string 
 7   postal_code              927 non-null    int64  
 8   street                   925 non-null    string 
 9   street_number            925 non-null    string 
 10  website                  895 non-null    string 
 11  school_year              925 non-null    string 
 12  school_category_en       925 non-null    string 
 13  total_students           704 non-null    Int64  
 14  students_f               7

In [16]:
# 🛠️ Rename column and check formatting of IDs
# Rename 'schoolinductionzones_id' → 'schoolinductionzone_id' if still wrong
if "schoolinductionzones_id" in df.columns:
    df = df.rename(columns={"schoolinductionzones_id": "schoolinductionzone_id"})

# Ensure both ID fields are still Int64 and do not show .0 decimals

df["schoolinductionzone_id"] = df["schoolinductionzone_id"].astype("Int64")

# 🧪 Optional: Check that district_id values still have leading zero if exported as string later
print(df["district_id"].head(10))  # If you want to inspect

# Reminder: when you export to CSV, pandas may *not* show leading 0 unless you force string type
# If leading zeros are important visually in the output, convert to string like this:
# df["district_id_str"] = df["district_id"].astype("string").str.zfill(2)


0    01
1    01
2    01
3    01
4    01
5    01
6    01
7    01
8    01
9    01
Name: district_id, dtype: string


In [17]:
# 🔄 Transform column types according to schema

df["bsn"] = df["bsn"].astype("string")

df[[
    "school_name", "school_type_de", "ownership_en", "school_category_de",
    "district", "neighborhood", "street", "street_number", "website",
    "school_year", "school_category_en", "startchancen_flag", "district_id"
]] = df[[
    "school_name", "school_type_de", "ownership_en", "school_category_de",
    "district", "neighborhood", "street", "street_number", "website",
    "school_year", "school_category_en", "startchancen_flag", "district_id"
]].astype("string")

df["postal_code"] = df["postal_code"].fillna(0).astype("int64")

df[[
    "total_students", "students_f", "students_m",
    "teachers_total", "teachers_f", "teachers_m", "schoolinductionzone_id"
]] = df[[
    "total_students", "students_f", "students_m",
    "teachers_total", "teachers_f", "teachers_m", "schoolinductionzone_id"
]].astype("Int64") #here i have many nan i think int64 instead float will cause less problems

df["lat"] = df["lat"].astype("float64")
df["lon"] = df["lon"].astype("float64")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 927 entries, 0 to 926
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   bsn                     925 non-null    string 
 1   school_name             925 non-null    string 
 2   school_type_de          925 non-null    string 
 3   ownership_en            925 non-null    string 
 4   school_category_de      925 non-null    string 
 5   district                925 non-null    string 
 6   neighborhood            925 non-null    string 
 7   postal_code             927 non-null    int64  
 8   street                  925 non-null    string 
 9   street_number           925 non-null    string 
 10  website                 895 non-null    string 
 11  school_year             925 non-null    string 
 12  school_category_en      925 non-null    string 
 13  total_students          704 non-null    Int64  
 14  students_f              705 non-null    In

In [11]:
## this cells just was part of the old code here i kept it just documentation wise
# reviewer said it wasn't needed since i extracted lon lat aswell
# also the district_id was included after this notebook and i didn't use this notebook but the file inside scipts (d_id)
# since i didn't want to repeat it here i just reloaded the latest csv inside here, just some minor agjustements in the next cell
# so for cocumentation purpose the the code from the script folder
#import pandas as pd
#import numpy as np

# Load your input CSV
#csv_path =(r"C:\Users\Maxdesk\Desktop\Webeet\berlin_schools\final\berlin_schools.csv")
#df = pd.read_csv(csv_path)

# Map of districts → IDs (with leading zeros)
#district_map = {
#    "Mitte": "01",
#    "Friedrichshain-Kreuzberg": "02",
#    "Pankow": "03",
#    "Charlottenburg-Wilmersdorf": "04",
#    "Spandau": "05",
#    "Steglitz-Zehlendorf": "06",
#    "Tempelhof-Schöneberg": "07",
#    "Neukölln": "08",
#    "Treptow-Köpenick": "09",
#    "Marzahn-Hellersdorf": "10",
#    "Lichtenberg": "11",
#    "Reinickendorf": "12"
#}

# Add the `district_id` column based on name → ID map
#df["district_id"] = df["district"].map(district_map)

# Fill *only* truly empty (blank) fields with np.nan (do NOT touch zeros)
#df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Drop geometry if present
#df.drop(columns=[col for col in df.columns if "geometry" in col.lower()], errors="ignore", inplace=True)

# keep district_id as string for the future 
#df["district_id"] = df["district_id"].astype(str).str.zfill(2)

# Save final version
#df.to_csv("berlin_schools_did.csv", index=False)
#print("✅ Saved with NaN-filled blanks and proper district IDs.")


✅ Saved with NaN-filled blanks and proper district IDs.


### 

In [24]:
# Creating table and populating it
# Import necessary libraries

import psycopg2
from sqlalchemy import create_engine, text
import warnings

warnings.filterwarnings("ignore")

In [25]:
# 🌐 Set up your NeonDB connection
DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

engine = create_engine(DATABASE_URL)

In [26]:
# 🏗️ Create table (if it doesn't exist)
with engine.connect() as conn:
    conn.execute(text('''
        CREATE TABLE IF NOT EXISTS test_berlin_data.schools (
            bsn VARCHAR(16) PRIMARY KEY,
            school_name VARCHAR(128) NOT NULL,
            school_type_de VARCHAR(64) NOT NULL,
            ownership_en VARCHAR(32) NOT NULL,
            school_category_de VARCHAR(64) NOT NULL,
            district VARCHAR(64) NOT NULL,
            district_id VARCHAR(2) NOT NULL,
            neighborhood VARCHAR(64) NOT NULL,
            postal_code INT NOT NULL,
            street VARCHAR(64) NOT NULL,
            street_number VARCHAR(16) NOT NULL,
            website TEXT,
            school_year VARCHAR(16) NOT NULL,
            school_category_en VARCHAR(64) NOT NULL,
            total_students INT,
            students_f INT,
            students_m INT,
            teachers_total INT,
            teachers_f INT,
            teachers_m INT,
            schoolinductionzone_id INT,
            startchancen_flag VARCHAR(16) NOT NULL,
            lon DOUBLE PRECISION NOT NULL,
            lat DOUBLE PRECISION NOT NULL
        )
    '''))
    conn.commit()


OperationalError: (psycopg2.OperationalError) connection to server at "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech" (3.131.64.200), port 5432 failed: ERROR:  Your project has exceeded the data transfer quota. Upgrade your plan to increase limits.

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [27]:
print(df.columns.tolist())

['bsn', 'school_name', 'school_type_de', 'ownership_en', 'school_category_de', 'district', 'neighborhood', 'postal_code', 'street', 'street_number', 'website', 'school_year', 'school_category_en', 'total_students', 'students_f', 'students_m', 'teachers_total', 'teachers_f', 'teachers_m', 'startchancen_flag', 'lon', 'lat', 'schoolinductionzone_id', 'district_id']


In [28]:
# 📤 Export the cleaned & transformed dataset for DB insert (Step 3)
output_path = r"C:\Users\Maxdesk\Desktop\Webeet\berlin_schools\final\berlin_schools.csv"
df.to_csv(output_path, index=False)
print(f"✅ Exported transformed data to {output_path}") #to reup that in my schools folder 

✅ Exported transformed data to C:\Users\Maxdesk\Desktop\Webeet\berlin_schools\final\berlin_schools.csv


In [None]:
# 🚀 Upload your DataFrame to the NeonDB table
df.to_sql(
    name='berlin_schools',
    con=engine,
    schema='test_berlin_data',
    if_exists='append',  # Use 'replace' for testing or wiping before reupload
    index=False
)