In [5]:
import pandas as pd
import geopandas as gpd
import os
from datetime import datetime

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [6]:
from sqlalchemy import create_engine, text, inspect
from sqlalchemy.exc import SQLAlchemyError
import psycopg2

## 11. 🔌 Database Connection Setup (Step 2/5)

### 🎯 **What We'll Do in This Step:**
Learn how to create secure database connections and handle connection parameters.

### 📚 **Key Concepts You'll Learn:**
1. **Connection Strings** - How to format PostgreSQL URLs
2. **Environment Variables** - Secure way to store credentials
3. **Connection Testing** - Verify database connectivity
4. **Error Handling** - What to do when connections fail

### 🧠 **Database Connection Components:**
- **Host**: Where your PostgreSQL server is running (localhost, IP, or domain)
- **Port**: Usually 5432 for PostgreSQL
- **Database**: Name of your specific database
- **Username/Password**: Your credentials
- **PostGIS**: Spatial extension that must be enabled

### 🔒 **Security Best Practices:**
- Never hardcode passwords in notebooks
- Use environment variables or config files
- Test connections before proceeding

### 🔧 **What This Step Accomplishes:**
- Create a connection string template
- Test database connectivity
- Handle connection errors gracefully
- Prepare for schema creation

**Ready to connect to your database?**

In [7]:
# 🔌 Step 2: Database Connection Setup

print("🔌 DATABASE CONNECTION SETUP")
print("=" * 40)

# Database connection parameters
# 🔒 SECURITY NOTE: In production, use environment variables!
print("📋 Setting up connection parameters...")

# Neon database connection (learning environment)
DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

# For display purposes, parse the URL components
DB_CONFIG = {
    'host': 'ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech',
    'port': '5432',
    'database': 'neondb',
    'username': 'neondb_owner',
    'password': 'npg_CeS9fJg2azZD'
}

print(f"   🖥️  Host: {DB_CONFIG['host']}")
print(f"   🔌 Port: {DB_CONFIG['port']}")
print(f"   🗄️  Database: {DB_CONFIG['database']}")
print(f"   👤 Username: {DB_CONFIG['username']}")
print(f"   🔒 Password: {'*' * len(DB_CONFIG['password'])}")

# Create connection string
connection_string = DATABASE_URL
engine = create_engine(connection_string, echo=False)

print(f"\n🔗 Connection String Format:")
print(f"   postgresql+psycopg2://username:password@host:port/database")

# Test connection (without actually connecting yet)
print(f"\n🧪 TESTING CONNECTION SETUP:")
try:
    # Create engine (this doesn't connect yet, just validates the URL)
    engine = create_engine(connection_string, echo=False)
    print("✅ Connection string format is valid!")
    
    # Test if we can actually connect
    print("🔍 Testing actual database connection...")
    
    with engine.connect() as conn:
        # Test basic connection
        result = conn.execute(text("SELECT version();"))
        version = result.fetchone()[0]
        print(f"✅ Connected successfully!")
        print(f"   📊 PostgreSQL version: {version[:50]}...")
        
        # Check if PostGIS is available
        try:
            result = conn.execute(text("SELECT PostGIS_version();"))
            postgis_version = result.fetchone()[0]
            print(f"✅ PostGIS is available!")
            print(f"   🗺️  PostGIS version: {postgis_version}")
        except Exception as e:
            print(f"⚠️  PostGIS not detected - you may need to enable it")
            print(f"   💡 Run: CREATE EXTENSION IF NOT EXISTS postgis;")
            
except SQLAlchemyError as e:
    print(f"❌ Database connection failed!")
    print(f"   Error type: {type(e).__name__}")
    print(f"   Details: {str(e)[:100]}...")
    print(f"\n💡 TROUBLESHOOTING TIPS:")
    print(f"   1. Check if PostgreSQL is running")
    print(f"   2. Verify host, port, username, password")
    print(f"   3. Ensure database '{DB_CONFIG['database']}' exists")
    print(f"   4. Check firewall/network settings")
    
except Exception as e:
    print(f"❌ Unexpected error: {str(e)[:100]}...")

print(f"\n📋 NEXT STEP: Create database schema and tables")
print(f"💡 TIP: If connection failed, fix the issue before proceeding!")

🔌 DATABASE CONNECTION SETUP
📋 Setting up connection parameters...
   🖥️  Host: ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech
   🔌 Port: 5432
   🗄️  Database: neondb
   👤 Username: neondb_owner
   🔒 Password: ****************

🔗 Connection String Format:
   postgresql+psycopg2://username:password@host:port/database

🧪 TESTING CONNECTION SETUP:
✅ Connection string format is valid!
🔍 Testing actual database connection...
✅ Connected successfully!
   📊 PostgreSQL version: PostgreSQL 17.5 on x86_64-pc-linux-gnu, compiled b...
✅ PostGIS is available!
   🗺️  PostGIS version: 3.5 USE_GEOS=1 USE_PROJ=1 USE_STATS=1

📋 NEXT STEP: Create database schema and tables
💡 TIP: If connection failed, fix the issue before proceeding!


## 🔍 Investigating Existing `test_berlin_data` Schema

Based on your ERD file, I can see that the `test_berlin_data` schema already exists with multiple tables! Let's examine what's already there before we proceed.

### 📋 **Tables I see in your ERD:**
- `neighborhood` (the foundation table other tables reference)
- `crime_statistics`
- `green_spaces` 
- `hospitals`
- `land_prices`
- `long_term_rentals`
- `neighborhood_pop_stat`
- `playgrounds`
- `regional_statistics`
- `rent_stats_per_neighborhood`
- `rent_stats_per_neighborhood_kai`
- `rent_stats_per_street`
- `short_time_listings`
- `ubahn`

All of these tables have foreign key relationships pointing to the `neighborhood` table. Let's check what's actually in your database now!

In [8]:
# 🔍 Let's check what's in the test_berlin_data schema

print("🔍 INVESTIGATING EXISTING test_berlin_data SCHEMA")
print("=" * 50)

try:
    with engine.connect() as conn:
        # 1. Check if test_berlin_data schema exists
        print("\n📊 1. CHECKING SCHEMA EXISTENCE")
        print("-" * 40)
        
        schema_exists = conn.execute(text("""
            SELECT schema_name 
            FROM information_schema.schemata 
            WHERE schema_name = 'test_berlin_data'
        """)).fetchone()
        
        if schema_exists:
            print("✅ test_berlin_data schema EXISTS!")
        else:
            print("❌ test_berlin_data schema NOT FOUND")
            print("💡 We may need to create it or use a different schema")
        
        # 2. List all tables in test_berlin_data schema
        print("\n📋 2. TABLES IN test_berlin_data SCHEMA")
        print("-" * 40)
        
        tables_result = conn.execute(text("""
            SELECT table_name, table_type
            FROM information_schema.tables 
            WHERE table_schema = 'test_berlin_data'
            ORDER BY table_name
        """))
        
        tables = tables_result.fetchall()
        
        if tables:
            print(f"Found {len(tables)} tables in test_berlin_data schema:")
            for table in tables:
                print(f"   📋 {table[0]} ({table[1]})")
        else:
            print("❌ No tables found in test_berlin_data schema")
        
        # 3. Check specifically for neighborhood table structure
        print("\n🏘️ 3. NEIGHBORHOOD TABLE ANALYSIS")
        print("-" * 40)
        
        try:
            # Check if neighborhood table exists and its structure
            neighborhood_cols = conn.execute(text("""
                SELECT column_name, data_type, is_nullable, column_default
                FROM information_schema.columns 
                WHERE table_schema = 'test_berlin_data' AND table_name = 'neighborhood'
                ORDER BY ordinal_position
            """)).fetchall()
            
            if neighborhood_cols:
                print("✅ neighborhood table found! Structure:")
                for col in neighborhood_cols:
                    nullable = "NULL" if col[2] == "YES" else "NOT NULL"
                    default = f" DEFAULT {col[3]}" if col[3] else ""
                    print(f"      • {col[0]}: {col[1]} {nullable}{default}")
                
                # Check data count
                row_count = conn.execute(text("""
                    SELECT COUNT(*) FROM test_berlin_data.neighborhood
                """)).scalar()
                print(f"\n   📊 Records in neighborhood table: {row_count}")
                
                # Show sample data
                if row_count > 0:
                    sample_data = conn.execute(text("""
                        SELECT * FROM test_berlin_data.neighborhood LIMIT 3
                    """)).fetchall()
                    print("   📋 Sample data:")
                    for row in sample_data:
                        print(f"      {dict(row._mapping)}")
                        
            else:
                print("❌ neighborhood table not found in test_berlin_data schema")
                
        except Exception as e:
            print(f"⚠️ Error checking neighborhood table: {str(e)[:60]}...")
        
        # 4. Check foreign key relationships
        print("\n🔗 4. FOREIGN KEY RELATIONSHIPS")
        print("-" * 40)
        
        fk_result = conn.execute(text("""
            SELECT 
                tc.table_name as child_table,
                kcu.column_name as child_column,
                ccu.table_name AS parent_table,
                ccu.column_name AS parent_column,
                rc.delete_rule,
                rc.update_rule
            FROM information_schema.table_constraints AS tc 
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
                AND tc.table_schema = kcu.table_schema
            JOIN information_schema.constraint_column_usage AS ccu
                ON ccu.constraint_name = tc.constraint_name
                AND ccu.table_schema = tc.table_schema
            JOIN information_schema.referential_constraints AS rc
                ON tc.constraint_name = rc.constraint_name
                AND tc.table_schema = rc.constraint_schema
            WHERE tc.constraint_type = 'FOREIGN KEY'
                AND tc.table_schema = 'test_berlin_data'
            ORDER BY tc.table_name
        """))
        
        foreign_keys = fk_result.fetchall()
        
        if foreign_keys:
            print(f"Found {len(foreign_keys)} foreign key relationships:")
            for fk in foreign_keys:
                print(f"   🔗 {fk[0]}.{fk[1]} → {fk[2]}.{fk[3]} (DEL: {fk[4]}, UPD: {fk[5]})")
        else:
            print("❌ No foreign key relationships found")
        
        # 5. Summary and recommendations
        print("\n💡 5. RECOMMENDATIONS")
        print("-" * 40)
        
        if schema_exists and tables:
            print("✅ SCHEMA EXISTS - We should work with existing structure!")
            print("📋 Options:")
            print("   A) Use existing neighborhood table (check if districts/neighborhoods match)")
            print("   B) Add data to existing tables if they're empty") 
            print("   C) Coordinate with team about existing structure")
            print("\n🎯 NEXT STEP: Check if existing neighborhood data matches your cleaned data")
        else:
            print("⚠️ Schema or tables missing - may need to create them")
            print("💡 Consider creating test_berlin_data schema if it doesn't exist")

except Exception as e:
    print(f"❌ Error investigating schema: {e}")
    print("💡 This might indicate connection issues or permission problems")

print(f"\n🔍 SCHEMA INVESTIGATION COMPLETE!")
print(f"💬 Let me know what you'd like to do with the existing structure!")

🔍 INVESTIGATING EXISTING test_berlin_data SCHEMA

📊 1. CHECKING SCHEMA EXISTENCE
----------------------------------------
✅ test_berlin_data schema EXISTS!

📋 2. TABLES IN test_berlin_data SCHEMA
----------------------------------------
Found 16 tables in test_berlin_data schema:
   📋 crime_statistics (BASE TABLE)
   📋 districts (BASE TABLE)
   📋 green_spaces_kai (BASE TABLE)
   📋 hospitals (BASE TABLE)
   📋 land_prices (BASE TABLE)
   📋 long_term_rentals (BASE TABLE)
   📋 neighborhood (BASE TABLE)
   📋 neighborhood_pop_stat (BASE TABLE)
   📋 neighborhoods (BASE TABLE)
   📋 playgrounds (BASE TABLE)
   📋 regional_statistics (BASE TABLE)
   📋 rent_stats_per_neighborhood (BASE TABLE)
   📋 rent_stats_per_street (BASE TABLE)
   📋 rent_stats_per_street_kai (BASE TABLE)
   📋 short_time_listings (BASE TABLE)
   📋 ubahn (BASE TABLE)

🏘️ 3. NEIGHBORHOOD TABLE ANALYSIS
----------------------------------------
✅ neighborhood table found! Structure:
      • neighborhood: character varying NOT NULL


In [9]:
# 🎯 FOCUSED CHECK: Key info about test_berlin_data schema

print("🎯 FOCUSED SCHEMA CHECK")
print("=" * 30)

try:
    with engine.connect() as conn:
        # Quick check: Does schema exist and what tables?
        schema_check = conn.execute(text("""
            SELECT COUNT(*) as table_count
            FROM information_schema.tables 
            WHERE table_schema = 'test_berlin_data'
        """)).scalar()
        
        print(f"📊 Tables in test_berlin_data schema: {schema_check}")
        
        if schema_check > 0:
            # List table names only
            table_names = conn.execute(text("""
                SELECT table_name
                FROM information_schema.tables 
                WHERE table_schema = 'test_berlin_data'
                ORDER BY table_name
            """)).fetchall()
            
            print("📋 Table names:")
            for table in table_names:
                print(f"   • {table[0]}")
            
            # Check neighborhood table specifically
            print(f"\n🏘️ NEIGHBORHOOD TABLE CHECK:")
            try:
                neighborhood_info = conn.execute(text("""
                    SELECT 
                        COUNT(*) as row_count,
                        COUNT(DISTINCT CASE WHEN column_name LIKE '%district%' THEN column_name END) as district_cols,
                        COUNT(DISTINCT CASE WHEN column_name LIKE '%geometry%' THEN column_name END) as geometry_cols
                    FROM test_berlin_data.neighborhood 
                    CROSS JOIN information_schema.columns 
                    WHERE table_schema = 'test_berlin_data' AND table_name = 'neighborhood'
                """)).fetchone()
                
                print(f"   📊 Rows: {neighborhood_info[0]}")
                print(f"   🏛️ District columns: {neighborhood_info[1]}")  
                print(f"   🗺️ Geometry columns: {neighborhood_info[2]}")
                
                # Show column structure
                cols = conn.execute(text("""
                    SELECT column_name, data_type
                    FROM information_schema.columns 
                    WHERE table_schema = 'test_berlin_data' AND table_name = 'neighborhood'
                    ORDER BY ordinal_position
                """)).fetchall()
                
                print(f"   📋 Columns: {[col[0] for col in cols]}")
                
            except Exception as e:
                print(f"   ❌ Neighborhood table issue: {str(e)[:50]}...")
        
        else:
            print("❌ No tables found in test_berlin_data schema")
            print("💡 The schema might not exist or be empty")

except Exception as e:
    print(f"❌ Error: {e}")

print(f"\n💡 CONCLUSION:")
if schema_check > 0:
    print("✅ test_berlin_data schema EXISTS with tables!")
    print("🎯 We should work with the existing structure")
    print("📋 Next: Check if we need to populate or modify existing tables")
else:
    print("⚠️ Schema appears to be missing or empty")
    print("🎯 We may need to create the schema structure")

🎯 FOCUSED SCHEMA CHECK
📊 Tables in test_berlin_data schema: 16
📋 Table names:
   • crime_statistics
   • districts
   • green_spaces_kai
   • hospitals
   • land_prices
   • long_term_rentals
   • neighborhood
   • neighborhood_pop_stat
   • neighborhoods
   • playgrounds
   • regional_statistics
   • rent_stats_per_neighborhood
   • rent_stats_per_street
   • rent_stats_per_street_kai
   • short_time_listings
   • ubahn

🏘️ NEIGHBORHOOD TABLE CHECK:
   📊 Rows: 39
   🏛️ District columns: 0
   🗺️ Geometry columns: 2
   📋 Columns: ['neighborhood', 'geometry', 'geometry_str']

💡 CONCLUSION:
✅ test_berlin_data schema EXISTS with tables!
🎯 We should work with the existing structure
📋 Next: Check if we need to populate or modify existing tables
