##Day 3 – SQL via Python: NYC School Data Exploration

## Step 1: Import Libraries 

In [9]:
!pip install psycopg2-binary



In [10]:
import pandas as pd
import psycopg2

## Step 2: Connect to the Database

In [11]:
# DB connection setup using hardcoded credentials (for onboarding only)
conn = psycopg2.connect(
    dbname="neondb",
    user="neondb_owner",
    password="npg_CeS9fJg2azZD",
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port="5432",
    sslmode="require"
)
cur = conn.cursor()


In [12]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# 🔑 Replace with your real credentials from day3_sql_combined_with_creds.ipynb
db_user = "your_username"
db_pass = "your_password"
db_host = "localhost"   # or the host in your creds file
db_port = "5432"
db_name = "training"

# SQLAlchemy connection string
engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}")

In [13]:
from sqlalchemy import create_engine
import pandas as pd

# build engine string
db_user = "neondb_owner"
db_pass = "npg_CeS9fJg2azZD"
db_host = "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech"
db_port = "5432"
db_name = "neondb"

engine = create_engine(
    f"postgresql+psycopg2://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}?sslmode=require"
)

# test query
query = "SELECT * FROM nyc_schools.high_school_directory LIMIT 5;"
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,dbn,school_name,borough,building_code,phone_number,fax_number,grade_span_min,grade_span_max,expgrade_span_min,expgrade_span_max,...,number_programs,Location 1,Community Board,Council District,Census Tract,Zip Codes,Community Districts,Borough Boundaries,City Council Districts,Police Precincts
0,27Q260,Frederick Douglass Academy VI High School,Queens,Q465,718-471-2154,718-471-2890,9.0,12,,,...,1,"{'latitude': '40.601989336', 'longitude': '-73...",14,31,100802,20529,51,3,47,59
1,21K559,Life Academy High School for Film and Music,Brooklyn,K400,718-333-7750,718-333-7775,9.0,12,,,...,1,"{'latitude': '40.593593811', 'longitude': '-73...",13,47,306,17616,21,2,45,35
2,16K393,Frederick Douglass Academy IV Secondary School,Brooklyn,K026,718-574-2820,718-574-2821,9.0,12,,,...,1,"{'latitude': '40.692133704', 'longitude': '-73...",3,36,291,18181,69,2,49,52
3,08X305,Pablo Neruda Academy,Bronx,X450,718-824-1682,718-824-1663,9.0,12,,,...,1,"{'latitude': '40.822303765', 'longitude': '-73...",9,18,16,11611,58,5,31,26
4,03M485,Fiorello H. LaGuardia High School of Music & A...,Manhattan,M485,212-496-0700,212-724-5748,9.0,12,,,...,6,"{'latitude': '40.773670507', 'longitude': '-73...",7,6,151,12420,20,4,19,12


# 1. School distribution by borough

In [14]:
query = """
SELECT borough, COUNT(DISTINCT dbn) AS total_schools
FROM nyc_schools.high_school_directory
GROUP BY borough
ORDER BY total_schools DESC;
"""
df_schools = pd.read_sql(query, engine)
df_schools

Unnamed: 0,borough,total_schools
0,Brooklyn,121
1,Bronx,118
2,Manhattan,106
3,Queens,80
4,Staten Island,10


# 2. Average % of English Language Learners (ELL) per borough

In [15]:
query = """
SELECT h.borough, ROUND(AVG(d.ell_percent)::numeric, 2) AS avg_ell_percent
FROM nyc_schools.school_demographics d
JOIN nyc_schools.high_school_directory h
    ON d.dbn = h.dbn
GROUP BY h.borough
ORDER BY avg_ell_percent DESC;
"""
df_ell = pd.read_sql(query, engine)
df_ell

Unnamed: 0,borough,avg_ell_percent
0,Manhattan,7.57


In [16]:
query = """
SELECT h.borough, h.school_name, d.sped_percent
FROM nyc_schools.school_demographics d
JOIN nyc_schools.high_school_directory h
    ON d.dbn = h.dbn
WHERE d.sped_percent IS NOT NULL
ORDER BY h.borough, d.sped_percent DESC
LIMIT 15;  -- 3 per borough × 5 boroughs
"""
df_sped = pd.read_sql(query, engine)
df_sped

Unnamed: 0,borough,school_name,sped_percent
0,Manhattan,East Side Community School,28.8
1,Manhattan,East Side Community School,27.7
2,Manhattan,East Side Community School,26.7
3,Manhattan,East Side Community School,26.4
4,Manhattan,Marta Valle High School,25.9
5,Manhattan,East Side Community School,25.1
6,Manhattan,Henry Street School for International Studies,25.1
7,Manhattan,Henry Street School for International Studies,24.9
8,Manhattan,East Side Community School,24.8
9,Manhattan,East Side Community School,24.5


# 3 Top-3 schools per borough by special-education share

In [17]:
query = """
WITH ranked_schools AS (
    SELECT 
        h.borough,
        h.school_name,
        d.sped_percent,
        ROW_NUMBER() OVER (
            PARTITION BY h.borough
            ORDER BY d.sped_percent DESC
        ) AS rank
    FROM nyc_schools.school_demographics d
    JOIN nyc_schools.high_school_directory h
        ON d.dbn = h.dbn
    WHERE d.sped_percent IS NOT NULL
)
SELECT borough, school_name, sped_percent
FROM ranked_schools
WHERE rank <= 3
ORDER BY borough, rank;
"""
df_sped = pd.read_sql(query, engine)
df_sped

Unnamed: 0,borough,school_name,sped_percent
0,Manhattan,East Side Community School,28.8
1,Manhattan,East Side Community School,27.7
2,Manhattan,East Side Community School,26.7


# Key Insights:
- Brooklyn, the Bronx, and Manhattan offer a wider range of school resources and student opportunities, whereas Staten Island provides comparatively fewer options.
- The dataset only records ell_percent for Manhattan, making borough comparisons impossible and highlighting the need to check data completeness before analysis.