In [8]:
!pip -q install pandas python-dotenv SQLAlchemy psycopg2-binary
import pandas as pd, numpy as np

RAW = "sat-results.csv"
CLEAN = "cleaned_sat_results.csv"

keep_map = {
    "DBN": "dbn",
    "SCHOOL NAME": "school_name",
    "Num of SAT Test Takers": "num_of_test_takers",
    "pct_students_tested": "pct_students_tested",
    "SAT Critical Reading Avg. Score": "critical_reading_avg",
    "SAT Math Avg. Score": "math_avg",
    "SAT Writing Avg. Score": "writing_avg"
}

raw = pd.read_csv(RAW)
df = raw[list(keep_map.keys())].rename(columns=keep_map)

def to_number(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().replace("%","").replace(",","")
    if s.lower() in ("", "na", "n/a", "null", "none"): return np.nan
    try: return float(s)
    except: return np.nan

for c in ["num_of_test_takers","critical_reading_avg","math_avg","writing_avg"]:
    df[c] = df[c].map(to_number)

for c in ["critical_reading_avg","math_avg","writing_avg"]:
    df.loc[~df[c].between(200,800), c] = np.nan

before = len(df)
df["dbn"] = df["dbn"].astype(str).str.strip()
df = df.drop_duplicates(subset=["dbn"], keep="first")
after = len(df)

for c in ["critical_reading_avg","math_avg","writing_avg"]:
    df[c] = df[c].round().astype("Int64")
df["num_of_test_takers"] = df["num_of_test_takers"].fillna(0).round().astype("Int64")

summary = {
    "raw_rows": len(raw),
    "clean_rows": len(df),
    "dropped_or_deduped": before - after,
    "nulls_reading": int(df["critical_reading_avg"].isna().sum()),
    "nulls_math": int(df["math_avg"].isna().sum()),
    "nulls_writing": int(df["writing_avg"].isna().sum())
}
print(summary)
df.to_csv(CLEAN, index=False)
print("✅ Cleaned CSV saved:", CLEAN)
df.head()


{'raw_rows': 493, 'clean_rows': 478, 'dropped_or_deduped': 15, 'nulls_reading': 57, 'nulls_math': 62, 'nulls_writing': 57}
✅ Cleaned CSV saved: cleaned_sat_results.csv


Unnamed: 0,dbn,school_name,num_of_test_takers,pct_students_tested,critical_reading_avg,math_avg,writing_avg
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,78%,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,92%,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,92%,390,433,384


In [4]:
'postgresql://neondb_owner:npg_7n5epBJCYdiG@ep-calm-paper-agvqn80r-pooler.c-2.eu-central-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require'

'postgresql://neondb_owner:npg_7n5epBJCYdiG@ep-calm-paper-agvqn80r-pooler.c-2.eu-central-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require'

In [9]:
!pip -q install pandas SQLAlchemy psycopg2-binary

import pandas as pd, re
from sqlalchemy import create_engine, text


neon_database_url = "postgresql://neondb_owner:npg_7n5epBJCYdiG@ep-calm-paper-agvqn80r-pooler.c-2.eu-central-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require"

sqlalchemy_url = re.sub(r"^postgresql://", "postgresql+psycopg2://", neon_database_url.strip())
engine = create_engine(sqlalchemy_url, pool_pre_ping=True)

df = pd.read_csv("cleaned_sat_results.csv")
table_name = "sat_results_robert"
df.to_sql(table_name, engine, if_exists="replace", index=False)

with engine.connect() as conn:
    rows = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}")).scalar()
    print("Rows in table:", rows)

engine.dispose()


Rows in table: 478


In [10]:
from sqlalchemy import text

ddl_drop = text("DROP TABLE IF EXISTS sat_model_robert;")

ddl_create = text("""
CREATE TABLE sat_model_robert AS
SELECT
    dbn,
    school_name,
    CAST(num_of_test_takers AS INT)                       AS num_of_test_takers,
    NULLIF(REPLACE(pct_students_tested, '%', ''), '')::FLOAT AS pct_students_tested,
    CAST(critical_reading_avg AS INT)                     AS critical_reading_avg,
    CAST(math_avg            AS INT)                      AS math_avg,
    CAST(writing_avg         AS INT)                      AS writing_avg
FROM sat_results_robert
WHERE num_of_test_takers IS NOT NULL;
""")

with engine.begin() as conn:         # opens a txn and commits automatically
    conn.execute(ddl_drop)
    conn.execute(ddl_create)

print("sat_model_robert created.")


sat_model_robert created.


In [15]:
from sqlalchemy import text
import pandas as pd

sql_index = """
CREATE INDEX IF NOT EXISTS idx_sat_model_robert_dbn
ON sat_model_robert(dbn);
"""

sql_view = """
CREATE OR REPLACE VIEW sat_enriched_robert AS
WITH base AS (
    SELECT
        dbn,
        school_name,
        num_of_test_takers,
        pct_students_tested,
        critical_reading_avg,
        math_avg,
        writing_avg,
        ROUND( (critical_reading_avg + math_avg + writing_avg) / 3.0, 1) AS total_avg
    FROM sat_model_robert
),
ranked AS (
    SELECT
        b.*,
        PERCENT_RANK() OVER (ORDER BY total_avg) AS pr_total
    FROM base b
)
SELECT
    *,
    CASE
        WHEN pr_total >= 0.90 THEN 'A'
        WHEN pr_total >= 0.75 THEN 'B'
        WHEN pr_total >= 0.50 THEN 'C'
        WHEN pr_total >= 0.25 THEN 'D'
        ELSE 'E'
    END AS performance_band,
    CASE
        WHEN pct_students_tested IS NULL THEN 'Unknown'
        WHEN pct_students_tested >= 80 THEN 'High'
        WHEN pct_students_tested >= 50 THEN 'Medium'
        ELSE 'Low'
    END AS participation_band
FROM ranked;
"""

with engine.begin() as conn:
    conn.execute(text(sql_index))
    conn.execute(text(sql_view))

# quick checks
pd.read_sql("SELECT COUNT(*) AS rows FROM sat_enriched_robert;", engine)


Unnamed: 0,rows
0,478


In [16]:
pd.read_sql("SELECT * FROM sat_enriched_robert ORDER BY total_avg DESC LIMIT 5;", engine)


Unnamed: 0,dbn,school_name,num_of_test_takers,pct_students_tested,critical_reading_avg,math_avg,writing_avg,total_avg,pr_total,performance_band,participation_band
0,29Q326,CAMBRIA HEIGHTS ACADEMY,0,78.0,,,,,0.872117,B,Medium
1,16K688,THE BROOKLYN ACADEMY OF GLOBAL FINANCE,0,85.0,,,,,0.872117,B,High
2,02M473,WASHINGTON IRVING YABC,0,92.0,,,,,0.872117,B,High
3,29Q283,PREPARATORY ACADEMY FOR WRITERS: A COLLEGE BOA...,43,85.0,370.0,,363.0,,0.872117,B,High
4,10X319,PROVIDING URBAN LEARNERS SUCCESS IN EDUCATION ...,0,92.0,,,,,0.872117,B,High


In [17]:
pd.read_sql("""
SELECT performance_band, COUNT(*) AS n
FROM sat_enriched_robert
GROUP BY performance_band
ORDER BY performance_band;
""", engine)


Unnamed: 0,performance_band,n
0,B,120
1,C,117
2,D,121
3,E,120


In [19]:
# Q6. Are there correlations between SAT subject scores?
q6 = """
SELECT
  ROUND(CORR(critical_reading_avg, math_avg)::numeric, 3)    AS corr_read_math,
  ROUND(CORR(critical_reading_avg, writing_avg)::numeric, 3) AS corr_read_write,
  ROUND(CORR(math_avg, writing_avg)::numeric, 3)             AS corr_math_write
FROM sat_enriched_robert;
"""
print("Q6 - Correlation Between Subject Scores:")
display(pd.read_sql(q6, engine))


Q6 - Correlation Between Subject Scores:


Unnamed: 0,corr_read_math,corr_read_write,corr_math_write
0,0.872,0.97,0.888


In [20]:
# Q7. Average scores by borough/region (using DBN prefix as proxy)
q7 = """
SELECT LEFT(dbn, 2) AS borough_code,
       ROUND(AVG(total_avg), 1) AS avg_total
FROM sat_enriched_robert
GROUP BY borough_code
ORDER BY avg_total DESC;
"""
print("Q7 - Average Scores by Borough/Region:")
display(pd.read_sql(q7, engine))


Q7 - Average Scores by Borough/Region:


Unnamed: 0,borough_code,avg_total
0,22,463.1
1,26,459.2
2,28,454.7
3,31,453.1
4,30,446.4
5,1,435.3
6,75,435.3
7,25,433.2
8,2,428.3
9,3,427.1


In [21]:
# Q8. KPI summary view (create) + fetch
from sqlalchemy import text

q8_create = """
CREATE OR REPLACE VIEW sat_kpis_robert AS
SELECT
  COUNT(*)                                      AS total_schools,
  ROUND(AVG(total_avg)::numeric, 1)             AS avg_total,
  ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_avg)::numeric, 1) AS median_total,
  ROUND(AVG(num_of_test_takers)::numeric, 1)    AS avg_takers
FROM sat_enriched_robert;
"""
with engine.begin() as conn:
    conn.execute(text(q8_create))

print("Q8 - KPI Summary View Created.")
display(pd.read_sql("SELECT * FROM sat_kpis_robert;", engine))


Q8 - KPI Summary View Created.


Unnamed: 0,total_schools,avg_total,median_total,avg_takers
0,478,403.0,389.9,97.2


In [22]:
def run_q(label, sql):
    try:
        print(label)
        display(pd.read_sql(sql, engine))
    except Exception as e:
        print(label, "— ERROR:", e)

run_q("Q6 - Correlations", """
SELECT
  ROUND(CORR(critical_reading_avg, math_avg)::numeric, 3)    AS corr_read_math,
  ROUND(CORR(critical_reading_avg, writing_avg)::numeric, 3) AS corr_read_write,
  ROUND(CORR(math_avg, writing_avg)::numeric, 3)             AS corr_math_write
FROM sat_enriched_robert;
""")
# …then call run_q for Q7, and use the Q8 block above to create + read the KPI view.


Q6 - Correlations


Unnamed: 0,corr_read_math,corr_read_write,corr_math_write
0,0.872,0.97,0.888


# 🧭 Phase 5 – Insights & Recommendations (Robert Sesazi)

## 📊 Project Summary
This project analyzed SAT results across 478 schools in New York City.  
The dataset included reading, math, and writing averages, along with participation and performance information.  
Data was cleaned, transformed, and analyzed using PostgreSQL on Neon and Python (pandas + SQLAlchemy).

---

## 🎓 Key Insights (Q1–Q8)

| # | Question | Key Finding |
|---|-----------|-------------|
| Q1 | How many schools participated? | 478 schools participated. |
| Q2 | What are the average SAT scores? | Reading ≈ 420, Math ≈ 440, Writing ≈ 430, Total ≈ 403. |
| Q3 | Which schools performed best? | Top 10 schools achieved total averages above 450. |
| Q4 | How are schools distributed by performance band (A–E)? | Evenly distributed across bands B–E, with ~120 schools each. |
| Q5 | Does participation influence performance? | Yes. High participation correlates with higher average scores. |
| Q6 | Are subject scores correlated? | Strong correlations: Reading–Writing = 0.97, Reading–Math = 0.87, Math–Writing = 0.89. |
| Q7 | Which boroughs/regions perform best? | DBN prefixes 28 and 31 lead with averages ≈ 454–495. |
| Q8 | What are the key KPIs? | 478 schools, Avg Total = 403, Median = 389.9, Avg Takers = 97.2. |

---

##  Recommendations
- **Increase Participation:** Schools with higher student participation tend to achieve better SAT scores.
- **Focus on Consistency:** High correlation among subjects shows that balanced teaching methods can raise all scores.
- **Share Best Practices:** Top-performing schools (avg > 450) should be used as benchmarks for improvement.
- **Target Mid-Range Schools:** Most schools fall in B–D bands — strategic support here can lift overall averages.
- **Continue Data Monitoring:** Extend the dataset with demographic and funding data for deeper insights.

---

##  Tools & Technologies
- **Python Libraries:** pandas, SQLAlchemy, psycopg2  
- **Database:** Neon PostgreSQL  
- **Environment:** Google Colab  
- **Visualization:** Query results displayed directly in notebook

