# 🧮 Data Integration & Schema Design: NYC SAT Results

Dataset: daily_tasks/day_4/day_4_datasets/sat_results.csv

📊 Existing Tables: high_school_directory, school_demographics, school_safety_report

🎯 Objective
Learn how to evaluate, clean, and integrate a real-world dataset into an existing PostgreSQL schema.

In [17]:

import os  # imported here because this cell runs before the notebook's import cell

# Input CSV and cleaned-CSV output locations (relative paths).
CSV_PATH = "daily_tasks/day_4/day_4_datasets/sat_results.csv"
OUTPUT_CLEAN_CSV = "daily_tasks/day_4/cleaned_sat_results.csv"

# The connection string carries a password, so it is read from the environment
# instead of being committed with the notebook. Expected SQLAlchemy format:
#   postgresql+psycopg2://user:password@host:port/dbname?sslmode=require
# NOTE(review): a real password used to be hard-coded in this cell; treat it as
# leaked and rotate it.
DATABASE_URL = os.environ.get("DATABASE_URL", "")
if not DATABASE_URL:
    # The cleaning steps do not need a database, so warn instead of failing here;
    # create_engine(DATABASE_URL) will reject the empty URL at upload time.
    print("[warn] DATABASE_URL is not set; the Postgres upload step will fail until it is.")

SCHEMA = "nyc_schools"
TARGET_TABLE = "mpacholska_sat_results"


Imports

In [18]:

import warnings, re, os
import pandas as pd
from sqlalchemy import create_engine, text
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Inclusive bounds used when validating SAT section scores.
VALID_MIN = 200
VALID_MAX = 800


In [19]:
##Psycopg2 connector

# ✅ 1. Explore the Dataset

In [20]:

# Load the raw SAT export as delivered and preview its shape, headers and rows.
# NOTE(review): this path differs from CSV_PATH in the config cell - confirm
# which one is the intended input location.
raw = pd.read_csv("day_4_datasets/sat-results.csv")
print("Raw shape:", raw.shape)
print("Raw columns:", raw.columns.tolist())
raw.head(15)


Raw shape: (493, 11)
Raw columns: ['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
5,01M515,LOWER EAST SIDE PREPARATORY HIGH SCHOOL,112,332,557,316,332,414951,x345,,3.0
6,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159,522,574,525,522,697107,,78%,2.0
7,01M650,CASCADES HIGH SCHOOL,18,417,418,411,417,297600,,92%,4.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,130,624,604,628,624,881396,x234,,
9,02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECO...,16,395,400,387,395,751293,,78%,4.0


# ✅ 2. Clean the Data Using Python

Handle duplicates, invalid SAT scores, and inconsistent formatting (e.g., "85%"), weird outliers and any inconsistencies (hint: valid scores are between 200 and 800)
Normalize headers and drop unrelated fields

## Normalize headers to snake_case

In [21]:

# Clean a deep copy so the `raw` frame itself is never modified.
df = raw.copy(deep=True)

# Snake-case column names, also drop literal '%' from headers if present
def _snake(s):
    s = str(s).strip()
    s = re.sub(r"[%]", "", s)
    s = re.sub(r"[^\w]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s.lower()

# Run every header through the normalizer and show the resulting names.
df.columns = list(map(_snake, df.columns))
print("After header normalize:", df.columns.tolist())


After header normalize: ['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score', 'sat_critical_readng_avg_score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


## Rename columns 

In [22]:

# Title-case DOE headers -> short target names. Labels that are not present are
# ignored by rename, so this is a no-op once headers have been lower-cased.
title_to_target = {
    "DBN": "dbn",
    "SCHOOL NAME": "school_name",
    "Num of SAT Test Takers": "num_test_takers",
    "SAT Critical Reading Avg. Score": "sat_reading",
    "SAT Math Avg. Score": "sat_math",
    "SAT Writing Avg. Score": "sat_writing",
}
df = df.rename(columns=title_to_target)

# snake_case header variants -> short target names.
snake_map = {
    "dbn": "dbn",
    "school_name": "school_name",
    "num_of_sat_test_takers": "num_test_takers",
    "num_of_test_takers": "num_test_takers",
    "num_test_takers": "num_test_takers",
    "sat_critical_reading_avg_score": "sat_reading",
    "sat_reading_avg_score": "sat_reading",
    "sat_math_avg_score": "sat_math",
    "sat_writing_avg_score": "sat_writing",
    "school_year": "school_year",
    "borough": "borough",
}
# Renames are applied one at a time so that the first matching variant claims a
# target name and later variants of the same target are left alone.
for old_name, new_name in snake_map.items():
    if old_name not in df.columns or new_name in df.columns:
        continue  # source column absent, or the target name is already taken
    df = df.rename(columns={old_name: new_name})

print("Columns after rename mapping:", df.columns.tolist())


Columns after rename mapping: ['dbn', 'school_name', 'num_test_takers', 'sat_reading', 'sat_math', 'sat_writing', 'sat_critical_readng_avg_score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


## Coerce numeric columns & handle percent-like values

In [23]:

# Key numeric columns: any value that is not a number becomes NaN.
numeric_cols = ["num_test_takers", "sat_reading", "sat_math", "sat_writing", "school_year"]
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Percent-formatted columns ("85%") -> plain numbers.
# A column qualifies only when every value that contains '%' is a well-formed
# percentage. A free-text column that merely mentions '%' is left untouched
# instead of being coerced to NaN wholesale, and nothing is silently swallowed.
percent_pattern = r"\s*[-+]?\d+(?:\.\d+)?\s*%\s*"
for col in df.columns:
    column = df[col]
    if not isinstance(column, pd.Series) or pd.api.types.is_numeric_dtype(column):
        continue  # duplicated label (selection is a DataFrame) or already numeric
    as_text = column.astype(str)
    has_percent = as_text.str.contains("%", regex=False, na=False)
    if not has_percent.any():
        continue
    if not as_text[has_percent].str.fullmatch(percent_pattern).all():
        print(f"[skip] '{col}' contains '%' but is not a percentage column; left unchanged")
        continue
    converted = pd.to_numeric(
        as_text.str.replace("%", "", regex=False).str.strip(),
        errors="coerce",
    )
    # Values without '%' that are not numbers are coerced to NaN; report how many.
    newly_null = int((converted.isna() & column.notna()).sum())
    if newly_null:
        print(f"[warn] '{col}': {newly_null} non-numeric value(s) set to NaN")
    df[col] = converted

df.head(1)


Unnamed: 0,dbn,school_name,num_test_takers,sat_reading,sat_math,sat_writing,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,355,218160,x345,78.0,2.0


## Validate SAT scores are within [200, 800]

In [24]:

# Blank out any section score outside the inclusive [VALID_MIN, VALID_MAX]
# range; missing scores fail the range test and therefore stay missing.
score_cols = [c for c in ["sat_reading", "sat_math", "sat_writing"] if c in df.columns]
for col in score_cols:
    df[col] = df[col].where(df[col].between(VALID_MIN, VALID_MAX))

df[score_cols].describe()


Unnamed: 0,sat_reading,sat_math,sat_writing
count,435.0,430.0,435.0
mean,400.712644,413.34186,393.712644
std,56.056774,64.141429,57.870447
min,279.0,312.0,286.0
25%,368.0,372.0,360.0
50%,391.0,395.0,382.0
75%,416.0,437.0,411.0
max,679.0,735.0,682.0


## Drop duplicates and ensure required fields

In [25]:

# Check the required columns before anything uses them, so a missing column
# raises this ValueError rather than a KeyError from sort_values.
if "dbn" not in df.columns or "school_name" not in df.columns:
    raise ValueError("Required columns 'dbn' and 'school_name' not found after cleaning.")

# One row per school, or per school and year when 'school_year' exists.
dedupe_keys = ["dbn", "school_year"] if "school_year" in df.columns else ["dbn"]

# mergesort is a stable sort: rows with equal keys keep their file order, so
# keep="last" always retains the last occurrence from the file. The default
# single-column sort is not stable and could keep a different duplicate.
df = (
    df.sort_values(dedupe_keys, kind="mergesort")
      .drop_duplicates(dedupe_keys, keep="last")
      .dropna(subset=["dbn", "school_name"])
)

print("After dedupe shape:", df.shape)


After dedupe shape: (478, 11)


## Delete unnecessary columns and select final columns → `new_df`

Choose columns to upload to the database

In [26]:

# Columns that are not uploaded: the misspelled reading-score header, the
# correctly spelled raw header (in case it survived renaming), and the
# contact/internal-id fields.
drop_cols = [
    "sat_critical_readng_avg_score",
    "sat_critical_reading_avg_score",
    "contact_extension",
    "internal_school_id",
]

# Only the names actually present are dropped, and those are the ones reported.
existing_dropped = [name for name in drop_cols if name in df.columns]
df = df.drop(columns=existing_dropped)
print("Dropped (if existed):", existing_dropped)

# Upload columns in their final order; any that are absent are skipped.
wanted_cols = [
    "dbn", "school_name", "num_test_takers",
    "sat_reading", "sat_math", "sat_writing",
    "school_year", "academic_tier_rating",
]
final_cols = [name for name in wanted_cols if name in df.columns]
new_df = df.loc[:, final_cols].copy()
print("Final columns:", new_df.columns.tolist())
new_df.head(3)

Dropped (if existed): ['sat_critical_readng_avg_score', 'contact_extension', 'internal_school_id']
Final columns: ['dbn', 'school_name', 'num_test_takers', 'sat_reading', 'sat_math', 'sat_writing', 'academic_tier_rating']


Unnamed: 0,dbn,school_name,num_test_takers,sat_reading,sat_math,sat_writing,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,3.0


##  Save cleaned CSV

In [28]:
from pathlib import Path

# Write to the location configured in OUTPUT_CLEAN_CSV (config cell) instead of
# a machine-specific absolute path, so the notebook runs on any machine.
# NOTE(review): the configured path is relative, so it resolves against the
# notebook's working directory - confirm it lands where expected.
output_path = Path(OUTPUT_CLEAN_CSV)

# Ensure the target folder exists, then save without the index column.
output_path.parent.mkdir(parents=True, exist_ok=True)
new_df.to_csv(output_path, index=False)
print(f"[ok] Cleaned CSV saved to: {output_path} (rows={len(new_df)})")


[ok] Cleaned CSV saved to: C:\Users\micha\Projects VS\.venv\daily_tasks\day_4\cleaned_sat_results.csv (rows=478)


# ✅ 3. Save in DB

In [29]:

engine = create_engine(DATABASE_URL)

# Table and schema come from the config cell (TARGET_TABLE / SCHEMA) so the
# upload target is defined in exactly one place.
# if_exists="replace" overwrites any existing table of that name on every run.
new_df.to_sql(
    name=TARGET_TABLE,
    con=engine,
    schema=SCHEMA,
    if_exists="replace",
    index=False,
)

print(f"[ok] Uploaded to Postgres: {SCHEMA}.{TARGET_TABLE} (REPLACED)")


[ok] Uploaded to Postgres: nyc_schools.mpacholska_sat_results (REPLACED)


# Quick QA summary

In [30]:

def _null_share_pct(column):
    """Return the percentage of missing values in ``new_df[column]``, or None if the column is absent."""
    if column not in new_df.columns:
        return None
    return float(new_df[column].isna().mean() * 100)


# Row count, column list and per-score share of missing values after cleaning.
qa = {
    "rows": len(new_df),
    "cols": list(new_df.columns),
    "nulls_sat_reading_%": _null_share_pct("sat_reading"),
    "nulls_sat_math_%": _null_share_pct("sat_math"),
    "nulls_sat_writing_%": _null_share_pct("sat_writing"),
}
qa


{'rows': 478,
 'cols': ['dbn',
  'school_name',
  'num_test_takers',
  'sat_reading',
  'sat_math',
  'sat_writing',
  'academic_tier_rating'],
 'nulls_sat_reading_%': 11.92468619246862,
 'nulls_sat_math_%': 12.97071129707113,
 'nulls_sat_writing_%': 11.92468619246862}

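# Decisions
- Normalized headers to snake_case and renamed them to short, consistent names.
- Converted numeric columns to numbers and stripped the `%` sign from percentage values.
- Validated SAT scores against the 200–800 range; out-of-range values were set to null.
- Removed duplicates (one row kept per `dbn`).
- Deleted unwanted columns (`sat_critical_readng_avg_score`, `contact_extension`, `internal_school_id`).
     - I decided not to delete rows that contain null values, because dropping them could skew the results of further analysis.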


