# Day 4 — SAT Results ETL (Explore → Clean → Load)

**Objective:** Evaluate, clean, and integrate the SAT results into the existing PostgreSQL schema.
- Inspect dataset, identify relational keys
- Clean inconsistencies (SAT scores outside the 200–800 range, percentage formatting, duplicate rows)
- Normalize headers & drop unrelated fields
- Append cleaned data to `[your-name]_sat_scores` using parameterized inserts

### 1. Explore the Dataset

In [21]:
import os
import pandas as pd
import numpy as np

import os
# --- Project paths ---
# Every location is derived from BASE_DIR so the project root is spelled out once.
BASE_DIR = "/Users/oleksandraprotsenko/DOC/DA_projects/Day4"

RAW_PATH = os.path.join(BASE_DIR, "sat-results.csv")

OUT_DIR = os.path.join(BASE_DIR, "day_4_task")
OUT_CSV = os.path.join(OUT_DIR, "cleaned_sat_results.csv")

# Create the output folder BEFORE switching into it: os.chdir raises
# FileNotFoundError when the target directory does not exist yet.
os.makedirs(OUT_DIR, exist_ok=True)
os.chdir(OUT_DIR)
print("CWD:", os.getcwd())

# --- Load the raw SAT results ---
df = pd.read_csv(RAW_PATH)
print("Shape:", df.shape)
# Last expression of the cell: rendered as the cell output.
df.head(10)

CWD: /Users/oleksandraprotsenko/DOC/DA_projects/Day4/day_4_task
Shape: (493, 11)


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
5,01M515,LOWER EAST SIDE PREPARATORY HIGH SCHOOL,112,332,557,316,332,414951,x345,,3.0
6,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159,522,574,525,522,697107,,78%,2.0
7,01M650,CASCADES HIGH SCHOOL,18,417,418,411,417,297600,,92%,4.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,130,624,604,628,624,881396,x234,,
9,02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECO...,16,395,400,387,395,751293,,78%,4.0


### 2. Cleaning and normalization

In [15]:
# Raw CSV header -> snake_case column name
rename_map = {
    "DBN": "dbn",
    "SCHOOL NAME": "school_name",
    "Num of SAT Test Takers": "num_test_takers",
    "SAT Critical Reading Avg. Score": "sat_reading_avg",
    "SAT Math Avg. Score": "sat_math_avg",
    "SAT Writing Avg. Score": "sat_writing_avg",
    "pct_students_tested": "pct_students_tested",
    "academic_tier_rating": "academic_tier_rating"
}

# Redundant or synthetic columns, limited to the ones actually present in the frame.
# (None of these names is touched by rename_map, so checking before the rename is equivalent.)
drop_cols = [
    col
    for col in ("SAT Critical Readng Avg. Score", "internal_school_id", "contact_extension")
    if col in df.columns
]

# The columns kept for the rest of the pipeline, in output order
useful_cols = [
    "dbn",
    "school_name",
    "num_test_takers",
    "sat_reading_avg",
    "sat_math_avg",
    "sat_writing_avg",
    "pct_students_tested",
    "academic_tier_rating",
]

# Rename -> drop -> select, expressed as a single chain
df = (
    df.rename(columns=rename_map)
      .drop(columns=drop_cols, errors="ignore")
)[useful_cols]

# Numeric coercion helper shared by the conversion steps
def to_num(series):
    """Return `series` parsed as numbers; values that cannot be parsed become NaN."""
    converted = pd.to_numeric(series, errors="coerce")
    return converted

# Convert numeric columns (unparseable entries become NaN)
df["num_test_takers"] = to_num(df["num_test_takers"])
for c in ["sat_reading_avg", "sat_math_avg", "sat_writing_avg", "academic_tier_rating"]:
    df[c] = to_num(df[c])

# Inconsistent formatting (e.g., "85%"): strip whitespace and the percent sign, then parse.
# Missing values pass through astype(str) as the text "nan", which to_num maps back to NaN.
pct_text = df["pct_students_tested"].astype(str).str.strip().str.replace("%", "", regex=False)
df["pct_students_tested"] = to_num(pct_text)

# Invalid percentages: a share of students must lie in 0–100, set anything else to NaN.
# between() is False for NaN, so already-missing values simply stay NaN.
pct_ok = df["pct_students_tested"].between(0, 100, inclusive="both")
df.loc[~pct_ok, "pct_students_tested"] = np.nan

# Invalid SAT scores: Validate SAT ranges (200–800), set invalid values to NaN
for c in ["sat_reading_avg", "sat_math_avg", "sat_writing_avg"]:
    mask = df[c].between(200, 800, inclusive="both")
    df.loc[~mask, c] = np.nan

# Handle duplicates: done after normalisation so rows that differed only in
# formatting collapse into one; the index is rebuilt to stay contiguous.
df = df.drop_duplicates().reset_index(drop=True)

print("After cleaning:", df.shape)
df.head(10)


After cleaning: (478, 8)


Unnamed: 0,dbn,school_name,num_test_takers,sat_reading_avg,sat_math_avg,sat_writing_avg,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0,2.0
5,01M515,LOWER EAST SIDE PREPARATORY HIGH SCHOOL,112.0,332.0,557.0,316.0,,3.0
6,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159.0,522.0,574.0,525.0,78.0,2.0
7,01M650,CASCADES HIGH SCHOOL,18.0,417.0,418.0,411.0,92.0,4.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,130.0,624.0,604.0,628.0,,
9,02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECO...,16.0,395.0,400.0,387.0,78.0,4.0


### 3. Quality checks (missing values, basic stats)

In [7]:
# Size of the cleaned frame
n_rows = len(df)
print(f"Row count: {n_rows}")

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Summary statistics for the numeric columns
print("\nDescriptive statistics (numeric):")
numeric_summary = df.describe()
display(numeric_summary)

# Fixed seed so the same five rows are shown on every run
print("\nSample rows:")
sample_rows = df.sample(n=5, random_state=42)
display(sample_rows)

Row count: 478

Missing values per column:
dbn                       0
school_name               0
num_test_takers          57
sat_reading_avg          57
sat_math_avg             62
sat_writing_avg          57
pct_students_tested     115
academic_tier_rating     86
dtype: int64

Descriptive statistics (numeric):


Unnamed: 0,num_test_takers,sat_reading_avg,sat_math_avg,sat_writing_avg,pct_students_tested,academic_tier_rating
count,421.0,421.0,416.0,421.0,363.0,392.0
mean,110.320665,400.850356,413.733173,393.985748,84.595041,2.579082
std,155.534254,56.802783,64.945638,58.635109,5.673305,1.128053
min,6.0,279.0,312.0,286.0,78.0,1.0
25%,41.0,368.0,372.0,360.0,78.0,2.0
50%,62.0,391.0,395.0,381.0,85.0,3.0
75%,95.0,416.0,437.25,411.0,92.0,4.0
max,1277.0,679.0,735.0,682.0,92.0,4.0



Sample rows:


Unnamed: 0,dbn,school_name,num_test_takers,sat_reading_avg,sat_math_avg,sat_writing_avg,pct_students_tested,academic_tier_rating
469,75M035,P.S. 035,,,,,,4.0
33,02M416,ELEANOR ROOSEVELT HIGH SCHOOL,127.0,572.0,594.0,592.0,92.0,2.0
131,07X548,URBAN ASSEMBLY SCHOOL FOR CAREERS IN SPORTS,44.0,387.0,411.0,383.0,,2.0
72,02M630,ART AND DESIGN HIGH SCHOOL,270.0,444.0,441.0,430.0,85.0,
78,03M403,THE GLOBAL LEARNING COLLABORATIVE,,,,,78.0,1.0


### 4. Save cleaned dataset

In [8]:
# Re-declare the output location used for the cleaned file
BASE_DIR = "/Users/oleksandraprotsenko/DOC/DA_projects/Day4"
OUT_DIR = os.path.join(BASE_DIR, "day_4_task")
OUT_CSV = os.path.join(OUT_DIR, "cleaned_sat_results.csv")

# Make sure the target folder exists, then write the frame without its index column
os.makedirs(OUT_DIR, exist_ok=True)
df.to_csv(path_or_buf=OUT_CSV, index=False)

# Echo the destination path as the cell output
OUT_CSV

'/Users/oleksandraprotsenko/DOC/DA_projects/Day4/day_4_task/cleaned_sat_results.csv'

### 5. 🔌 Load cleaned CSV into PostgreSQL

In [20]:
# 🔌 Load cleaned CSV into PostgreSQL (robust, no NAType / overflow)
#
# Steps: read the cleaned CSV, coerce dtypes, null out-of-range values, convert
# missing values to None, then (re)create the table and upsert every row inside
# ONE transaction, so a failed load rolls back instead of leaving a half-filled table.

import os
from getpass import getpass

import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extras import execute_values

# --- Paths ---
CLEAN_PATH = "/Users/oleksandraprotsenko/DOC/DA_projects/Day4/day_4_task/cleaned_sat_results.csv"

# --- Load behaviour ---
# True  -> drop and rebuild the table on every run (destroys rows already stored in it).
# False -> keep an existing table and only upsert into it.
RECREATE_TABLE = True

# --- Read cleaned data ---
df = pd.read_csv(CLEAN_PATH)

# dbn is the primary key, and a single INSERT ... ON CONFLICT DO UPDATE statement is not
# allowed to touch the same row twice, so repeated keys must be collapsed before the load.
repeated_dbn = df.duplicated(subset="dbn", keep="last")
if repeated_dbn.any():
    print(f"Dropping {int(repeated_dbn.sum())} row(s) with a repeated dbn (last occurrence kept).")
    df = df.loc[~repeated_dbn].reset_index(drop=True)

# --- Final type safety before load ---
# numeric ints as pandas nullable Int64 (we'll convert to Python None later)
int_cols = ["num_test_takers", "sat_reading_avg", "sat_math_avg", "sat_writing_avg"]
for c in int_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").round().astype("Int64")

# percents & ratings as floats
df["pct_students_tested"]  = pd.to_numeric(df["pct_students_tested"],  errors="coerce").astype(float)
df["academic_tier_rating"] = pd.to_numeric(df["academic_tier_rating"], errors="coerce").astype(float)

# (Optional sanity) clamp obviously wrong values
df.loc[df["num_test_takers"] < 0, "num_test_takers"] = pd.NA
df.loc[df["num_test_takers"] > 1_000_000, "num_test_takers"] = pd.NA
for c in ["sat_reading_avg","sat_math_avg","sat_writing_avg"]:
    m = df[c].between(200, 800, inclusive="both")
    df.loc[~m, c] = pd.NA
df.loc[df["pct_students_tested"].notna() & ((df["pct_students_tested"] < 0) | (df["pct_students_tested"] > 100)), "pct_students_tested"] = np.nan

# --- Convert pandas <NA>/NaN to Python None (so psycopg2 can adapt) ---
df_sql = df.astype(object).where(pd.notnull(df), None)

cols = ["dbn","school_name","num_test_takers","sat_reading_avg","sat_math_avg",
        "sat_writing_avg","pct_students_tested","academic_tier_rating"]
# One plain tuple per row, in `cols` order (single pass instead of a .loc lookup per row)
rows = list(df_sql[cols].itertuples(index=False, name=None))

print("Prepared rows:", len(rows))

table_name = "alexandra_sat_scores"

# NOTE: table_name and cols are constants defined in this cell, never user input,
# which is why interpolating them into the SQL text is acceptable here.
# Row VALUES are always passed as parameters through execute_values.
create_sql = f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    dbn TEXT PRIMARY KEY,
    school_name TEXT,
    num_test_takers INTEGER,
    sat_reading_avg INTEGER,
    sat_math_avg INTEGER,
    sat_writing_avg INTEGER,
    pct_students_tested REAL,
    academic_tier_rating REAL,
    CHECK (sat_reading_avg IS NULL OR (sat_reading_avg BETWEEN 200 AND 800)),
    CHECK (sat_math_avg    IS NULL OR (sat_math_avg    BETWEEN 200 AND 800)),
    CHECK (sat_writing_avg IS NULL OR (sat_writing_avg BETWEEN 200 AND 800))
);
"""

# --- UPSERT (parameterized bulk insert) ---
insert_sql = f"""
INSERT INTO {table_name} ({", ".join(cols)})
VALUES %s
ON CONFLICT (dbn) DO UPDATE SET
    school_name = EXCLUDED.school_name,
    num_test_takers = EXCLUDED.num_test_takers,
    sat_reading_avg = EXCLUDED.sat_reading_avg,
    sat_math_avg = EXCLUDED.sat_math_avg,
    sat_writing_avg = EXCLUDED.sat_writing_avg,
    pct_students_tested = EXCLUDED.pct_students_tested,
    academic_tier_rating = EXCLUDED.academic_tier_rating;
"""

# --- PostgreSQL connection ---
# Credentials come from the standard libpq environment variables; the password is never
# stored in the notebook. If PGPASSWORD is unset, it is requested interactively.
# SECURITY: a password was previously committed in this cell — rotate it.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "neondb"),
    user=os.environ.get("PGUSER", "neondb_owner"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    host=os.environ.get("PGHOST", "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech"),
    port=os.environ.get("PGPORT", "5432"),
    sslmode="require",
)

try:
    # `with conn` commits when the block succeeds and rolls back when it raises;
    # `with conn.cursor()` closes the cursor. Neither closes the connection itself,
    # hence the explicit close() in `finally`.
    with conn, conn.cursor() as cur:
        # --- (Re)create table with safe types (prevents overflow/type mismatch) ---
        if RECREATE_TABLE:
            cur.execute(f"DROP TABLE IF EXISTS {table_name};")
        cur.execute(create_sql)
        print("Table recreated." if RECREATE_TABLE else "Table ready (existing rows kept).")

        # small test then full load; the upsert makes re-sending the first rows harmless
        execute_values(cur, insert_sql, rows[:10])
        print("Test batch OK.")

        execute_values(cur, insert_sql, rows, page_size=1000)

        # --- Quick DB check ---
        cur.execute(f"SELECT COUNT(*) FROM {table_name};")
        db_row_count = cur.fetchone()[0]

    # Reported only after the transaction has been committed
    print(f"Upserted {len(rows)} rows into {table_name}.")
    print("Row count in DB:", db_row_count)
finally:
    # Always release the connection, even when the load failed
    conn.close()


Prepared rows: 478
Table recreated.
Test batch OK.
Upserted 478 rows into alexandra_sat_scores.
Row count in DB: 478
