## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## Load Dataset

In [2]:
# Show every column when a wide frame is displayed, and give all seaborn
# plots the same grid style and colour palette.
pd.set_option("display.max_columns", None)
sns.set(palette="muted", style="whitegrid")

In [3]:
# Path to the raw SAT results CSV; edit this if the file lives elsewhere.
file_path = "sat-results.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## Cleaning and Normalisation

In [5]:
# Map the source headers onto snake_case names. The last two entries map a
# name to itself, so those columns pass through unchanged.
rename_map = dict([
    ("DBN", "dbn"),
    ("SCHOOL NAME", "school_name"),
    ("Num of SAT Test Takers", "num_test_takers"),
    ("SAT Critical Reading Avg. Score", "sat_reading_avg"),
    ("SAT Math Avg. Score", "sat_math_avg"),
    ("SAT Writing Avg. Score", "sat_writing_avg"),
    ("pct_students_tested", "pct_students_tested"),
    ("academic_tier_rating", "academic_tier_rating"),
])
df = df.rename(columns=rename_map)

# Remove the redundant (misspelled) reading column and the synthetic
# id/extension columns, skipping any that are not present in the frame.
unwanted = ("SAT Critical Readng Avg. Score", "internal_school_id", "contact_extension")
drop_cols = [name for name in unwanted if name in df.columns]
df = df.drop(columns=drop_cols, errors="ignore")

# Restrict the frame to the analysis columns, in this order
useful_cols = [
    "dbn",
    "school_name",
    "num_test_takers",
    "sat_reading_avg",
    "sat_math_avg",
    "sat_writing_avg",
    "pct_students_tested",
    "academic_tier_rating",
]
df = df[useful_cols]

# Numeric coercion helper shared by the column conversions below
def to_num(series):
    """Convert ``series`` to a numeric dtype.

    Values that cannot be parsed as numbers are replaced with NaN rather
    than raising, because ``errors="coerce"`` is passed to ``pd.to_numeric``.
    """
    converted = pd.to_numeric(series, errors="coerce")
    return converted

# Coerce the count, score and tier columns to numeric; anything that does
# not parse becomes NaN.
numeric_cols = [
    "num_test_takers",
    "sat_reading_avg",
    "sat_math_avg",
    "sat_writing_avg",
    "academic_tier_rating",
]
for col in numeric_cols:
    df[col] = to_num(df[col])

# Percentages arrive as text such as "85%": trim whitespace, drop the
# percent sign, then coerce to numeric.
pct_text = df["pct_students_tested"].astype(str).str.strip()
pct_text = pct_text.str.replace("%", "", regex=False)
df["pct_students_tested"] = to_num(pct_text)

# SAT section scores must lie in 200–800 (inclusive); blank out anything else
score_cols = ["sat_reading_avg", "sat_math_avg", "sat_writing_avg"]
for col in score_cols:
    out_of_range = ~df[col].between(200, 800, inclusive="both")
    df.loc[out_of_range, col] = np.nan

# Remove fully duplicated rows and renumber the index from zero
df = df.drop_duplicates().reset_index(drop=True)

print("After cleaning:", df.shape)
df.head(10)

After cleaning: (478, 8)


Unnamed: 0,dbn,school_name,num_test_takers,sat_reading_avg,sat_math_avg,sat_writing_avg,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0,2.0
5,01M515,LOWER EAST SIDE PREPARATORY HIGH SCHOOL,112.0,332.0,557.0,316.0,,3.0
6,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159.0,522.0,574.0,525.0,78.0,2.0
7,01M650,CASCADES HIGH SCHOOL,18.0,417.0,418.0,411.0,92.0,4.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,130.0,624.0,604.0,628.0,,
9,02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECO...,16.0,395.0,400.0,387.0,78.0,4.0


## Quality checks

In [6]:
# Size of the cleaned frame
print("Row count:", df.shape[0])

# Number of nulls in each column
print("\nMissing values per column:")
missing_counts = df.isna().sum()
print(missing_counts)

# Summary statistics for the numeric columns
print("\nDescriptive statistics (numeric):")
numeric_summary = df.describe()
display(numeric_summary)

# Fixed seed so the same five rows are shown on every run
print("\nSample rows:")
display(df.sample(n=5, random_state=42))

Row count: 478

Missing values per column:
dbn                       0
school_name               0
num_test_takers          57
sat_reading_avg          57
sat_math_avg             62
sat_writing_avg          57
pct_students_tested     115
academic_tier_rating     86
dtype: int64

Descriptive statistics (numeric):


Unnamed: 0,num_test_takers,sat_reading_avg,sat_math_avg,sat_writing_avg,pct_students_tested,academic_tier_rating
count,421.0,421.0,416.0,421.0,363.0,392.0
mean,110.320665,400.850356,413.733173,393.985748,84.595041,2.579082
std,155.534254,56.802783,64.945638,58.635109,5.673305,1.128053
min,6.0,279.0,312.0,286.0,78.0,1.0
25%,41.0,368.0,372.0,360.0,78.0,2.0
50%,62.0,391.0,395.0,381.0,85.0,3.0
75%,95.0,416.0,437.25,411.0,92.0,4.0
max,1277.0,679.0,735.0,682.0,92.0,4.0



Sample rows:


Unnamed: 0,dbn,school_name,num_test_takers,sat_reading_avg,sat_math_avg,sat_writing_avg,pct_students_tested,academic_tier_rating
469,75M035,P.S. 035,,,,,,4.0
33,02M416,ELEANOR ROOSEVELT HIGH SCHOOL,127.0,572.0,594.0,592.0,92.0,2.0
131,07X548,URBAN ASSEMBLY SCHOOL FOR CAREERS IN SPORTS,44.0,387.0,411.0,383.0,,2.0
72,02M630,ART AND DESIGN HIGH SCHOOL,270.0,444.0,441.0,430.0,85.0,
78,03M403,THE GLOBAL LEARNING COLLABORATIVE,,,,,78.0,1.0


## Save cleaned dataset

In [11]:
import os
import pandas as pd

# Output goes into a "day_4_task" folder under the current working directory
BASE_DIR = os.getcwd()
OUT_DIR = os.path.join(BASE_DIR, "day_4_task")
OUT_CSV = os.path.join(OUT_DIR, "cleaned_sat_results.csv")

# Create the folder if needed, then write the cleaned frame without its index
os.makedirs(OUT_DIR, exist_ok=True)
df.to_csv(OUT_CSV, index=False)

# Show where the file was written
OUT_CSV

'/Users/agborntui/day_4_task/cleaned_sat_results.csv'

In [22]:
from sqlalchemy import create_engine

# Template engine built from a placeholder URL (user, password, host, port
# and database name are all stand-ins).
engine = create_engine("postgresql+psycopg2://username:password@localhost:5432/dbname")

In [24]:
import os

# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname
#
# The URL embeds the database password, so it is read from the environment
# instead of being written into the notebook. Set it before starting Jupyter,
# e.g.  export DATABASE_URL="postgresql://user:password@host/dbname?sslmode=require"
# NOTE: any credential that was previously committed in this cell should be
# rotated, since it remains visible in the notebook's history.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; export the PostgreSQL connection string "
        "before running this cell."
    )

# Create the engine; no connection is opened until it is first used
engine = create_engine(DATABASE_URL)

In [57]:
# Dispose of the engine's connection pool to reset its connections.
# The same `engine` object is used again by the cells that follow.
engine.dispose()


In [58]:
from sqlalchemy import text

# Upload the cleaned frame inside a single transaction: engine.begin()
# commits on success and rolls back if anything in the block raises.
with engine.begin() as conn:
    # Create the target schema if it doesn't exist
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS nyc_schools"))

    # Write the cleaned DataFrame `df`. This cell previously referenced
    # `new_df`, which no cell in this notebook defines, so it failed with a
    # NameError on a fresh kernel.
    # if_exists='replace' drops and recreates the table on every run.
    df.to_sql(
        name='ntui_sat_results',
        con=conn,
        schema='nyc_schools',
        if_exists='replace',
        index=False
    )


In [59]:
# Read the row count back from the uploaded table to confirm the load
count_query = text("SELECT COUNT(*) FROM nyc_schools.ntui_sat_results")
with engine.connect() as conn:
    row_count = conn.execute(count_query).scalar()
print("Number of rows in DB:", row_count)

Number of rows in DB: 478
