In [15]:
df = pd.read_csv("sat-results.csv")


In [18]:
import os
print(os.listdir())  # shows files in current working directory


['.config', '.ipynb_checkpoints', 'cleaned_sat_results.csv', 'sat-results.csv', 'sample_data']


In [13]:
import pandas as pd

df = pd.read_csv("sat-results.csv")
print(df.columns.tolist())


['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


In [19]:
import pandas as pd
import numpy as np

RAW = "sat-results.csv"
CLEAN = "cleaned_sat_results.csv"

# Load with your actual headers
df = pd.read_csv(RAW)

# Select + rename from Title-Case → snake_case
keep_map = {
    "DBN": "dbn",
    "SCHOOL NAME": "school_name",
    "Num of SAT Test Takers": "num_of_test_takers",
    "SAT Critical Reading Avg. Score": "critical_reading_avg",
    "SAT Math Avg. Score": "math_avg",
    "SAT Writing Avg. Score": "writing_avg",
}
missing = [c for c in keep_map if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")
df = df[list(keep_map.keys())].rename(columns=keep_map)

# Convert numerics (strip %, commas, NA strings)
def to_number(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().replace("%","").replace(",","")
    if s.lower() in {"", "na", "n/a", "null", "none"}: return np.nan
    try: return float(s)
    except: return np.nan

num_cols = ["num_of_test_takers","critical_reading_avg","math_avg","writing_avg"]
for c in num_cols:
    df[c] = df[c].map(to_number)

# Enforce SAT score ranges (200–800)
for c in ["critical_reading_avg","math_avg","writing_avg"]:
    df.loc[~df[c].between(200, 800), c] = np.nan

# Key cleanup and de-dup
raw_rows = len(df)
df["dbn"] = df["dbn"].astype(str).str.strip()
df = df.dropna(subset=["dbn"]).drop_duplicates(subset=["dbn"], keep="first")
clean_rows = len(df)

# Tidy types
for c in ["critical_reading_avg","math_avg","writing_avg"]:
    df[c] = df[c].round().astype("Int64")
df["num_of_test_takers"] = df["num_of_test_takers"].fillna(0).round().astype("Int64")

# Save + show summary
df.to_csv(CLEAN, index=False)
summary = {
    "raw_rows": raw_rows,
    "clean_rows": clean_rows,
    "dropped_or_deduped": raw_rows - clean_rows,
    "nulls_reading": int(df["critical_reading_avg"].isna().sum()),
    "nulls_math": int(df["math_avg"].isna().sum()),
    "nulls_writing": int(df["writing_avg"].isna().sum()),
    "output_file": CLEAN
}
print(summary)
df.head()


{'raw_rows': 493, 'clean_rows': 478, 'dropped_or_deduped': 15, 'nulls_reading': 57, 'nulls_math': 62, 'nulls_writing': 57, 'output_file': 'cleaned_sat_results.csv'}


Unnamed: 0,dbn,school_name,num_of_test_takers,critical_reading_avg,math_avg,writing_avg
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384


In [20]:
df.info()
df.describe(include="all")
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   dbn                   478 non-null    object
 1   school_name           478 non-null    object
 2   num_of_test_takers    478 non-null    Int64 
 3   critical_reading_avg  421 non-null    Int64 
 4   math_avg              416 non-null    Int64 
 5   writing_avg           421 non-null    Int64 
dtypes: Int64(4), object(2)
memory usage: 28.0+ KB


Unnamed: 0,0
dbn,0
school_name,0
num_of_test_takers,0
critical_reading_avg,57
math_avg,62
writing_avg,57


In [23]:
!pip install -q duckdb
import duckdb
con = duckdb.connect("sat_project.duckdb")


In [22]:
import pandas as pd

# Load your cleaned CSV
df = pd.read_csv("cleaned_sat_results.csv")

# Create table in DuckDB
con.execute("""
    CREATE TABLE IF NOT EXISTS sat_results AS
    SELECT * FROM df
""")

# Quick check
con.execute("SELECT COUNT(*) AS total_rows FROM sat_results").fetchdf()


Unnamed: 0,total_rows
0,478


In [24]:
## Total rows in dataset
con.execute("SELECT COUNT(*) AS total_rows FROM sat_results").fetchdf()


Unnamed: 0,total_rows
0,478


In [26]:

#   unique schools (dbn)  present in the dataset?

con.execute("SELECT COUNT(DISTINCT dbn) AS unique_schools FROM sat_results").fetchdf()


Unnamed: 0,unique_schools
0,478


In [27]:

#  null values in each subject column (Reading, Math, Writing)?

con.execute("""
SELECT
  SUM(CASE WHEN critical_reading_avg IS NULL THEN 1 ELSE 0 END) AS nulls_reading,
  SUM(CASE WHEN math_avg IS NULL THEN 1 ELSE 0 END) AS nulls_math,
  SUM(CASE WHEN writing_avg IS NULL THEN 1 ELSE 0 END) AS nulls_writing
FROM sat_results
""").fetchdf()


Unnamed: 0,nulls_reading,nulls_math,nulls_writing
0,57.0,62.0,57.0


In [28]:

#  10 schools with the highest combined SAT average (Reading + Math + Writing)?

con.execute("""
SELECT
  school_name,
  (COALESCE(critical_reading_avg,0)
   + COALESCE(math_avg,0)
   + COALESCE(writing_avg,0)) AS total_avg
FROM sat_results
ORDER BY total_avg DESC
LIMIT 10
""").fetchdf()


Unnamed: 0,school_name,total_avg
0,STUYVESANT HIGH SCHOOL,2096.0
1,BRONX HIGH SCHOOL OF SCIENCE,1969.0
2,STATEN ISLAND TECHNICAL HIGH SCHOOL,1953.0
3,HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE,1920.0
4,TOWNSEND HARRIS HIGH SCHOOL,1910.0
5,QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...,1868.0
6,BARD HIGH SCHOOL EARLY COLLEGE,1856.0
7,"HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN...",1847.0
8,BROOKLYN TECHNICAL HIGH SCHOOL,1833.0
9,ELEANOR ROOSEVELT HIGH SCHOOL,1758.0


In [29]:

#  overall average score for each subject (Reading, Math, Writing) across all schools?

con.execute("""
SELECT
  ROUND(AVG(critical_reading_avg),2) AS avg_reading,
  ROUND(AVG(math_avg),2) AS avg_math,
  ROUND(AVG(writing_avg),2) AS avg_writing
FROM sat_results
""").fetchdf()


Unnamed: 0,avg_reading,avg_math,avg_writing
0,400.85,413.73,393.99


In [30]:

#  math scores distribution across score ranges?

con.execute("""
SELECT
  CASE
    WHEN math_avg < 350 THEN '<350'
    WHEN math_avg BETWEEN 350 AND 399 THEN '350–399'
    WHEN math_avg BETWEEN 400 AND 449 THEN '400–449'
    WHEN math_avg BETWEEN 450 AND 499 THEN '450–499'
    ELSE '500+'
  END AS math_bucket,
  COUNT(*) AS num_schools
FROM sat_results
WHERE math_avg IS NOT NULL
GROUP BY 1
ORDER BY 1
""").fetchdf()


Unnamed: 0,math_bucket,num_schools
0,350–399,196
1,400–449,112
2,450–499,48
3,500+,34
4,<350,26
