# DAY 4 

### 🧠 Task Summary

You’ll work with a real-world (and purposefully messy) SAT results dataset. Your goal is to:

- Inspect and understand the structure of the dataset.
- Select meaningful and relational columns that link to existing tables.
- Identify issues in the data such as duplicates, outliers, or formatting inconsistencies.
- Clean and preprocess the data using Python.
- Prepare the data for database insertion.
- Write a Python script that connects to the database and appends the cleaned data.

By completing this task, you’ll practice translating raw CSV data into relational database entries while thinking critically about schema and data integrity.



## Imports

In [None]:
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine, text
import os
import warnings
warnings.filterwarnings('ignore')

## Load Dataset

In [50]:
# Switch the working directory to the current user's Downloads folder; the
# relative file names used by later cells resolve against it.
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
os.chdir(downloads_dir)

In [51]:
# Read the raw SAT results file (path is relative to the working directory).
sat_csv_name = "sat-results.csv"
df_sat = pd.read_csv(sat_csv_name)

# Preview the first five rows.
df_sat.head(n=5)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


### Cleaning Data

In [52]:
# Normalise the headers to snake_case:
#   1. lower-case and trim surrounding whitespace,
#   2. turn spaces into underscores,
#   3. drop every character that is not a lower-case letter, digit or underscore.
normalized_headers = df_sat.columns.str.lower().str.strip()
normalized_headers = normalized_headers.str.replace(" ", "_")
normalized_headers = normalized_headers.str.replace("[^a-z0-9_]", "", regex=True)
df_sat.columns = normalized_headers

In [53]:
# school_name formatting - capitalize each word, remove extra spaces.
# str.title() treats an apostrophe as a word boundary, so a possessive such as
# "WOMEN'S" comes out as "Women'S". The second step lower-cases an "S" that
# directly follows an apostrophe and ends the word; the trailing \b leaves
# names like "O'Sullivan" untouched.
df_sat['school_name'] = (
    df_sat['school_name']
    .str.title()
    .str.replace(r"(['’])S\b", r"\1s", regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [54]:
# Count rows that repeat an earlier row across all columns.
df_sat.duplicated(keep="first").sum()

np.int64(15)

In [55]:
# Keep only the first row seen for each 'dbn' value; later repeats are dropped.
df_sat = df_sat.drop_duplicates(subset=['dbn'], keep='first')

In [56]:
# Remove the misspelled "readng" column, which appears to duplicate
# sat_critical_reading_avg_score (values match in the preview rows).
df_sat = df_sat.drop("sat_critical_readng_avg_score", axis="columns")

In [57]:
# Strip the "%" sign (and surrounding whitespace) so the column can be parsed
# as a number in a later cell. Values are cast to text first, so missing
# entries temporarily become the string "nan".
pct_as_text = df_sat['pct_students_tested'].astype(str)
pct_without_sign = pct_as_text.str.replace('%', '', regex=False)
df_sat['pct_students_tested'] = pct_without_sign.str.strip()

In [58]:
# Parse the count, score and percentage columns as numbers; anything that
# cannot be parsed becomes NaN (errors='coerce').
num_cols = [
    'num_of_sat_test_takers',
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
    'pct_students_tested',
]
df_sat[num_cols] = df_sat[num_cols].apply(pd.to_numeric, errors='coerce')

**Outlier Detection**

During data validation, the `sat_math_avg_score` column revealed two clear anomalies (a valid SAT section score lies between 200 and 800):
- A minimum value of -10
- A maximum value of 1100

In [60]:
# Outlier detection: summary statistics for the three SAT section averages.
score_cols = [
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]
df_sat[score_cols].describe()

Unnamed: 0,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
count,421.0,421.0,421.0
mean,400.850356,418.173397,393.985748
std,56.802783,88.210494,58.635109
min,279.0,-10.0,286.0
25%,368.0,372.0,360.0
50%,391.0,395.0,381.0
75%,416.0,438.0,411.0
max,679.0,1100.0,682.0


In [None]:
# Collect the rows whose math average is above 800 or below 200 for inspection.
math_scores = df_sat['sat_math_avg_score']
above_max = math_scores.gt(800)
below_min = math_scores.lt(200)
df_outliers = df_sat[above_max | below_min]

In [62]:
# Display the rows whose math average is above 800 or below 200.
df_outliers

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
80,03M415,Wadleigh Secondary School For The Performing &...,32.0,371.0,850.0,370.0,365679,,78.0,4.0
181,10X225,Theatre Arts Production Company School,59.0,405.0,-10.0,394.0,827176,x345,78.0,
288,15K656,Brooklyn High School Of The Arts,141.0,426.0,999.0,411.0,642190,,,
422,28Q470,Jamaica High School,90.0,342.0,999.0,353.0,587904,x234,92.0,3.0
434,29Q283,Preparatory Academy For Writers: A College Boa...,43.0,370.0,1100.0,363.0,462173,x345,85.0,3.0


In [63]:
# Blank out (set to NaN) any section average outside the 200-800 SAT range.
sat_score_cols = (
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
)
for score_col in sat_score_cols:
    out_of_range = df_sat[score_col].lt(200) | df_sat[score_col].gt(800)
    df_sat.loc[out_of_range, score_col] = np.nan

In [None]:
# Text markers that stand for "no value" in the raw file.
placeholders = ['s', 'S', 'N/A', 'n/a', 'NaN', 'nan', 'None', '', ' ']

# Apply the substitution to every column of the frame.
col_names = df_sat.columns

# Cells that exactly equal one of the markers become NaN.
df_sat[col_names] = df_sat[col_names].replace(to_replace=placeholders, value=np.nan)

In [72]:
# Re-check the section-score statistics after the out-of-range values were blanked.
section_score_cols = [
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]
df_sat.describe().loc[:, section_score_cols]

Unnamed: 0,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
count,421.0,416.0,421.0
mean,400.850356,413.733173,393.985748
std,56.802783,64.945638,58.635109
min,279.0,312.0,286.0
25%,368.0,372.0,360.0
50%,391.0,395.0,381.0
75%,416.0,437.25,411.0
max,679.0,735.0,682.0


In [64]:
# Row-wise mean of the three section averages, rounded to two decimals.
# pandas skips NaN by default, so a row with a blanked score is averaged over
# its remaining sections.
section_cols = [
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]
section_means = df_sat[section_cols].mean(axis=1)
df_sat['avg_sat_score'] = section_means.round(2)

## DB Connection

In [77]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname
#
# The URL contains the database password, so it is read from the DATABASE_URL
# environment variable rather than being written into the notebook.
# NOTE(review): the credential previously hard-coded in this cell should be
# treated as leaked and rotated.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Define it before running this cell, e.g. "
        "postgresql+psycopg2://user:password@host:5432/dbname?sslmode=require"
    )

# Create the SQLAlchemy engine used for the database write.
engine = create_engine(DATABASE_URL)

In [78]:
# Write the cleaned frame to nyc_schools.marcella_ralser_sat_results.
# if_exists='replace' drops and recreates the table on every run; the
# DataFrame index is not written as a column.
# NOTE(review): the task summary says to append the data - confirm that
# replacing the table is intended.
to_sql_options = dict(
    name='marcella_ralser_sat_results',
    con=engine,
    schema='nyc_schools',
    if_exists='replace',
    index=False,
)
df_sat.to_sql(**to_sql_options)

478

In [80]:
# Save a local copy of the cleaned data (no index column) in the working directory.
df_sat.to_csv(path_or_buf="cleaned_sat_results.csv", index=False)