Day 4 — SAT Results ETL (Explore → Clean → Load)


In [None]:
Objective: Evaluate, clean, and integrate the SAT results into the existing PostgreSQL schema.



1. Normalize headers (lowercase, underscores, no special characters).

2. Remove duplicates.

3. Fix formatting issues (e.g., "85%" → 85).

4. Validate SAT scores (200–800 only; invalid → NaN).

5. Drop irrelevant columns if needed.

In [55]:
 
import pandas as pd
import re

# --- Configuration ---------------------------------------------------------
# Both paths are relative to the notebook's working directory.
file_path = "2012-sat-results.csv"  # Changed from "/mnt/data/2012-sat-results.csv"
cleaned_path = "cleaned_sat_results.csv"  # Changed from "/mnt/data/cleaned_sat_results.csv"

# SAT section-average columns (names as they appear AFTER header normalization)
# and the inclusive range of a valid section score.
sat_cols = ["sat_math_avg_score", "sat_critical_reading_avg_score", "sat_writing_avg_score"]
SAT_SCORE_MIN, SAT_SCORE_MAX = 200, 800


def _strip_percent(value):
    """Remove "%" from a string cell; return non-strings (NaN, numbers) unchanged."""
    return value.replace("%", "") if isinstance(value, str) else value


def _clean_percent_column(series):
    """Strip "%" signs from a text column and make it numeric when possible.

    Columns that contain no "%" are returned untouched, so missing values stay
    missing instead of being turned into the literal text "nan" (which is what
    a blanket ``astype(str)`` does).  A column that did contain "%" becomes
    numeric only if every non-missing value parses as a number; otherwise the
    stripped text is kept so no data is silently lost.
    """
    has_percent = series.map(lambda v: isinstance(v, str) and "%" in v)
    if not has_percent.any():
        return series

    stripped = series.map(_strip_percent)
    as_number = pd.to_numeric(stripped, errors="coerce")
    # Only adopt the numeric version when coercion introduced no new NaNs.
    if as_number.notna().sum() == stripped.notna().sum():
        return as_number
    return stripped


# --- Load --------------------------------------------------------------------
df = pd.read_csv(file_path)

# --- Normalize headers: lowercase, underscores, no special characters ---------
df.columns = (
    df.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace(r"[^a-z0-9_]", "", regex=True)
)

# --- Drop exact duplicate rows -----------------------------------------------
df = df.drop_duplicates()

# --- Fix percent formatting ("85%" -> 85) in text columns --------------------
for col in df.columns:
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = _clean_percent_column(df[col])

# --- Validate SAT scores -----------------------------------------------------
# Non-numeric entries are coerced to NaN, and so is anything outside 200-800.
# NOTE(review): num_of_sat_test_takers is left as text here; the recorded
# output shows placeholder values such as "s" in that column - confirm whether
# it should be converted before loading.
for col in sat_cols:
    if col in df.columns:
        scores = pd.to_numeric(df[col], errors="coerce")
        df[col] = scores.where(scores.between(SAT_SCORE_MIN, SAT_SCORE_MAX))

# --- Save --------------------------------------------------------------------
df.to_csv(cleaned_path, index=False)

print(f"Cleaned dataset saved to {cleaned_path}")
df.head()

Cleaned dataset saved to cleaned_sat_results.csv


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355.0,404.0,363.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383.0,423.0,366.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377.0,402.0,370.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414.0,401.0,359.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390.0,433.0,384.0


In [69]:
# Summarize the cleaned frame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          478 non-null    object 
 3   sat_critical_reading_avg_score  421 non-null    float64
 4   sat_math_avg_score              421 non-null    float64
 5   sat_writing_avg_score           421 non-null    float64
dtypes: float64(3), object(3)
memory usage: 22.5+ KB


In [72]:
import sqlite3

# Create the local SQLite table that mirrors the SAT score schema.
# sqlite3.connect() creates the database file if it does not exist yet.
conn = sqlite3.connect('your_database.db')
try:
    cursor = conn.cursor()

    # IF NOT EXISTS keeps this cell re-runnable: the database file persists on
    # disk, so a plain CREATE TABLE raises "table already exists" on the
    # second run (e.g. after Restart Kernel -> Run All).
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS yourname_sat_scores (
        dbn TEXT PRIMARY KEY,
        school_name TEXT,
        borough TEXT,
        sat_math_avg_score INT,
        sat_critical_reading_avg_score INT,
        sat_writing_avg_score INT
    )
    ''')

    conn.commit()
finally:
    # Always release the file handle, even if the DDL statement fails.
    conn.close()

print("Table created successfully")

Table created successfully


Python Script to Append Data into PostgreSQL

In [43]:
from sqlalchemy import create_engine

import os

from sqlalchemy.engine import URL

# Database connection settings.
# The password is read from the environment so it is never stored in the
# notebook or its version history.  A password that was previously committed
# here should be treated as exposed and rotated.
# The other settings fall back to the values this notebook used before and can
# be overridden with the standard PG* environment variables.
db_user = os.environ.get("PGUSER", "neondb_owner")
db_pass = os.environ.get("PGPASSWORD")
db_host = os.environ.get("PGHOST", "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "neondb")  # actual database name, not a CSV file

if not db_pass:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

# URL.create escapes each component, so special characters in the password
# (e.g. "@", "/", ":") cannot corrupt the connection string the way manual
# f-string formatting can.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=db_user,
        password=db_pass,
        host=db_host,
        port=int(db_port),
        database=db_name,
    )
)

# Append cleaned dataset to PostgreSQL.
# NOTE: if_exists="append" adds rows every time this cell runs; re-running it
# against a table that already holds these rows will insert them again.
df.to_sql("ucheokoli_sat_scores", engine, if_exists="append", index=False)

print("✅ Data appended successfully to PostgreSQL")

✅ Data appended successfully to PostgreSQL


In [56]:
# List the normalized column names as a plain Python list.
print(df.columns.tolist())


['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score']


In [63]:
# Keep the first occurrence of every fully identical row (all columns compared)
# in a separate frame, leaving df itself unchanged, then display it.
is_repeat_row = df.duplicated(keep="first")
new_df = df[~is_repeat_row]
new_df

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355.0,404.0,363.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383.0,423.0,366.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377.0,402.0,370.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414.0,401.0,359.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390.0,433.0,384.0
...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,,,
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,,,
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,,,
476,79Q950,GED PLUS s CITYWIDE,8,496.0,400.0,426.0


In [64]:
# Keep only the first row for each dbn value, then report the resulting
# (rows, columns) shape.
df = df.drop_duplicates(subset=["dbn"], keep="first")
df.shape


(478, 6)

In [67]:
# Re-check the number of missing values in each column.
df.isna().sum()

dbn                                0
school_name                        0
num_of_sat_test_takers             0
sat_critical_reading_avg_score    57
sat_math_avg_score                57
sat_writing_avg_score             57
dtype: int64

In [70]:
# (rows, columns) of the duplicate-free snapshot, for comparison with df.shape.
new_df.shape

(478, 6)