# Day 4 Task: SAT Modelling

## 1. Loading Libraries & Connecting data

In [1]:
import pandas as pd

In [2]:
# Load the raw SAT results export into the working frame and preview it.
raw_csv_path = 'sat-results.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## 2. Data Cleaning

### Column names
- All lowercase

- Replacing blank spaces (and any other run of non-alphanumeric characters, e.g. the '.' in 'Avg.') with '_'

In [3]:
# Normalise the headers in two steps: trim + lowercase first, then collapse every
# run of non-alphanumeric characters (spaces, dots, ...) into a single underscore.
trimmed_lower = df.columns.str.strip().str.lower()
df.columns = trimmed_lower.str.replace(r'[^0-9A-Za-z]+', '_', regex=True)
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


### Dropping duplicates
- Using dbn as the unique identifier to drop duplicates

In [5]:
# Flag every row whose dbn has already appeared, then keep only the
# first occurrence of each dbn.
is_repeat_dbn = df.duplicated(subset=["dbn"], keep="first")
df = df[~is_repeat_dbn]
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [8]:
# Reconcile the two spellings of the critical-reading column: the correct
# "reading" and the typo "readng". The typo column is dropped only when it
# carries exactly the same values as the correctly spelled one.
correct_col = "sat_critical_reading_avg_score"
typo_col = "sat_critical_readng_avg_score"

if correct_col in df.columns and typo_col in df.columns:
    # Use ONE definition of "different" for both the decision and the report.
    # Two missing values in the same row count as a match: a plain `!=` would
    # flag them as different because NaN != NaN.
    both_missing = df[correct_col].isna() & df[typo_col].isna()
    mismatch = ~(df[correct_col].eq(df[typo_col]) | both_missing)

    if not mismatch.any():
        # If identical, drop the typo column
        df = df.drop(columns=[typo_col])
        print("Dropped typo column (values were identical).")
    else:
        # If not identical, keep the differing rows around for inspection
        differences = df[mismatch]
        print(f"Columns differ in {len(differences)} rows. Please investigate before dropping.")
else:
    # Also reached when this cell is re-run after the typo column was dropped.
    print("One or both columns are missing — no action taken.")

One or both columns are missing — no action taken.


There are 2 columns with the same name, except for a typo. 
1. "sat_critical_reading_avg_score"

2. "sat_critical_readng_avg_score" - there is a missing 'i' in reading

I wrote a script to check whether the two columns are identical. If they are, the typo column is dropped automatically; if they are not, a message is printed asking me to investigate the anomaly further.

Note: when I first ran this code it dropped the column with the typo, as it was a duplicate of the first column. I then ran the cell a second time to double-check, which is why the output shows the message "One or both columns are missing — no action taken." — the duplicate column had already been dropped.

## 3. Selecting columns for the cleaned dataset

In [13]:
# Columns that make up the cleaned dataset, in output order.
keep_cols = [
    "dbn",
    "school_name",
    "num_of_sat_test_takers",
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
    "internal_school_id",
    "contact_extension",
    "pct_students_tested",
    "academic_tier_rating"
]

# Fail with a message that names the missing columns (e.g. when the header
# normalisation cell has not been run yet).
missing_cols = [col for col in keep_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Expected columns not found in df: {missing_cols}")

# .copy() makes the selection an independent frame, so later column
# assignments do not raise SettingWithCopyWarning.
df = df[keep_cols].copy()

In [14]:
# Sanity check: display the first rows of the current frame.
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,218160,x345,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,427826,x123,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,672714,x123,92.0,2.0


## 4. Checking SAT scores
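Converting the test-taker count and the SAT score columns to numeric (float) values so the average scores can be validated and compared.

Valid scores are between 200 and 800; any score outside that range is replaced with a missing value.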

In [4]:
# Columns that must hold numbers. Any value that cannot be parsed as a number
# is turned into NaN instead of raising.
float_cols = [
    "num_of_sat_test_takers",
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
]

for numeric_col in float_cols:
    coerced = pd.to_numeric(df[numeric_col], errors="coerce")
    df[numeric_col] = coerced

In [5]:
# Blank out any section average that falls outside the 200-800 range.
score_cols = ("sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score")
for score_col in score_cols:
    if score_col not in df.columns:
        continue  # nothing to validate for a column that is not present
    too_low = df[score_col] < 200
    too_high = df[score_col] > 800
    df.loc[too_low | too_high, score_col] = None


In [6]:
sat_cols = ["sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score"]

# Drop rows where all three section averages are missing. .copy() makes the
# result an independent frame so the assignment below does not raise
# SettingWithCopyWarning.
df = df.dropna(subset=sat_cols, how="all").copy()

# Convert "78%"-style text to the fraction 0.78.
# The dtype guard makes this cell safe to re-run: a numeric column is assumed
# to have been converted already, so it is not divided by 100 a second time.
pct_raw = df["pct_students_tested"]
if not pd.api.types.is_numeric_dtype(pct_raw):
    pct_text = pct_raw.astype(str).str.replace("%", "", regex=False)
    # Missing values become the text "nan" above and are coerced back to NaN here.
    df["pct_students_tested"] = pd.to_numeric(pct_text, errors="coerce") / 100

In [7]:
# Sanity check: display the first rows of the current frame.
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,355,218160,x345,0.78,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,414,427826,x123,0.92,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,390,672714,x123,0.92,2.0


In [8]:
# Write the cleaned frame to disk; index=False keeps the pandas row index out of the file.
cleaned_csv_path = "cleaned_sat_results.csv"
df.to_csv(cleaned_csv_path, index=False)

## 5. Appending the data

In [9]:
import getpass
import os

import pandas as pd
import psycopg2
from psycopg2.extras import execute_batch


# --- Load and re-validate the cleaned CSV ------------------------------------
df = pd.read_csv("cleaned_sat_results.csv")

df["num_of_sat_test_takers"] = pd.to_numeric(df["num_of_sat_test_takers"], errors="coerce")

sat_cols = [
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score"
]

for col in sat_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
    # Scores outside 200-800 are treated as missing. This runs BEFORE the
    # dropna below so a row whose only scores are invalid is dropped too.
    df[col] = df[col].where(df[col].between(200, 800, inclusive="both"))

# Drop rows where all three section averages are missing.
df = df.dropna(subset=sat_cols, how="all").copy()

# Only convert when the column is still text (e.g. "78%"); a numeric column is
# assumed to already hold fractions. Unparseable values become NaN.
if not pd.api.types.is_numeric_dtype(df["pct_students_tested"]):
    df["pct_students_tested"] = pd.to_numeric(
        df["pct_students_tested"].astype(str).str.replace("%", "", regex=False),
        errors="coerce",
    ) / 100


# --- Build the parameter rows -------------------------------------------------
# DataFrame column order here must match the column list in insert_query.
insert_cols = [
    "dbn",
    "school_name",
    "num_of_sat_test_takers",
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
    "internal_school_id",
    "contact_extension",
    "pct_students_tested",
    "academic_tier_rating",
]

# Missing values must be sent as None so they are stored as SQL NULL;
# a float NaN would be stored as the value NaN instead.
rows = [
    tuple(None if pd.isna(value) else value for value in record)
    for record in df[insert_cols].itertuples(index=False, name=None)
]


insert_query = """
INSERT INTO nyc_schools.isabella_leach_sat_results (
    dbn, school_name, num_sat_test_takers,
    sat_critical_reading_avg_score, sat_math_avg_score, sat_writing_avg_score,
    internal_school_id, contact_extension, pct_students_tested, academic_tier_rating
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (dbn) DO NOTHING;
"""
count_query = "SELECT COUNT(*) FROM nyc_schools.isabella_leach_sat_results;"


# --- Connect and load ---------------------------------------------------------
# SECURITY: the password is never stored in the notebook. It is read from the
# PGPASSWORD environment variable, or prompted for interactively.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "neondb"),
    user=os.environ.get("PGUSER", "neondb_owner"),
    password=os.environ.get("PGPASSWORD") or getpass.getpass("Database password: "),
    host=os.environ.get("PGHOST", "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech"),
    port=os.environ.get("PGPORT", "5432"),
    sslmode="require"
)

try:
    # `with conn` commits on success and rolls back if anything raises;
    # `with conn.cursor()` closes the cursor afterwards.
    with conn:
        with conn.cursor() as cur:
            cur.execute(count_query)
            rows_before = cur.fetchone()[0]

            execute_batch(cur, insert_query, rows)

            cur.execute(count_query)
            rows_after = cur.fetchone()[0]
finally:
    # `with conn` does not close the connection, so release it explicitly.
    conn.close()

# ON CONFLICT DO NOTHING silently skips rows whose dbn already exists, so the
# number submitted and the number actually inserted can differ.
print(f" {len(rows)} rows submitted in batch, {rows_after - rows_before} newly inserted")
print("Total rows in table:", rows_after)




 435 rows inserted in batch
Total rows in table: 421
