# 🧮 Day 4 – Data Integration & Schema Design

In [31]:
import pandas as pd
import numpy as np
import re

# Load the raw SAT results export, then show its dimensions and first rows.
raw_csv_path = "/Users/mariannagokova/anaconda_projects/1782611f-cdfa-4664-bd89-a0f7605dfe0d/day4/sat-results.csv"
df = pd.read_csv(raw_csv_path)
print(df.shape)
df.head()

(493, 11)


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [32]:
# Summarize every column's dtype and non-null count before any cleaning is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


In [33]:
# Normalize the header to snake_case in three passes:
#   1. trim surrounding whitespace and lowercase every name,
#   2. collapse each run of whitespace into a single "_",
#   3. drop anything that is not a word character or underscore (dots, parentheses, ...).
normalized_names = df.columns.str.strip().str.lower()
normalized_names = normalized_names.str.replace(r"\s+", "_", regex=True)
normalized_names = normalized_names.str.replace(r"[^\w_]", "", regex=True)
df.columns = normalized_names

In [34]:
# Columns carried forward into the cleaned dataset; anything not listed here is dropped.
keep = [
        "dbn",
        "school_name",
        "num_of_sat_test_takers",
        "sat_critical_reading_avg_score",
        "sat_math_avg_score",
        "sat_writing_avg_score",
        "pct_students_tested",
        "academic_tier_rating"
    ]
# Select only the listed columns that are actually present, and take an explicit
# copy: later cells assign into this frame column by column, and assigning into
# a selection of another frame can raise SettingWithCopyWarning.
df = df[[c for c in keep if c in df.columns]].copy()


In [35]:
# Normalization of the key columns.
if "dbn" in df.columns:
    dbn_text = df["dbn"].astype(str).str.strip().str.upper()
    # After upper-casing, keep only digits and capital letters in the identifier.
    df["dbn"] = dbn_text.str.replace(r"[^0-9A-Z]", "", regex=True)

if "school_name" in df.columns:
    name_text = df["school_name"].astype(str).str.strip()
    # Collapse every internal run of whitespace into a single space.
    df["school_name"] = name_text.str.replace(r"\s+", " ", regex=True)

In [36]:
# Treat cells whose entire value is the placeholder "s" or "S" as missing, in every column.
df = df.replace(["s", "S"], np.nan)

In [37]:

# Numeric columns: coerce to real numbers so that no stray text (such as "s") survives.
num_cols = [
    "num_of_sat_test_takers",
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
    "academic_tier_rating",
]
# Leftovers that can remain once every non-numeric character has been stripped.
blank_tokens = {"": np.nan, ".": np.nan, "-": np.nan}
for col in num_cols:
    if col not in df.columns:
        continue
    # Keep digits, dots and minus signs only, then blank out the empty leftovers.
    stripped = df[col].astype(str).str.replace(r"[^0-9\.\-]", "", regex=True)
    df[col] = stripped.replace(blank_tokens)
    # Whatever still fails to parse becomes NaN.
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [41]:
# Range filter: any value outside its inclusive [low, high] interval becomes NaN.
valid_ranges = {
    "sat_critical_reading_avg_score": (200, 800),
    "sat_math_avg_score": (200, 800),
    "sat_writing_avg_score": (200, 800),
    "academic_tier_rating": (1, 4),
}
for range_col, (low, high) in valid_ranges.items():
    if range_col not in df.columns:
        continue
    # between() is False for NaN, so missing values simply stay missing.
    out_of_range = ~df[range_col].between(low, high)
    df.loc[out_of_range, range_col] = np.nan

In [42]:
# Check the column dtypes after numeric coercion and range filtering.
print(df.dtypes)   


dbn                                object
school_name                        object
num_of_sat_test_takers            float64
sat_critical_reading_avg_score    float64
sat_math_avg_score                float64
sat_writing_avg_score             float64
pct_students_tested                object
academic_tier_rating              float64
dtype: object


In [45]:

# Percent column: turn strings such as "78%" into fractions (0.78).
if "pct_students_tested" in df.columns:
    pct_text = df["pct_students_tested"].astype(str)
    # Remember which entries carried an explicit "%" sign BEFORE stripping it:
    # those are always percentages, even when the number is <= 1
    # ("1%" must become 0.01, not stay 1.0).
    had_pct_sign = pct_text.str.contains("%", regex=False)
    pct_value = pd.to_numeric(
        pct_text.str.replace("%", "", regex=False).str.strip(), errors="coerce"
    )
    # Entries without a "%" sign keep the original heuristic: a number above 1
    # is assumed to be on a 0-100 scale, anything else is already a fraction.
    # This also keeps the cell safe to re-run on already converted values.
    on_percent_scale = had_pct_sign | (pct_value > 1)
    pct_value = pct_value.where(~on_percent_scale, pct_value / 100)
    # replace 0 with NaN (since 0 means no data)
    pct_value = pct_value.mask(pct_value == 0)
    df["pct_students_tested"] = pct_value

In [47]:
# Pin the final dtypes: every numeric column except the test-taker count is float64.
float_cols = [c for c in num_cols if c != "num_of_sat_test_takers" and c in df.columns]
for col in float_cols:
    df[col] = df[col].astype("float64")


# The test-taker count becomes a nullable integer, so missing values are kept
# as missing while the column stays integer typed.
if "num_of_sat_test_takers" in df.columns:
    takers = pd.to_numeric(df["num_of_sat_test_takers"], errors="coerce")
    df["num_of_sat_test_takers"] = takers.astype("Int64")

In [48]:
# Keep only the rows that report at least one of the three SAT section scores.
sat_cols = [
    c
    for c in ["sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score"]
    if c in df.columns
]
if sat_cols:
    has_any_score = df[sat_cols].notna().any(axis=1)
    df = df[has_any_score]

# One row per dbn: the first occurrence wins, later repeats are removed.
if "dbn" in df.columns:
    repeated_dbn = df.duplicated(subset=["dbn"])
    df = df[~repeated_dbn]

In [50]:
# Re-inspect dtypes and non-null counts now that empty and duplicate rows are gone.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             421 non-null    object 
 1   school_name                     421 non-null    object 
 2   num_of_sat_test_takers          421 non-null    Int64  
 3   sat_critical_reading_avg_score  421 non-null    float64
 4   sat_math_avg_score              416 non-null    float64
 5   sat_writing_avg_score           421 non-null    float64
 6   pct_students_tested             317 non-null    float64
 7   academic_tier_rating            352 non-null    float64
dtypes: Int64(1), float64(5), object(2)
memory usage: 30.0+ KB


In [51]:
# Summary statistics of the numeric columns, as a sanity check on the cleaned value ranges.
df.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
count,421.0,421.0,416.0,421.0,317.0,352.0
mean,110.320665,400.850356,413.733173,393.985748,0.846688,2.585227
std,155.534254,56.802783,64.945638,58.635109,0.057104,1.118917
min,6.0,279.0,312.0,286.0,0.78,1.0
25%,41.0,368.0,372.0,360.0,0.78,2.0
50%,62.0,391.0,395.0,381.0,0.85,3.0
75%,95.0,416.0,437.25,411.0,0.92,4.0
max,1277.0,679.0,735.0,682.0,0.92,4.0


In [52]:
# Renumber the rows from 0 and write the cleaned frame to disk without the index column.
cleaned_csv_path = "/Users/mariannagokova/anaconda_projects/1782611f-cdfa-4664-bd89-a0f7605dfe0d/day4/cleaned_sat_results.csv"
df_sat_cleaned = df.reset_index(drop=True)
df_sat_cleaned.to_csv(cleaned_csv_path, index=False)
print("✅ saved cleaned CSV")

✅ saved cleaned CSV


In [53]:
# DB connection setup.
# Connection settings come from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT). The non-secret settings fall
# back to the onboarding defaults; the password is never stored in the notebook:
# when PGPASSWORD is not set it is prompted for interactively.
# NOTE(review): a password was previously committed in this cell and should be
# rotated, since it remains in the notebook's history.
import getpass
import os

import psycopg2
from psycopg2.extras import execute_values

conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "neondb"),
    user=os.environ.get("PGUSER", "neondb_owner"),
    password=os.environ.get("PGPASSWORD") or getpass.getpass("Postgres password: "),
    host=os.environ.get(
        "PGHOST", "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech"
    ),
    port=os.environ.get("PGPORT", "5432"),
    sslmode="require",
)
cur = conn.cursor()

In [54]:
# DDL for the destination table of the cleaned SAT results.
# dbn is the primary key, school_name is mandatory, and every metric column is
# nullable. IF NOT EXISTS makes the statement a no-op when the table is already
# there, so an existing table definition is left unchanged.
create_table_sql = """
CREATE TABLE IF NOT EXISTS nyc_schools.marianna_gokova_cleaned_sat_results (
  dbn                      VARCHAR(15) PRIMARY KEY,  
  school_name              TEXT NOT NULL,            
  num_of_sat_test_takers   INT,                     
  sat_critical_reading_avg_score      REAL,                   
  sat_math_avg_score       REAL,                     
  sat_writing_avg_score    REAL,                     
  pct_students_tested      REAL,                     
  academic_tier_rating     REAL                      
);

"""
# Run the DDL on the open cursor and commit it.
cur.execute(create_table_sql)
conn.commit()
print("✅ Table created (if not exists)")

✅ Table created (if not exists)


In [55]:
# Reload the cleaned CSV from disk; this frame is what gets inserted into the database.
cleaned_csv_path = "/Users/mariannagokova/anaconda_projects/1782611f-cdfa-4664-bd89-a0f7605dfe0d/day4/cleaned_sat_results.csv"
df = pd.read_csv(cleaned_csv_path)

In [56]:
# Bulk-insert the cleaned rows; a dbn that already exists in the table is skipped.
cols = list(df.columns)
sql = f"""
INSERT INTO nyc_schools.marianna_gokova_cleaned_sat_results ({','.join(cols)})
VALUES %s
ON CONFLICT (dbn) DO NOTHING;
"""
# to_records().tolist() yields plain Python scalars, but missing values are
# still float NaN at this point. psycopg2 would send those as a floating-point
# NaN rather than SQL NULL, so map every missing value to None first.
# Rows loaded earlier with NaN are not corrected here, because conflicting
# dbn values are skipped (ON CONFLICT ... DO NOTHING).
rows = [
    tuple(None if pd.isna(value) else value for value in record)
    for record in df.to_records(index=False).tolist()
]
try:
    execute_values(cur, sql, rows)
    conn.commit()
except Exception:
    # Leave the shared connection usable for later cells instead of stuck in
    # an aborted transaction, then surface the original error.
    conn.rollback()
    raise

In [57]:
# Spot-check the load by reading the first five rows back from the table.
preview_query = "SELECT * FROM nyc_schools.marianna_gokova_cleaned_sat_results LIMIT 5;"
pd.read_sql(preview_query, conn)

  pd.read_sql("SELECT * FROM nyc_schools.marianna_gokova_cleaned_sat_results LIMIT 5;", conn)


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355.0,404.0,363.0,0.78,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383.0,423.0,366.0,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377.0,402.0,370.0,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414.0,401.0,359.0,0.92,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390.0,433.0,384.0,0.92,2.0
