Load the dataset

In [11]:
import os
import pandas as pd

# Get your home directory dynamically
home = os.path.expanduser("~")

# Build path to Downloads folder
file_path = os.path.join(home, "Downloads", "sat-results.csv")

# Load the file
df = pd.read_csv(file_path)

print("File loaded successfully:", df.shape)
df.head()


File loaded successfully: (493, 11)


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


493 rows ,The dataset has 493 entries (schools or records).

11 columns ,Each row has 11 attributes (fields), such as DBN, school name, SAT scores, etc.

Exploring the already loaded dataframe

In [12]:
print(df.shape)
print(df.columns)
df.head()


(493, 11)
Index(['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers',
       'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score',
       'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score',
       'internal_school_id', 'contact_extension', 'pct_students_tested',
       'academic_tier_rating'],
      dtype='object')


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


CLEAN THE DATA : 
# - Normalize headers
# - Remove % and commas
# - Convert to numeric
# - Drop duplicates
# - Validate SAT scores (200–800)
# - Save a cleaned CSV file

In [13]:
import pandas as pd

# === 1. Load raw SAT results ===
df = pd.read_csv("sat-results.csv")

print("Original shape:", df.shape)
print("Original columns:", df.columns.tolist())
print(df.head(3))


# === 2. Normalize headers ===
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace(r"[^a-z0-9_]", "", regex=True)
)

print("\nNormalized columns:", df.columns.tolist())


# === 3. Remove % signs, convert to numeric ===
for col in df.columns:
    if df[col].dtype == "object":
        # Try cleaning percentages
        df[col] = (
            df[col]
            .astype(str)
            .str.replace("%", "", regex=False)
            .str.replace(",", "", regex=False)
        )

        # Convert to numeric where possible
        df[col] = pd.to_numeric(df[col], errors="ignore")


# === 4. Drop duplicates ===
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"\nDropped {before - after} duplicate rows.")


# === 5. Handle SAT score columns (200–800 valid range) ===
sat_cols = [c for c in df.columns if "sat" in c]

for col in sat_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].where((df[col] >= 200) & (df[col] <= 800))


# === 6. Inspect cleaned data ===
print("\nCleaned data preview:")
print(df.head(5))
print("\nSAT score stats:")
print(df[sat_cols].describe())


# === 7. Save cleaned version ===
df.to_csv("cleaned_sat_results.csv", index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_sat_results.csv'")


Original shape: (493, 11)
Original columns: ['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']
      DBN                                    SCHOOL NAME  \
0  01M292  HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES   
1  01M448            UNIVERSITY NEIGHBORHOOD HIGH SCHOOL   
2  01M450                     EAST SIDE COMMUNITY SCHOOL   

  Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score  \
0                     29                             355                 404   
1                     91                             383                 423   
2                     70                             377                 402   

  SAT Writing Avg. Score SAT Critical Readng Avg. Score  internal_school_id  \
0                    363                            355   

  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")


weiter cleaning

In [16]:
import pandas as pd

# Load dataset (update path as needed)
df = pd.read_csv("sat-results.csv")

# Columns for SAT scores
sat_columns = [
    "sat_math_avg_score",
    "sat_critical_reading_avg_score",
    "sat_writing_avg_score"
]

# Clean SAT columns
for col in sat_columns:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.replace("%", "", regex=False)
            .str.replace(",", "", regex=False)
        )
        df[col] = pd.to_numeric(df[col], errors="coerce")

        # Keep only valid SAT scores (200–800)
        df.loc[(df[col] < 200) | (df[col] > 800), col] = None

# Drop rows wit


In [24]:
print(df.columns.tolist())


['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


In [None]:
CHECK COLUMN NAMES

In [26]:
# Normalize column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace(r"[^a-z0-9_]", "", regex=True)  # remove non-alphanumeric chars
)

# Check new column names
print(df.columns.tolist())


['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score', 'sat_critical_readng_avg_score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


Exploratory analysis tool for the cleaned SAT dataset.

In [29]:
import pandas as pd

# Assuming df_clean is ready from the cleaning step
# Add a total SAT score column
df_clean["total_sat"] = (
    df_clean["sat_critical_reading_avg_score"] +
    df_clean["sat_math_avg_score"] +
    df_clean["sat_writing_avg_score"]
)

# 1. Schools with valid SAT scores
valid_schools = df_clean["dbn"].nunique()
print("✅ Schools with valid SAT scores:", valid_schools)

# 2. Average SAT scores
avg_scores = df_clean[["sat_critical_reading_avg_score",
                       "sat_math_avg_score",
                       "sat_writing_avg_score",
                       "total_sat"]].mean().round(2)
print("\n📊 Average SAT Scores:")
print(avg_scores)

# 3. Top 10 schools by total SAT
top10 = df_clean.nlargest(10, "total_sat")[["school_name", "total_sat"]]
print("\n🏆 Top 10 Schools by Total SAT:")
print(top10)

# 4. Worst 10 schools by total SAT
bottom10 = df_clean.nsmallest(10, "total_sat")[["school_name", "total_sat"]]
print("\n⚠️ Bottom 10 Schools by Total SAT:")
print(bottom10)

# 5. Distribution summary for Math
print("\n📈 Math Score Distribution:")
print(df_clean["sat_math_avg_score"].describe())


✅ Schools with valid SAT scores: 421

📊 Average SAT Scores:
sat_critical_reading_avg_score     400.71
sat_math_avg_score                 413.34
sat_writing_avg_score              393.71
total_sat                         1208.16
dtype: float64

🏆 Top 10 Schools by Total SAT:
                                           school_name  total_sat
48                              STUYVESANT HIGH SCHOOL     2096.0
198                       BRONX HIGH SCHOOL OF SCIENCE     1969.0
459                STATEN ISLAND TECHNICAL HIGH SCHOOL     1953.0
206  HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE     1920.0
396                        TOWNSEND HARRIS HIGH SCHOOL     1910.0
427  QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...     1868.0
8                       BARD HIGH SCHOOL EARLY COLLEGE     1856.0
107  HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN...     1847.0
249                     BROOKLYN TECHNICAL HIGH SCHOOL     1833.0
33                       ELEANOR ROOSEVELT HIGH SCHOOL     1758.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["total_sat"] = (


CHECK IF the CLEAN DATA EXIST

In [31]:
import pandas as pd
df=pd.read_csv('sat-results.csv')
df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
488,27Q480,JOHN ADAMS HIGH SCHOOL,403,391,409,392,391,863765,,92%,1.0
489,13K605,GEORGE WESTINGHOUSE CAREER AND TECHNICAL EDUCA...,85,406,391,392,406,937579,x234,,
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0


CHECK DUPLICATES and drop them

In [32]:
new_df=df.drop_duplicates()
new_df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,s,733698,x234,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,s,976034,x345,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,s,556924,x123,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426,496,544514,x234,92%,2.0


In [None]:
CONNECT WITH NEON

In [35]:
from sqlalchemy import create_engine

engine = create_engine(
    "postgresql+psycopg2://username:password@localhost:5432/dbname"
)


In [39]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname

DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

# Create engine and establish connection
engine = create_engine(DATABASE_URL)

In [40]:
new_df.to_sql(
    name='vanessa_sat_results',       
    con=engine,     
    schema='nyc_schools',
    if_exists='replace',    
    index=False            
)

478