# Importing Libraries

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from sqlalchemy import create_engine

# Loading the Dataset

In [2]:
# Location of the raw SAT results CSV.
# Set the SAT_RESULTS_CSV environment variable to point at your own copy;
# the fallback is the original author's local path, kept so existing runs still work.
SAT_RESULTS_CSV = os.environ.get(
    'SAT_RESULTS_CSV',
    '/Users/essamalasaad/Desktop/Webeet Day4/sat-results.csv',
)
df = pd.read_csv(SAT_RESULTS_CSV)

# Exploring The Dataset

In [3]:
# Exploring first 5 rows (head() returns 5 rows when called without an argument)
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [4]:
# Exploring the structure of the dataset: column names, non-null counts and dtypes.
# In the output below, the SAT test-taker and score columns are reported as
# `object` rather than a numeric dtype, so they need converting before analysis.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


# Cleaning the Data

In [5]:
# Normalise the column headers: trim surrounding whitespace, lowercase,
# turn spaces into underscores and remove periods ("Avg." -> "avg").
# Misspelled headers (e.g. 'readng') are NOT corrected by this step.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('.', '', regex=False)
)
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [6]:
# Remove rows that are exact duplicates of an earlier row (the first occurrence is kept),
# then show the resulting (rows, columns) shape.
df = df.drop_duplicates(keep='first')
df.shape

(478, 11)

In [7]:
# Keep only the first column for each repeated column *name*.
# Note: this compares names, not contents, so two columns holding the same
# values under different names both survive this step.
df = df.loc[:, ~df.columns.duplicated(keep='first')]
df.shape

(478, 11)

In [8]:
# Drop columns that are not needed for the analysis:
#   - contact_extension
#   - sat_critical_readng_avg_score (misspelled header)
df = df.drop(['contact_extension', 'sat_critical_readng_avg_score'], axis='columns')
df.shape

(478, 9)

In [9]:
# Coerce the count, score and rating columns to numeric dtypes.
# Any entry that cannot be parsed as a number becomes NaN (errors='coerce').
sat_columns = [
    'num_of_sat_test_takers',
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
    'academic_tier_rating',
]
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in sat_columns
})

In [10]:
# Clean Percentages
# Strip the '%' sign and convert to float.
# Casting to str first keeps this cell re-runnable: after the first run the
# column is already float, and calling .str on it directly would raise.
# errors='coerce' turns missing or unparseable entries into NaN instead of
# aborting the cell, the same policy used for the other numeric columns.
df['pct_students_tested'] = pd.to_numeric(
    df['pct_students_tested'].astype(str).str.replace('%', '', regex=False).str.strip(),
    errors='coerce',
)

In [11]:
# Filter out outliers (valid SAT score range: 200–800).
# A row is kept only if each of the three section averages is either missing
# or lies within 200..800 inclusive; missing values are handled in a later step.
score_cols = ['sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score']
section_scores = df[score_cols]
in_range = (section_scores >= 200) & (section_scores <= 800)
df = df[(in_range | section_scores.isna()).all(axis=1)]

In [12]:
# Keep only rows that have a value in every core SAT column.
# A single missing value among these four columns is enough to drop the row;
# missing values in other columns are left in place.
df_clean = df[
    df[[
        'num_of_sat_test_takers',
        'sat_critical_reading_avg_score',
        'sat_math_avg_score',
        'sat_writing_avg_score',
    ]].notna().all(axis=1)
]

In [13]:
# Final overview of cleaned data
# Prints the (rows, columns) shape, then the per-column non-null counts and dtypes.
print('df_clean shape:', df_clean.shape)
print('df_clean info:')
df_clean.info()
print('df_clean head:')
# Last expression of the cell, so the first rows are rendered as the cell output
df_clean.head()

df_clean shape: (416, 9)
df_clean info:
<class 'pandas.core.frame.DataFrame'>
Index: 416 entries, 0 to 477
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             416 non-null    object 
 1   school_name                     416 non-null    object 
 2   num_of_sat_test_takers          416 non-null    float64
 3   sat_critical_reading_avg_score  416 non-null    float64
 4   sat_math_avg_score              416 non-null    float64
 5   sat_writing_avg_score           416 non-null    float64
 6   internal_school_id              416 non-null    int64  
 7   pct_students_tested             313 non-null    float64
 8   academic_tier_rating            349 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 32.5+ KB
df_clean head:


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,218160,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,268547,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,236446,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,427826,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,672714,92.0,2.0


In [14]:
# Write the cleaned dataset to a CSV file in the current working directory.
# index=False leaves the DataFrame index out of the file.
df_clean.to_csv(path_or_buf='cleaned_sat_results.csv', index=False)

In [15]:
# Show the current working directory, i.e. the folder that the relative path
# 'cleaned_sat_results.csv' was resolved against when the file was written.
# (`os` is imported in the import cell at the top of the notebook.)
print(os.getcwd())


/Users/essamalasaad/Desktop/Internship/Webeet Intro week/Webeet Day4


# Design the Schema

In [16]:
# Connecting to the database
# SECURITY: never embed the connection string (user, password, host) in the
# notebook; anything saved here is shared with everyone who can read the file.
# Any credential that was ever committed in a notebook must be rotated.
# The URL is read from the DATABASE_URL environment variable, expected in the form:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/<database>?sslmode=require
Database_url = os.environ.get("DATABASE_URL")
if not Database_url:
    # Fail with a clear message instead of an opaque SQLAlchemy parsing error
    raise RuntimeError(
        "DATABASE_URL is not set. Export the PostgreSQL connection string "
        "as an environment variable before running this cell."
    )
engine = create_engine(Database_url)

In [17]:
# Write the cleaned DataFrame to PostgreSQL, into the `nyc_schools` schema.
# if_exists='replace' drops any existing table of the same name and recreates it;
# index=False leaves the DataFrame index out of the table.
# NOTE(review): the table name is mixed-case, so it presumably has to be
# double-quoted when queried in PostgreSQL — confirm.
df_clean.to_sql(
    name='Essam_alasaad_sat_results',
    con=engine,
    schema='nyc_schools',
    if_exists='replace',
    index=False,
)

416