## Populating Database

## Import Libraries

In [150]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")


## Database Connection

In [152]:
Database_URL = ("postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
                "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
                "?sslmode=require"
               )
engine = create_engine(Database_URL)

## Inspect and understand the dataset

In [154]:

# Load dataset
df = pd.read_csv('sat-results.csv')
df

# Quick overview
print(df.shape)          
print(df.columns)         
print(df.info())          
print(df.head())          
print(df.describe())     


(493, 11)
Index(['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers',
       'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score',
       'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score',
       'internal_school_id', 'contact_extension', 'pct_students_tested',
       'academic_tier_rating'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal

## Identify data issues

In [156]:
# Check duplicates
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Check missing values
print(df.isnull().sum())

# Check outliers 
from scipy import stats
numeric_cols = df.select_dtypes(include='number').columns
z_scores = stats.zscore(df[numeric_cols], nan_policy='omit')
outliers = (abs(z_scores) > 3).sum(axis=0)
print("Potential outliers per numeric column:\n", outliers)

# Check for formatting inconsistencies in string columns
string_cols = df.select_dtypes(include='object').columns
for col in string_cols:
    print(col, df[col].unique()[:10])  


Duplicate rows: 15
DBN                                  0
SCHOOL NAME                          0
Num of SAT Test Takers               0
SAT Critical Reading Avg. Score      0
SAT Math Avg. Score                  0
SAT Writing Avg. Score               0
SAT Critical Readng Avg. Score       0
internal_school_id                   0
contact_extension                  105
pct_students_tested                117
academic_tier_rating                91
dtype: int64
Potential outliers per numeric column:
 [0 0]
DBN ['01M292' '01M448' '01M450' '01M458' '01M509' '01M515' '01M539' '01M650'
 '01M696' '02M047']
SCHOOL NAME ['HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES'
 'UNIVERSITY NEIGHBORHOOD HIGH SCHOOL' 'EAST SIDE COMMUNITY SCHOOL'
 'FORSYTH SATELLITE ACADEMY' 'MARTA VALLE HIGH SCHOOL'
 'LOWER EAST SIDE PREPARATORY HIGH SCHOOL'
 'NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND MATH HIGH SCHOOL'
 'CASCADES HIGH SCHOOL' 'BARD HIGH SCHOOL EARLY COLLEGE'
 '47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SE

## Clean and preprocess the data

In [158]:
# Remove duplicates
df_clean = df.drop_duplicates().copy()
df_clean


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,s,733698,x234,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,s,976034,x345,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,s,556924,x123,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426,496,544514,x234,92%,2.0


## Drop Unuseful Columns

In [182]:
drop_columns_df = df_clean.drop(columns=["internal_school_id", "contact_extension", "SAT Critical Readng Avg. Score"])
drop_columns_df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,92%,2.0
...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426,92%,2.0


## Fill NaN in pct_students_tested

use the mean and fill the missing information(NaN) with the sum of all values divided by number of values.

In [162]:
df_columns = drop_columns_df.copy()
s = df_columns['pct_students_tested']\
  .astype(str)\
  .str.strip()\
  .str.replace('%', '', regex=False)

s = pd.to_numeric(s, errors='coerce')

mean_val = s.mean()
df_columns['pct_students_tested'] = (
    s.fillna(mean_val).round().astype('int64')
)
df_columns

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,404,363,355,78,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,423,366,383,85,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,402,370,377,85,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,401,359,414,92,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,433,384,390,92,2.0
...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,92,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,85,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,85,1.0
476,79Q950,GED PLUS s CITYWIDE,8,400,426,496,92,2.0


## Change columns names

the columns names should be the same

In [164]:
df_columns.columns = df_columns.columns.str.lower().str.replace(' ', '_')

In [166]:
df_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          478 non-null    object 
 3   sat_math_avg._score             478 non-null    object 
 4   sat_writing_avg._score          478 non-null    object 
 5   sat_critical_readng_avg._score  478 non-null    object 
 6   pct_students_tested             478 non-null    int64  
 7   academic_tier_rating            392 non-null    float64
dtypes: float64(1), int64(1), object(6)
memory usage: 33.6+ KB


## Columns Formatting

In [240]:

cols_to_int = [
    "num_of_sat_test_takers",
    "sat_critical_readng_avg._score",
    "sat_math_avg._score",
    "sat_writing_avg._score"
]
for col in cols_to_int:
    df_columns[col] = (
        df_columns[col]
        .replace(["s", "S"], np.nan)
        .astype(float)
    )
    mean_val = df_columns[col].mean()
    df_columns[col] = df_columns[col].fillna(mean_val).round().astype('float')
df_columns
 

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_math_avg._score,sat_writing_avg._score,sat_critical_readng_avg._score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,404.0,363.0,355.0,78,2.000000
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,423.0,366.0,383.0,85,3.000000
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,402.0,370.0,377.0,85,3.000000
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,401.0,359.0,414.0,92,4.000000
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,433.0,384.0,390.0,92,2.000000
...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,110.0,418.0,394.0,401.0,92,1.000000
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,110.0,418.0,394.0,401.0,85,4.000000
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,110.0,418.0,394.0,401.0,85,1.000000
476,79Q950,GED PLUS s CITYWIDE,8.0,400.0,426.0,496.0,92,2.000000


In [238]:
df_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          478 non-null    float64
 3   sat_math_avg._score             478 non-null    float64
 4   sat_writing_avg._score          478 non-null    float64
 5   sat_critical_readng_avg._score  478 non-null    float64
 6   pct_students_tested             478 non-null    int64  
 7   academic_tier_rating            478 non-null    float64
dtypes: float64(5), int64(1), object(2)
memory usage: 33.6+ KB


In [236]:
df_columns.describe()

Unnamed: 0,num_of_sat_test_takers,sat_math_avg._score,sat_writing_avg._score,sat_critical_readng_avg._score,pct_students_tested,academic_tier_rating
count,478.0,478.0,478.0,478.0,478.0,478.0
mean,110.282427,414.24477,393.987448,400.868201,84.692469,2.579082
std,145.945805,60.593784,55.02033,53.300987,4.94536,1.021312
min,6.0,312.0,286.0,279.0,78.0,1.0
25%,44.25,376.0,363.0,370.0,78.0,2.0
50%,68.0,403.5,387.0,397.5,85.0,2.579082
75%,110.0,426.75,403.0,411.75,85.0,3.0
max,1277.0,735.0,682.0,679.0,92.0,4.0


## Populating columns with scores below 200 and above 800

In [220]:
score_cols = [
    "sat_critical_readng_avg._score",
    "sat_math_avg._score",
    "sat_writing_avg._score"
]
for col in score_cols:
    df_columns.loc[
    (df_columns[col] < 200) | (df_columns[col] > 800), col
    ] = np.nan
    mean_val = df_columns[col].mean()
    df_columns[col] = df_columns[col].fillna(mean_val).round().astype('float')
df_columns

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_math_avg._score,sat_writing_avg._score,sat_critical_readng_avg._score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,404.0,363.0,355.0,78,2.000000
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,423.0,366.0,383.0,85,3.000000
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,402.0,370.0,377.0,85,3.000000
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,401.0,359.0,414.0,92,4.000000
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,433.0,384.0,390.0,92,2.000000
...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,110.0,418.0,394.0,401.0,92,1.000000
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,110.0,418.0,394.0,401.0,85,4.000000
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,110.0,418.0,394.0,401.0,85,1.000000
476,79Q950,GED PLUS s CITYWIDE,8.0,400.0,426.0,496.0,92,2.000000


In [234]:
df_columns.isnull().sum()

dbn                               0
school_name                       0
num_of_sat_test_takers            0
sat_math_avg._score               0
sat_writing_avg._score            0
sat_critical_readng_avg._score    0
pct_students_tested               0
academic_tier_rating              0
dtype: int64

In [232]:
df_columns['academic_tier_rating']

0      2.000000
1      3.000000
2      3.000000
3      4.000000
4      2.000000
         ...   
473    1.000000
474    4.000000
475    1.000000
476    2.000000
477    2.579082
Name: academic_tier_rating, Length: 478, dtype: float64

## Fill NaN in 'academic_tier_rating'] with mean

In [230]:
df_columns['academic_tier_rating'] = (
    df_columns['academic_tier_rating']
    .fillna(df_columns['academic_tier_rating'].mean())
)

In [222]:
df_cleaned = df_columns.copy()
df_cleaned

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_math_avg._score,sat_writing_avg._score,sat_critical_readng_avg._score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,404.0,363.0,355.0,78,2.000000
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,423.0,366.0,383.0,85,3.000000
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,402.0,370.0,377.0,85,3.000000
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,401.0,359.0,414.0,92,4.000000
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,433.0,384.0,390.0,92,2.000000
...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,110.0,418.0,394.0,401.0,92,1.000000
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,110.0,418.0,394.0,401.0,85,4.000000
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,110.0,418.0,394.0,401.0,85,1.000000
476,79Q950,GED PLUS s CITYWIDE,8.0,400.0,426.0,496.0,92,2.000000


In [228]:
df_cleaned.isnull().sum()

dbn                               0
school_name                       0
num_of_sat_test_takers            0
sat_math_avg._score               0
sat_writing_avg._score            0
sat_critical_readng_avg._score    0
pct_students_tested               0
academic_tier_rating              0
dtype: int64

## Export CSV

In [224]:
df_cleaned.to_csv('cleaned_sat_results.csv', index=False)

## Save to Database

In [None]:
df_cleaned.to_sql(
    name="darel-kigha_sat_results",
    con=engine,
    schema="nyc_schools",
    if_exists="replace",
    index=False
)