# Day 4 task - Populating Database

## 0. Importing libraries

In [133]:
import os
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
warnings.filterwarnings("ignore")

# Drop frames left over from a previous run of this notebook.
# A bare `del` raises NameError on a fresh kernel (nothing is defined yet),
# which breaks "Restart Kernel -> Run All"; pop() is a no-op in that case.
globals().pop("df", None)
globals().pop("new_df", None)

## 1. Read file

In [134]:
# Location of the raw SAT results export. Resolved from the current user's
# home directory instead of a hard-coded /Users/<name> prefix so the notebook
# also runs on other machines and accounts.
RAW_CSV_PATH = Path.home() / "Downloads" / "sat-results.csv"

df = pd.read_csv(RAW_CSV_PATH)
df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
488,27Q480,JOHN ADAMS HIGH SCHOOL,403,391,409,392,391,863765,,92%,1.0
489,13K605,GEORGE WESTINGHOUSE CAREER AND TECHNICAL EDUCA...,85,406,391,392,406,937579,x234,,
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0


## Explore the Dataset
- Identify which columns are useful and which are synthetic or dirty

### Columns that are not useful or need revision:
- the duplicated reading-score column whose header contains a typo (`SAT Critical Readng Avg. Score`)
- `contact_extension`
- `pct_students_tested`, as it is stored as a string (e.g. `78%`); it should be re-calculated or re-formatted as a number
- `internal_school_id` may be obsolete, as `DBN` is the more general identifier
- the scores and the number of test takers are stored as strings but should be integers


In [136]:
# Print column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


## 2. Clean Data

## 2.1 normalize headers

In [137]:
# Work on a copy so the raw frame `df` stays untouched.
new_df = df.copy()

# Header clean-up in one pass: spaces -> underscores, lower-case,
# then strip every character that is not a letter, digit or underscore.
new_df.columns = (
    new_df.columns
    .str.replace(" ", "_")
    .str.lower()
    .str.replace(r'[^A-Za-z0-9_]+', '', regex=True)
)
new_df

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
488,27Q480,JOHN ADAMS HIGH SCHOOL,403,391,409,392,391,863765,,92%,1.0
489,13K605,GEORGE WESTINGHOUSE CAREER AND TECHNICAL EDUCA...,85,406,391,392,406,937579,x234,,
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0


## 2.2 transform string columns to numeric

In [138]:
cols_to_convert = ["sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score"]

# Values that cannot be parsed as numbers are coerced to missing values.
# The test-taker count uses the nullable Int64 dtype so it can stay integral
# while still holding missing entries.
test_takers = pd.to_numeric(new_df["num_of_sat_test_takers"], errors='coerce')
new_df["num_of_sat_test_takers"] = test_takers.astype('Int64')

for col in cols_to_convert:
    new_df[col] = pd.to_numeric(new_df[col], errors='coerce')

new_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             493 non-null    object 
 1   school_name                     493 non-null    object 
 2   num_of_sat_test_takers          435 non-null    Int64  
 3   sat_critical_reading_avg_score  435 non-null    float64
 4   sat_math_avg_score              435 non-null    float64
 5   sat_writing_avg_score           435 non-null    float64
 6   sat_critical_readng_avg_score   493 non-null    object 
 7   internal_school_id              493 non-null    int64  
 8   contact_extension               388 non-null    object 
 9   pct_students_tested             376 non-null    object 
 10  academic_tier_rating            402 non-null    float64
dtypes: Int64(1), float64(4), int64(1), object(5)
memory usage: 43.0+ KB


## 2.3 Remove duplicates

In [139]:
# Keep the first occurrence of every row; rows that repeat an earlier row
# in all columns are discarded.
new_df = new_df[~new_df.duplicated()]
new_df

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355.0,404.0,363.0,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383.0,423.0,366.0,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377.0,402.0,370.0,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414.0,401.0,359.0,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390.0,433.0,384.0,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,,,,,s,733698,x234,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,,,,,s,976034,x345,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,,,,,s,556924,x123,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496.0,400.0,426.0,496,544514,x234,92%,2.0


### 15 duplicated records deleted

In [140]:
# Summary statistics for the numeric columns after de-duplication.
new_df.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,academic_tier_rating
count,421.0,421.0,421.0,421.0,478.0,392.0
mean,110.320665,400.850356,418.173397,393.985748,560082.717573,2.579082
std,155.534254,56.802783,88.210494,58.635109,259637.064755,1.128053
min,6.0,279.0,-10.0,286.0,101855.0,1.0
25%,41.0,368.0,372.0,360.0,337012.5,2.0
50%,62.0,391.0,395.0,381.0,581301.5,3.0
75%,95.0,416.0,438.0,411.0,778312.75,4.0
max,1277.0,679.0,1100.0,682.0,999398.0,4.0


## 2.4 Handle outliers for averages


In [141]:
cols_to_convert = ["sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score"]

# Averages outside the closed interval [200, 800] are treated as invalid
# and replaced with NaN; values inside it (bounds included) are kept.
for col in cols_to_convert:
    in_range = new_df[col].between(200, 800)
    new_df[col] = new_df[col].where(in_range)
new_df.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,academic_tier_rating
count,421.0,421.0,416.0,421.0,478.0,392.0
mean,110.320665,400.850356,413.733173,393.985748,560082.717573,2.579082
std,155.534254,56.802783,64.945638,58.635109,259637.064755,1.128053
min,6.0,279.0,312.0,286.0,101855.0,1.0
25%,41.0,368.0,372.0,360.0,337012.5,2.0
50%,62.0,391.0,395.0,381.0,581301.5,3.0
75%,95.0,416.0,437.25,411.0,778312.75,4.0
max,1277.0,679.0,735.0,682.0,999398.0,4.0


## 2.5 convert pct_students_tested to numeric

In [142]:
 
# Turn percentage strings such as "78%" into floats (78.0).
# Going through str + to_numeric keeps the cell idempotent: on a second run
# the column is already float, and the original `.str` accessor would raise
# AttributeError. Unparsable entries become NaN, matching the other
# numeric conversions in this notebook, instead of aborting the cell.
pct_as_text = new_df["pct_students_tested"].astype(str).str.rstrip('%')
new_df["pct_students_tested"] = pd.to_numeric(pct_as_text, errors='coerce').astype('float')
new_df.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,pct_students_tested,academic_tier_rating
count,421.0,421.0,416.0,421.0,478.0,363.0,392.0
mean,110.320665,400.850356,413.733173,393.985748,560082.717573,84.595041,2.579082
std,155.534254,56.802783,64.945638,58.635109,259637.064755,5.673305,1.128053
min,6.0,279.0,312.0,286.0,101855.0,78.0,1.0
25%,41.0,368.0,372.0,360.0,337012.5,78.0,2.0
50%,62.0,391.0,395.0,381.0,581301.5,85.0,3.0
75%,95.0,416.0,437.25,411.0,778312.75,92.0,4.0
max,1277.0,679.0,735.0,682.0,999398.0,92.0,4.0


## 2.6 finally dropping columns not required

In [143]:
# Columns that are not carried into the cleaned data set:
# the misspelled duplicate of the reading score, the contact extension
# and the internal school id.
cols_to_drop = ["sat_critical_readng_avg_score", "contact_extension", "internal_school_id"]

# Remove them in a single call and show the remaining schema
new_df = new_df.drop(columns=cols_to_drop)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          421 non-null    Int64  
 3   sat_critical_reading_avg_score  421 non-null    float64
 4   sat_math_avg_score              416 non-null    float64
 5   sat_writing_avg_score           421 non-null    float64
 6   pct_students_tested             363 non-null    float64
 7   academic_tier_rating            392 non-null    float64
dtypes: Int64(1), float64(5), object(2)
memory usage: 34.1+ KB


## 3.1 Store new_df as CSV file

In [146]:
# Output location for the cleaned data. Built from the current user's home
# directory instead of a hard-coded /Users/<name> prefix so the notebook is
# portable; the folder and file names are unchanged.
CLEANED_CSV_PATH = Path.home() / "Documents" / "Jupiter Notebooks" / "cleaned_sat_results.csv"

# index=False: the DataFrame index is not written as a column.
new_df.to_csv(CLEANED_CSV_PATH, index=False)

## 3.2 store in database

In [147]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname
#
# The URL contains the database password, so it must not be stored in the
# notebook. It is read from the DATABASE_URL environment variable instead.
# NOTE(review): the password that was previously committed here should be
# rotated, since it remains visible in the notebook's history.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Export it before starting Jupyter, e.g. "
        "postgresql+psycopg2://user:password@host:port/dbname?sslmode=require"
    )

# Create the engine; SQLAlchemy opens the actual connection lazily on first use.
engine = create_engine(DATABASE_URL)

In [148]:
# Destination table for the cleaned SAT results.
target_table = {"name": 'heike_reichert_sat_results', "schema": 'nyc_schools'}

# Write the frame through the engine; an existing table of the same name is
# replaced and the DataFrame index is not stored as a column.
new_df.to_sql(con=engine, if_exists='replace', index=False, **target_table)

478