In [4]:
# Preparing for work: importing the libraries used in this notebook (pandas, psycopg2, SQLAlchemy, warnings).

import os
import warnings
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine
warnings.filterwarnings("ignore")

In [5]:
# Read sat-results.csv into a data frame and display it.

df = pd.read_csv(filepath_or_buffer='../day_4_datasets/sat-results.csv')
df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
488,27Q480,JOHN ADAMS HIGH SCHOOL,403,391,409,392,391,863765,,92%,1.0
489,13K605,GEORGE WESTINGHOUSE CAREER AND TECHNICAL EDUCA...,85,406,391,392,406,937579,x234,,
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0


In [6]:
# Dropping rows that are exact repeats of an earlier row (the first occurrence is kept).

df = df[~df.duplicated(keep='first')]
df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,s,733698,x234,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,s,976034,x345,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,s,556924,x123,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426,496,544514,x234,92%,2.0


In [7]:
# Normalising column names: lower-cased, trimmed of leading/trailing whitespace,
# spaces turned into underscores and dots removed.

new_column_names = [
    raw_name.lower().strip().replace(' ', '_').replace('.', '')
    for raw_name in df.columns
]

new_column_names

['dbn',
 'school_name',
 'num_of_sat_test_takers',
 'sat_critical_reading_avg_score',
 'sat_math_avg_score',
 'sat_writing_avg_score',
 'sat_critical_readng_avg_score',
 'internal_school_id',
 'contact_extension',
 'pct_students_tested',
 'academic_tier_rating']

In [8]:
# Relabelling the data frame's columns with the cleaned names and previewing the result.

df = df.set_axis(new_column_names, axis=1)
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [9]:
# Dropping the misspelled "readng" copy of the critical reading score column, then displaying the frame.

df = df.drop(labels=["sat_critical_readng_avg_score"], axis="columns")
df

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,733698,x234,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,976034,x345,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,556924,x123,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426,544514,x234,92%,2.0


In [10]:
# Capitalizing the first letter of each word in the school_name column (Title Case).
# str.title() treats an apostrophe as a word boundary, so a possessive such as
# "WOMEN'S" would come out as "Women'S"; the second step lowers that trailing
# 'S again. Only a standalone 'S suffix is touched, so names like "O'Neill" keep
# their capital.

df['school_name'] = (
    df['school_name']
    .str.title()
    .str.replace(r"(?<=[A-Za-z])'S\b", "'s", regex=True)
)


In [11]:
# Parsing the count/score columns as numbers. Anything that cannot be parsed becomes
# missing (errors='coerce'), and the nullable 'Int64' dtype keeps the columns as
# integers instead of floats even when values are missing.

columns_to_numeric = [
    'num_of_sat_test_takers',
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]

df[columns_to_numeric] = (
    df[columns_to_numeric]
    .apply(lambda values: pd.to_numeric(values, errors='coerce'))
    .astype({name: 'Int64' for name in columns_to_numeric})
)

In [12]:
# A count of test takers cannot be negative: keep only entries that are not below 0,
# everything else becomes missing.

df["num_of_sat_test_takers"] = df["num_of_sat_test_takers"].where(
    ~df["num_of_sat_test_takers"].lt(0)
)

In [13]:
# SAT section scores are kept only when they lie within 200-800 (inclusive);
# anything outside that range becomes missing.

sat_score_columns = [
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]

df[sat_score_columns] = df[sat_score_columns].where(
    df[sat_score_columns].ge(200) & df[sat_score_columns].le(800)
)


In [14]:
# Dropping the 'contact_extension' column: it carries no useful information for this analysis.

df = df.drop(labels="contact_extension", axis=1)

In [15]:
# Cleaning pct_students_tested column: "78%" -> 0.78.
#
# pd.to_numeric(errors='coerce') turns 'N/A' and any other non-numeric token into
# NaN. A plain .astype('float') raises on such values (including pd.NA inside an
# object column), so placeholders are coerced here instead of being replaced first.

df['pct_students_tested'] = (
    pd.to_numeric(
        df['pct_students_tested']
        .str.strip()                         # Remove any extra whitespace
        .str.replace('%', '', regex=False),  # Remove '%' sign
        errors='coerce',                     # Non-numeric leftovers (e.g. 'N/A') become NaN
    )
    / 100                                    # Scale percentage to 0-1
)

In [16]:
# Storing academic_tier_rating as nullable integers (Int64) rather than floats.

df = df.astype({'academic_tier_rating': pd.Int64Dtype()})

In [17]:
# Finally uploading the data frame as a table to SQL
#
# Connection settings are read from the standard libpq environment variables so
# that no password is stored in the notebook. PGPASSWORD must be set; the other
# settings fall back to the values this notebook has always used.
# NOTE(review): a password that was ever committed in a notebook should be
# treated as leaked and rotated.

pg_settings = {
    "dbname": os.environ.get("PGDATABASE", "neondb"),
    "user": os.environ.get("PGUSER", "neondb_owner"),
    "password": os.environ["PGPASSWORD"],  # KeyError when unset: fail loudly instead of guessing
    "host": os.environ.get(
        "PGHOST", "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech"
    ),
    "port": os.environ.get("PGPORT", "5432"),
    "sslmode": os.environ.get("PGSSLMODE", "require"),
}


# 1. Setting up a psycopg2 connection.
# Nothing else in this cell uses `conn`/`cur`; the upload below goes through the
# SQLAlchemy engine.

conn = psycopg2.connect(**pg_settings)
cur = conn.cursor()


# 2. Setting up a SQLalchemy connection engine
# User and password are URL-escaped so special characters cannot break the URL.
# DATABASE_URL contains the password: do not display or log it.

DATABASE_URL = (
    f"postgresql+psycopg2://{quote_plus(pg_settings['user'])}:{quote_plus(pg_settings['password'])}"
    f"@{pg_settings['host']}:{pg_settings['port']}/{pg_settings['dbname']}"
    f"?sslmode={pg_settings['sslmode']}"
)

engine = create_engine(DATABASE_URL)


# 3. Uploading data frame to SQL database
# if_exists='replace' drops and recreates the table on every run; the data frame
# index is not written.

df.to_sql(
    name='armandas_sat_results',
    con=engine,
    schema='nyc_schools',
    if_exists='replace',
    index=False
)

478

In [18]:
# Writing the final, cleaned data frame to cleaned_sat_results.csv (without the index column).

df.to_csv(path_or_buf='cleaned_sat_results.csv', index=False)