In [13]:
# Imports
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

In [14]:
# Create the database connection and inspect what is available on the server.
#
# The connection URL embeds the database user and password, so it is read from
# the DATABASE_URL environment variable instead of being written in the notebook.
# Expected form: postgresql://<user>:<password>@<host>:5432/<database>?sslmode=require
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed to everyone with access to the notebook and should be rotated.
database_url = os.environ.get("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable to the PostgreSQL "
        "connection URL before running this cell."
    )
engine = create_engine(database_url)

# Inspector gives read-only access to schema / table metadata
inspector = inspect(engine)

# List schemas (optional)
schemas = inspector.get_schema_names()
print("Schemas:", schemas)

# List tables in a specific schema (e.g., "nyc_schools")
tables = inspector.get_table_names(schema="nyc_schools")
print("Tables in 'nyc_schools':", tables)


Schemas: ['dependency_example', 'information_schema', 'nyc_schools', 'public', 'test_berlin_data']
Tables in 'nyc_schools': ['school_safety_report', 'high_school_directory', 'sat_scores', 'school_demographics', 'anastasia_sat_results']


In [15]:
# Load the raw SAT results export.
# NOTE(review): absolute path — presumably the Colab upload directory; adjust
# it when running the notebook anywhere else.
data = pd.read_csv('/content/sat-results.csv')

# First look at the raw data: sample rows, dtypes / null counts, duplicate rows.
display(data.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
data.info()
# print() shows the message as plain text; display() would show the string's
# repr, i.e. wrapped in quotes.
print(f"Number of duplicates: {data.duplicated().sum()}")

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


None

'Number of duplicates: 15'

In [16]:
# Normalise column names to snake_case: trim, lowercase, spaces -> "_", drop ".".
# regex=False makes every replacement an explicit literal match. Interpreted as
# a regular expression "." matches any character, and the default value of
# `regex` differs between pandas versions, so it is not left implicit.
data.columns = (
    data.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "", regex=False)
)

# Drop exact duplicate rows, then the misspelled "readng" column that repeats
# the critical-reading score.
data = data.drop_duplicates()
data = data.drop(columns="sat_critical_readng_avg_score")

# Convert the count / score columns to numeric.
# errors="coerce" turns any value that cannot be parsed as a number into NaN.
numeric_col = [
    'num_of_sat_test_takers',
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]
data[numeric_col] = data[numeric_col].apply(pd.to_numeric, errors='coerce')

# Remove the % sign from the column and store the percentage as a float
data["pct_students_tested"] = (
    data['pct_students_tested']
    .str.replace('%', '', regex=False)
    .astype(float)
)

display(data.describe())
# info() prints its own report and returns None; display(data.info()) would
# render an extra "None" below it.
data.info()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,pct_students_tested,academic_tier_rating
count,421.0,421.0,421.0,421.0,478.0,363.0,392.0
mean,110.320665,400.850356,418.173397,393.985748,560082.717573,84.595041,2.579082
std,155.534254,56.802783,88.210494,58.635109,259637.064755,5.673305,1.128053
min,6.0,279.0,-10.0,286.0,101855.0,78.0,1.0
25%,41.0,368.0,372.0,360.0,337012.5,78.0,2.0
50%,62.0,391.0,395.0,381.0,581301.5,85.0,3.0
75%,95.0,416.0,438.0,411.0,778312.75,92.0,4.0
max,1277.0,679.0,1100.0,682.0,999398.0,92.0,4.0


<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          421 non-null    float64
 3   sat_critical_reading_avg_score  421 non-null    float64
 4   sat_math_avg_score              421 non-null    float64
 5   sat_writing_avg_score           421 non-null    float64
 6   internal_school_id              478 non-null    int64  
 7   contact_extension               378 non-null    object 
 8   pct_students_tested             363 non-null    float64
 9   academic_tier_rating            392 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 41.1+ KB


None

In [17]:
# Valid range of an SAT section score (SAT is in range 200-800)
SAT_MIN_SCORE = 200
SAT_MAX_SCORE = 800

# Check for outliers: math scores below / above the valid range.
# (Despite its name, negativ_score holds every row below the minimum, not only
# negative values.)
negativ_score = data[data['sat_math_avg_score'] < SAT_MIN_SCORE]
positiv_score = data[data['sat_math_avg_score'] > SAT_MAX_SCORE]
display(negativ_score)
display(positiv_score)

# Deal with outliers.
# A comparison with NaN is False, so besides the below-range rows this filter
# also removes every school whose math score is missing.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below does not raise SettingWithCopyWarning.
data = data[data["sat_math_avg_score"] >= SAT_MIN_SCORE].copy()
# NOTE(review): scores above the maximum are capped rather than dropped, which
# turns values such as 999 or 1100 into a perfect 800 — confirm this is the
# intended treatment and that those values are not "missing" sentinels.
data['sat_math_avg_score'] = data["sat_math_avg_score"].clip(upper=SAT_MAX_SCORE)
data = data.reset_index(drop=True)

# Drop columns that are not useful for the analysis
data = data.drop(columns=['contact_extension', 'internal_school_id'])

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
181,10X225,THEATRE ARTS PRODUCTION COMPANY SCHOOL,59.0,405.0,-10.0,394.0,827176,x345,78.0,


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
80,03M415,WADLEIGH SECONDARY SCHOOL FOR THE PERFORMING &...,32.0,371.0,850.0,370.0,365679,,78.0,4.0
288,15K656,BROOKLYN HIGH SCHOOL OF THE ARTS,141.0,426.0,999.0,411.0,642190,,,
422,28Q470,JAMAICA HIGH SCHOOL,90.0,342.0,999.0,353.0,587904,x234,92.0,3.0
434,29Q283,PREPARATORY ACADEMY FOR WRITERS: A COLLEGE BOA...,43.0,370.0,1100.0,363.0,462173,x345,85.0,3.0


In [18]:
# Re-check the cleaned data: summary statistics, dtypes / null counts, first rows.
display(data.describe())
# info() prints its own report and returns None, so it is not passed to
# display() (that would render an extra "None").
data.info()
display(data.head())

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
count,420.0,420.0,420.0,420.0,316.0,352.0
mean,110.442857,400.840476,417.411905,393.985714,84.689873,2.585227
std,155.699512,56.870164,74.756051,58.705037,5.707029,1.118917
min,6.0,279.0,312.0,286.0,78.0,1.0
25%,41.0,368.0,372.0,360.0,78.0,2.0
50%,62.0,391.0,395.5,381.0,85.0,3.0
75%,95.5,416.25,438.5,411.0,92.0,4.0
max,1277.0,679.0,800.0,682.0,92.0,4.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             420 non-null    object 
 1   school_name                     420 non-null    object 
 2   num_of_sat_test_takers          420 non-null    float64
 3   sat_critical_reading_avg_score  420 non-null    float64
 4   sat_math_avg_score              420 non-null    float64
 5   sat_writing_avg_score           420 non-null    float64
 6   pct_students_tested             316 non-null    float64
 7   academic_tier_rating            352 non-null    float64
dtypes: float64(6), object(2)
memory usage: 26.4+ KB


None

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0,2.0


In [None]:
# Write the cleaned frame to the database as nyc_schools.sat_scores_mariia.
# if_exists="replace" replaces a table of that name if one already exists;
# index=False keeps the DataFrame index out of the table.
data.to_sql(
    "sat_scores_mariia",
    engine,
    schema="nyc_schools",
    if_exists="replace",
    index=False,
)

# Save the same cleaned frame as a CSV file (again without the index column)
data.to_csv("cleaned_sat_results.csv", index=False)