# 🧠 Day 4

## 🔌 Step 1: Import Libraries

In [36]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine



# Do not truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None


In [2]:
!pip install sqlalchemy psycopg2-binary


Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


## 🔐 Step 2: Connect to the Database

In [None]:
""" DB connection setup using hardcoded credentials (for onboarding only)
conn = psycopg2.connect(
    dbname="neondb",
    user="neondb_owner",
    password="npg_CeS9fJg2azZD",
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port="5432",
    sslmode="require"
)
cur = conn.cursor()"""

In [34]:
# Connection settings are read from the environment so that no secret lives in
# the notebook. Only the password is mandatory; the remaining values default to
# the shared onboarding database.
# SECURITY: the password that used to be hardcoded here should be rotated.
db_user = os.environ.get("NEON_DB_USER", "neondb_owner")
db_password = os.environ.get("NEON_DB_PASSWORD")
if not db_password:
    raise RuntimeError(
        "Set the NEON_DB_PASSWORD environment variable before running this cell."
    )
db_host = os.environ.get(
    "NEON_DB_HOST",
    "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
)
db_port = os.environ.get("NEON_DB_PORT", "5432")
db_name = os.environ.get("NEON_DB_NAME", "neondb")

# Format: 'postgresql+psycopg2://user:password@host:port/dbname'
# quote_plus() escapes characters such as '@', ':' or '/' in the credentials
# that would otherwise corrupt the URL.
db_url = (
    f"postgresql+psycopg2://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)

# sslmode is forwarded to the psycopg2 driver through connect_args.
engine = create_engine(db_url, connect_args={"sslmode": "require"})


In [12]:
# Load the raw SAT results export and preview its first rows.
df = pd.read_csv("sat-results.csv")
df.head(5)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [13]:
# List the raw column names exactly as they appear in the CSV header.
print(list(df.columns))

['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


In [14]:
# Two reading-score columns exist (one with a typo in its name); measure the
# share of rows in which they hold the same value.
reading_col = "SAT Critical Reading Avg. Score"
reading_col_typo = "SAT Critical Readng Avg. Score"

comparison = df[reading_col].eq(df[reading_col_typo])
match_ratio = comparison.sum() / len(comparison)
print(f"Match ratio: {match_ratio:.2%}")

Match ratio: 100.00%


In [15]:
# The typo column duplicates the correctly named one, and the two other
# columns are not meaningful for the analysis, so all three are removed.
columns_to_drop = [
    'SAT Critical Readng Avg. Score',
    'internal_school_id',
    'contact_extension',
]

# Normalise the remaining names: trim whitespace, lower-case, spaces -> underscores.
df = df.drop(columns=columns_to_drop).rename(
    columns=lambda name: name.strip().lower().replace(' ', '_')
)

In [16]:
# Count the rows per DBN, then keep only the DBNs that occur more than once.
dbn_counts = df['dbn'].value_counts()
duplicates = dbn_counts.loc[dbn_counts.gt(1)]
print(f"Number of duplicated DBNs: {duplicates.size}")

Number of duplicated DBNs: 10


In [17]:
# Select every row whose DBN was flagged as duplicated and show the rows
# grouped together by DBN.
is_duplicated_dbn = df['dbn'].isin(duplicates.index)
duplicate_rows = df.loc[is_duplicated_dbn]
display(duplicate_rows.sort_values(by='dbn'))

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg._score,sat_math_avg._score,sat_writing_avg._score,pct_students_tested,academic_tier_rating
35,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,78%,2.0
486,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,78%,2.0
52,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,92%,2.0
484,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,92%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,92%,2.0
99,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,78%,2.0
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,78%,2.0
487,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,78%,2.0
481,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,92%,
492,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,92%,


In [18]:
# Remove rows that are identical in every column and report how many were dropped.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"Dropped rows: {before - after}")

Dropped rows: 15


Only 10 DBNs were duplicated, yet 15 rows were dropped. This means some DBNs were duplicated more than once — i.e., some schools appeared three times instead of just twice.

So, in total, 15 fully identical rows were removed.

In [19]:
# Confirm the column names after the rename step.
print(list(df.columns))

['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg._score', 'sat_math_avg._score', 'sat_writing_avg._score', 'pct_students_tested', 'academic_tier_rating']


In [21]:
# The three SAT section-score columns inspected and cleaned below.
sat_columns = [
    'sat_critical_reading_avg._score',
    'sat_math_avg._score',
    'sat_writing_avg._score',
]

# Frequency table per column, to spot non-numeric placeholders.
for col in sat_columns:
    print(f"\nUnique values in {col}:", df[col].value_counts(), sep="\n")

# Number of true NaN values per column.
missing_per_column = df[sat_columns].isna().sum()
for col, num_nans in missing_per_column.items():
    print(f"{col}: {num_nans} missing (NaN) values")


Unique values in sat_critical_reading_avg._score:
sat_critical_reading_avg._score
s      57
367     8
384     8
370     8
398     8
       ..
472     1
437     1
466     1
635     1
428     1
Name: count, Length: 164, dtype: int64

Unique values in sat_math_avg._score:
sat_math_avg._score
s       57
385      9
381      8
371      8
364      8
        ..
1100     1
488      1
682      1
317      1
444      1
Name: count, Length: 177, dtype: int64

Unique values in sat_writing_avg._score:
sat_writing_avg._score
s      57
370     9
368     9
394     8
359     8
       ..
466     1
417     1
470     1
397     1
422     1
Name: count, Length: 163, dtype: int64
sat_critical_reading_avg._score: 0 missing (NaN) values
sat_math_avg._score: 0 missing (NaN) values
sat_writing_avg._score: 0 missing (NaN) values


In [22]:
print(f"Initial shape: {df.shape}")
df = df[~df[sat_columns].isin(['s']).any(axis=1)]
for col in sat_columns:
    df[col] = pd.to_numeric(df[col])
for col in sat_columns:
    min_val = df[col].min()
    max_val = df[col].max()
    print(f"{col}: min={min_val}, max={max_val}")

Initial shape: (478, 8)
sat_critical_reading_avg._score: min=279, max=679
sat_math_avg._score: min=-10, max=1100
sat_writing_avg._score: min=286, max=682


In [23]:
# Inclusive bounds a section score must satisfy to be kept.
SAT_SCORE_MIN, SAT_SCORE_MAX = 200, 800

# Keep only the rows in which every SAT column lies within the bounds.
for col in sat_columns:
    df = df[df[col].between(SAT_SCORE_MIN, SAT_SCORE_MAX)]

# Explicit copy: later cells assign columns into df, which would otherwise
# raise SettingWithCopyWarning because df is a filtered slice.
df = df.copy()
print(f"After dropping 's' rows and out of range scores: {df.shape}")

After dropping 's' rows and out of range scores: (416, 8)


In [26]:
# Frequency of each raw pct_students_tested value, with missing values included.
pct_tested_counts = df['pct_students_tested'].value_counts(dropna=False)
print(pct_tested_counts)

pct_students_tested
78%    111
85%    105
NaN    103
92%     97
Name: count, dtype: int64


In [24]:
def pct_to_float(x):
    """Convert a percentage string such as ``'78%'`` to a fraction (``0.78``).

    Parameters
    ----------
    x : object
        Raw cell value. Only strings ending in ``%`` (ignoring surrounding
        whitespace) are converted.

    Returns
    -------
    float
        The percentage divided by 100, or ``np.nan`` when ``x`` is not a
        string, does not end in ``%``, or its numeric part cannot be parsed.
    """
    if not isinstance(x, str):
        return np.nan

    # Trim padding first so that values like ' 85% ' are still recognised.
    text = x.strip()
    if not text.endswith('%'):
        return np.nan

    try:
        return float(text.strip('%')) / 100
    except ValueError:
        # Malformed numeric part (e.g. 'n/a%'): treat it as missing rather than crash.
        return np.nan

# Replace the percentage strings with fractions, element by element.
pct_tested_fraction = df['pct_students_tested'].map(pct_to_float)
df['pct_students_tested'] = pct_tested_fraction


In [25]:
# Summary statistics of the converted column, followed by its NaN count.
pct_tested = df['pct_students_tested']
print(pct_tested.describe())
print(pct_tested.isna().sum())


count    313.000000
mean       0.846869
std        0.057069
min        0.780000
25%        0.780000
50%        0.850000
75%        0.920000
max        0.920000
Name: pct_students_tested, dtype: float64
103


In [26]:
# Frequency of each academic tier rating, with missing values included.
tier_counts = df['academic_tier_rating'].value_counts(dropna=False)
print(tier_counts)

academic_tier_rating
4.0    98
2.0    89
3.0    84
1.0    78
NaN    67
Name: count, dtype: int64


In [27]:
# Parse the test-taker counts as numbers; entries that cannot be parsed become NaN.
test_takers_numeric = pd.to_numeric(df['num_of_sat_test_takers'], errors='coerce')
df['num_of_sat_test_takers'] = test_takers_numeric

# Report how many values are missing after the conversion.
missing_test_takers = test_takers_numeric.isna().sum()
print(f"Missing values in num_of_sat_test_takers after conversion: {missing_test_takers}")



Missing values in num_of_sat_test_takers after conversion: 0


In [28]:
# Cast the counts to integers; this would raise if any NaN were still present.
test_takers_int = df['num_of_sat_test_takers'].astype(int)
df['num_of_sat_test_takers'] = test_takers_int

In [35]:
# Final dtype of every column before export.
column_dtypes = df.dtypes
print(column_dtypes)

dbn                                 object
school_name                         object
num_of_sat_test_takers               int64
sat_critical_reading_avg._score      int64
sat_math_avg._score                  int64
sat_writing_avg._score               int64
pct_students_tested                float64
academic_tier_rating               float64
dtype: object


In [29]:
# Write the cleaned frame to disk without the index column.
cleaned_csv_path = "cleaned_sat_results.csv"
df.to_csv(cleaned_csv_path, index=False)
print("✅ Cleaned dataset saved as cleaned_sat_results.csv")


✅ Cleaned dataset saved as cleaned_sat_results.csv


In [38]:
# List every table outside PostgreSQL's own system schemas, ordered by schema
# and table name.
list_tables_query = """
    SELECT table_schema, table_name
    FROM information_schema.tables
    WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
    ORDER BY table_schema, table_name;
"""
tables_df = pd.read_sql(sql=list_tables_query, con=engine)

tables_df

Unnamed: 0,table_schema,table_name
0,dependency_example,departments
1,dependency_example,districts
2,dependency_example,employees
3,dependency_example,neighborhoods
4,nyc_schools,Levon_cleaned_sat_scores
5,nyc_schools,anastasia_sat_results
6,nyc_schools,clara_sat_results
7,nyc_schools,cleaned_sat_results_anna
8,nyc_schools,deepshikha_sat_results
9,nyc_schools,giovani_sat_results


In [37]:
# Explicit SQL type for every column of the cleaned frame.
sql_types = sqlalchemy.types
column_types = {
    **{name: sql_types.Text() for name in ("dbn", "school_name")},
    **{
        name: sql_types.Integer()
        for name in (
            "num_of_sat_test_takers",
            "sat_critical_reading_avg._score",
            "sat_math_avg._score",
            "sat_writing_avg._score",
        )
    },
    **{
        name: sql_types.Float()
        for name in ("pct_students_tested", "academic_tier_rating")
    },
}

# Upload the frame to nyc_schools.cleaned_sat_results_anna; an existing table
# of that name is replaced, so the cell can be re-run.
df.to_sql(
    name="cleaned_sat_results_anna",
    con=engine,
    schema="nyc_schools",
    if_exists="replace",
    index=False,
    dtype=column_types,
)

416