In [None]:
import os
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine

In [2]:
# read csv file
df_sat = pd.read_csv(os.path.join("day_4_datasets", "sat-results.csv"))
df_sat.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## 📋 1. Overview of Dataset

This code provides a high-level assessment of the SAT dataset (`df_sat`) to understand its structure, completeness, and basic statistics.  

**Key observations:**

- **Dataset Shape:** 493 rows × 11 columns  
- **Column Names and Data Types:**  
  - Most SAT score and test taker columns are `object` type (need conversion to numeric)  
  - `academic_tier_rating` is numeric (`float64`)  
- **Missing Values:**  
  - `contact_extension`: 105 missing  
  - `pct_students_tested`: 117 missing  
  - `academic_tier_rating`: 91 missing  
- **Unique Identifiers:**  
  - DBN: 478 unique values (some duplicates exist)  
  - Total rows: 493  

✅ *Result:* This overview highlights columns requiring type conversion, handling of missing data, and attention to duplicate DBNs before analysis.


In [3]:
print("=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)
print(f"Shape: {df_sat.shape}")
print(f"\nColumn Names:\n{df_sat.columns.tolist()}")
print(f"\nData Types:\n{df_sat.dtypes}")


DATASET OVERVIEW
Shape: (493, 11)

Column Names:
['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']

Data Types:
DBN                                 object
SCHOOL NAME                         object
Num of SAT Test Takers              object
SAT Critical Reading Avg. Score     object
SAT Math Avg. Score                 object
SAT Writing Avg. Score              object
SAT Critical Readng Avg. Score      object
internal_school_id                   int64
contact_extension                   object
pct_students_tested                 object
academic_tier_rating               float64
dtype: object


In [4]:
print(f"\nBasic Statistics:\n{df_sat.describe()}")
print(f"\nMissing Values:\n{df_sat.isnull().sum()}")
print(f"\nUnique values in key columns:")
print(f"Unique DBNs: {df_sat['DBN'].nunique()}")
print(f"Total rows: {len(df_sat)}")


Basic Statistics:
       internal_school_id  academic_tier_rating
count          493.000000            402.000000
mean        562172.943205              2.564677
std         262138.627055              1.126443
min         101855.000000              1.000000
25%         332013.000000              2.000000
50%         587220.000000              3.000000
75%         782993.000000              4.000000
max         999398.000000              4.000000

Missing Values:
DBN                                  0
SCHOOL NAME                          0
Num of SAT Test Takers               0
SAT Critical Reading Avg. Score      0
SAT Math Avg. Score                  0
SAT Writing Avg. Score               0
SAT Critical Readng Avg. Score       0
internal_school_id                   0
contact_extension                  105
pct_students_tested                117
academic_tier_rating                91
dtype: int64

Unique values in key columns:
Unique DBNs: 478
Total rows: 493


## 2. 📊 Data Assessment and Summary

This section performs a systematic review of each key column in the SAT dataset (`df_sat`) to identify data quality issues and understand the dataset structure.  

**Key assessments and findings:**

1. **DBN (Primary Key)**  
   - Total unique DBNs: 478  
   - Duplicate DBNs: 15  
   - Missing DBNs: 0  
   *→ Indicates some duplicates exist and may need handling.*

2. **SAT Score Columns** (`Critical Reading`, `Math`, `Writing`)  
   - Data type: `object` (needs conversion to numeric)  
   - 's' (suppressed) values present: 58 in each column  
   - Sample scores show a wide range of values  
   *→ Numeric conversion and handling of suppressed data required.*

3. **Percentage of Students Tested (`pct_students_tested`)**  
   - Data type: `object` with `%` symbols  
   - Some missing values (`NaN`)  
   *→ Needs cleaning and numeric conversion.*

4. **Academic Tier Rating (`academic_tier_rating`)**  
   - Unique values: 1, 2, 3, 4, with 91 missing values  
   *→ Missing data may require imputation or exclusion.*

5. **Duplicate Column Check**  
   - `'SAT Critical Reading Avg. Score'` and `'SAT Critical Readng Avg. Score'` are identical  
   *→ Confirms presence of a duplicate column that can be safely dropped.*

✅ *Result:* This assessment highlights key areas for cleaning: type conversion, missing value handling, duplicate removal, and preparing numeric SAT scores for analysis.


In [5]:
# Column-by-Column Assessment
print("Starting column-by-column assessment...")

# 1. DBN (Primary Key)
dbn_unique = df_sat['DBN'].nunique()
dbn_dupes = df_sat['DBN'].duplicated().sum()
dbn_nulls = df_sat['DBN'].isnull().sum()
print(f"\n1. DBN — Unique: {dbn_unique}, Duplicates: {dbn_dupes}, Nulls: {dbn_nulls}")

# 2. SAT Scores
sat_columns = [
    'SAT Critical Reading Avg. Score',
    'SAT Math Avg. Score',
    'SAT Writing Avg. Score'
]

print("\n2. SAT Score Columns Overview:")
for col in sat_columns:
    print(f" - {col}: dtype={df_sat[col].dtype}, contains 's'={(df_sat[col] == 's').sum()}")
    print(f"   Sample values: {df_sat[col].unique()[:10]}")

# 3. Percentage of Students Tested
print("\n3. pct_students_tested Overview:")
print(f" - dtype={df_sat['pct_students_tested'].dtype}")
print(f"   Sample values: {df_sat['pct_students_tested'].unique()[:10]}")

# 4. Academic Tier Rating
print("\n4. academic_tier_rating Overview:")
print(f" - Unique values: {df_sat['academic_tier_rating'].unique()}")
print(f" - Null count: {df_sat['academic_tier_rating'].isnull().sum()}")

# 5. Duplicate Column Check
col1 = 'SAT Critical Reading Avg. Score'
col2 = 'SAT Critical Readng Avg. Score'
if col2 in df_sat.columns:
    same_values = (df_sat[col1] == df_sat[col2]).all()
    print(f"\n5. Duplicate Column Check: {col1} == {col2} → {same_values}")
else:
    print(f"\n5. Duplicate Column Check: {col2} not found in dataset")


Starting column-by-column assessment...

1. DBN — Unique: 478, Duplicates: 15, Nulls: 0

2. SAT Score Columns Overview:
 - SAT Critical Reading Avg. Score: dtype=object, contains 's'=58
   Sample values: ['355' '383' '377' '414' '390' '332' '522' '417' '624' '395']
 - SAT Math Avg. Score: dtype=object, contains 's'=58
   Sample values: ['404' '423' '402' '401' '433' '557' '574' '418' '604' '400']
 - SAT Writing Avg. Score: dtype=object, contains 's'=58
   Sample values: ['363' '366' '370' '359' '384' '316' '525' '411' '628' '387']

3. pct_students_tested Overview:
 - dtype=object
   Sample values: ['78%' nan '92%' '85%']

4. academic_tier_rating Overview:
 - Unique values: [ 2.  3.  4. nan  1.]
 - Null count: 91

5. Duplicate Column Check: SAT Critical Reading Avg. Score == SAT Critical Readng Avg. Score → True


### 2.1 🧹 Data Cleaning 

This code performs a structured cleaning process on the **SAT dataset (`df_sat`)** to prepare it for analysis.  
Key cleaning tasks include:

- **Standardized column names** → lowercased, underscores instead of spaces, and punctuation removed.  
- **Removed duplicate/irrelevant columns** → such as typo columns and administrative fields.  
- **Handled suppressed data (‘s’ values)** → replaced with `NaN` and converted to numeric types.  
- **Validated SAT score ranges** → ensured all scores fall within 200–800.  
- **Cleaned percentage fields** → stripped `%` symbols and converted to numbers.  
- **Fixed test-taker counts** → converted to numeric and replaced invalid entries.  
- **Removed duplicate schools** → kept only the first record per `dbn`.  
- **Dropped rows with missing school identifiers** → ensured each record represents a valid school.  

✅ *Result:* A clean, consistent dataset (`df_clean`) ready for further exploration and analysis.


In [6]:

# Create a copy for cleaning
df_clean = df_sat.copy()

print("Starting data cleaning...")

# 1. Standardize column names
df_clean.columns = (
    df_clean.columns
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('.', '', regex=False)
)

# 2. Drop duplicate/typo column if it exists
if 'sat_critical_readng_avg_score' in df_clean.columns:
    df_clean.drop('sat_critical_readng_avg_score', axis=1, inplace=True)

# 3. Drop unnecessary columns
df_clean.drop(['internal_school_id', 'contact_extension'], axis=1, inplace=True)

# 4. Handle 's' values in SAT scores
sat_cols = ['sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score']
for col in sat_cols:
    df_clean[col] = (
        pd.to_numeric(df_clean[col].replace('s', np.nan), errors='coerce')
    )

# 5. Validate SAT score ranges (200–800)
for col in sat_cols:
    invalid = (df_clean[col] < 200) | (df_clean[col] > 800)
    df_clean.loc[invalid, col] = np.nan

# 6. Clean pct_students_tested
df_clean['pct_students_tested'] = (
    df_clean['pct_students_tested']
    .astype(str)
    .str.replace('%', '')
    .str.strip()
)
df_clean['pct_students_tested'] = pd.to_numeric(df_clean['pct_students_tested'], errors='coerce')

# 7. Clean num_of_sat_test_takers
df_clean['num_of_sat_test_takers'] = pd.to_numeric(
    df_clean['num_of_sat_test_takers'].replace('s', np.nan),
    errors='coerce'
)

# 8. Drop duplicate DBNs (keep first)
df_clean.drop_duplicates(subset='dbn', keep='first', inplace=True)

# 9. Remove rows with missing DBN
df_clean = df_clean[df_clean['dbn'].notna()]

print(f"Cleaning complete. Final shape: {df_clean.shape}")


Starting data cleaning...
Cleaning complete. Final shape: (478, 8)


In [18]:
df_clean.head()

print(df_clean.info())

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          421 non-null    float64
 3   sat_critical_reading_avg_score  421 non-null    float64
 4   sat_math_avg_score              416 non-null    float64
 5   sat_writing_avg_score           421 non-null    float64
 6   pct_students_tested             363 non-null    float64
 7   academic_tier_rating            392 non-null    float64
dtypes: float64(6), object(2)
memory usage: 33.6+ KB
None


## 3. 🏗️ Design the Schema

This code selects and organizes the final set of columns to create a clean dataset (`df_final`) ready for analysis or upload.  

**Key steps and results:**

- **Columns selected:** 8 essential fields including `dbn` (primary key), school name, number of SAT test takers, SAT scores (reading, math, writing), percentage of students tested, and academic tier rating.  
- **Final dataset shape:** 478 rows × 8 columns  
- **Sample data:** Shows numeric SAT scores, some missing values in `pct_students_tested`, and tier ratings.  
- **Purpose:** Ensures only relevant, cleaned, and standardized data is kept for downstream analysis or database upload.  

✅ *Result:* A compact, structured dataset with cleaned SAT data and school identifiers, ready for further use.


In [7]:
# Define final columns to keep
final_columns = [
    'dbn',                              # Primary/Foreign key
    'school_name',                      # School name
    'num_of_sat_test_takers',           # Number of test takers
    'sat_critical_reading_avg_score',   # Reading score
    'sat_math_avg_score',               # Math score
    'sat_writing_avg_score',            # Writing score
    'pct_students_tested',              # Percentage tested
    'academic_tier_rating'              # Performance tier
]

# Keep only these columns
df_final = df_clean[final_columns].copy()

# Display final dataset overview
print("Final dataset ready for upload:")
print(f"Shape: {df_final.shape}")
print(f"Columns: {df_final.columns.tolist()}")

Final dataset ready for upload:
Shape: (478, 8)
Columns: ['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score', 'pct_students_tested', 'academic_tier_rating']


In [8]:
print("\nSample data:")
print(df_final.head())



Sample data:
      dbn                                    school_name  \
0  01M292  HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES   
1  01M448            UNIVERSITY NEIGHBORHOOD HIGH SCHOOL   
2  01M450                     EAST SIDE COMMUNITY SCHOOL   
3  01M458                      FORSYTH SATELLITE ACADEMY   
4  01M509                        MARTA VALLE HIGH SCHOOL   

   num_of_sat_test_takers  sat_critical_reading_avg_score  sat_math_avg_score  \
0                    29.0                           355.0               404.0   
1                    91.0                           383.0               423.0   
2                    70.0                           377.0               402.0   
3                     7.0                           414.0               401.0   
4                    44.0                           390.0               433.0   

   sat_writing_avg_score  pct_students_tested  academic_tier_rating  
0                  363.0                 78.0                   2.0 

In [9]:
# drop rows where any of the SAT score columns are missing
df_final.dropna(subset=['sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score'], inplace=True)

## 4. 🏗️ Create Database Table

This step creates a PostgreSQL table to store the cleaned SAT dataset (`df_final`).  

**Key actions:**

- **Connect to the database:** Establishes a secure connection to the Neon PostgreSQL instance.  
- **Define table schema:**  
  - `dbn` as primary key  
  - `school_name`, `num_of_sat_test_takers`, and SAT scores as numeric fields  
  - `pct_students_tested` and `academic_tier_rating` as decimal fields  
  - `created_at` timestamp for record creation  
  - Foreign key on `dbn` referencing `high_school_directory`  
- **Execute table creation:** Uses SQL `CREATE TABLE IF NOT EXISTS` to safely create the table only if it does not exist.  
- **Error handling:** Commits the table if successful, rolls back if an error occurs, and ensures the connection is closed.  

✅ *Result:* A structured database table ready to receive cleaned SAT data for analysis or reporting.


In [11]:
# Connect to PostgreSQL database
conn = psycopg2.connect(
    dbname="neondb",
    user="neondb_owner",
    password="a9Am7Yy5r9_T7h4OF2GN",
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port="5432",
    sslmode="require"
)

# SQL query to create the SAT scores table
create_table_query = """
CREATE TABLE IF NOT EXISTS nuzhat_amna_sat_scores (
    dbn VARCHAR(10) PRIMARY KEY,
    school_name VARCHAR(255),
    num_of_sat_test_takers INTEGER,
    sat_critical_reading_avg_score INTEGER,
    sat_math_avg_score INTEGER,
    sat_writing_avg_score INTEGER,
    pct_students_tested DECIMAL(5,2),
    academic_tier_rating DECIMAL(3,1),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (dbn) REFERENCES nyc_schools.high_school_directory(dbn)
);
"""

# Execute table creation
try:
    with conn.cursor() as cur:
        cur.execute(create_table_query)
        conn.commit()
        print("✅ Table created successfully!")
except Exception as e:
    print(f"❌ Error creating table: {e}")
    conn.rollback()
finally:
    conn.close()


✅ Table created successfully!


### 4.1 🚀 Insert Data into Database

This step inserts the cleaned SAT dataset (`df_final`) into the PostgreSQL table `nuzhat_amna_sat_scores`.  

**Key actions:**

- Uses **parameterized queries** to safely insert data and prevent SQL injection.  
- Converts `NaN` values to SQL `NULL` for proper storage.  
- Handles **conflicts on `dbn`** by updating existing records with new values.  
- Tracks successful inserts and errors, providing a summary at the end.  
- Ensures the **database connection is closed** after insertion.  

✅ *Result:* All cleaned SAT data is uploaded to the database, ready for queries and analysis.


In [None]:
# Step 1: Reconnect to database
conn = psycopg2.connect(
    dbname="neondb",
    user="neondb_owner",
    password="a9Am7Yy5r9_T7h4OF2GN",
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port="5432",
    sslmode="require"
)

# Step 2: Get list of existing DBNs from nyc_schools.high_school_directory
print("Fetching existing DBNs from nyc_schools.high_school_directory...")
existing_dbns_query = "SELECT dbn FROM nyc_schools.high_school_directory"
existing_dbns = pd.read_sql(existing_dbns_query, conn)['dbn'].tolist()

print(f"Found {len(existing_dbns)} schools in nyc_schools.high_school_directory")

# Step 3: Filter df_clean to only include matching DBNs
print("\nFiltering SAT data to match existing schools...")
df_final = df_clean[df_clean['dbn'].isin(existing_dbns)].copy()

# Step 6: Insert the filtered data using pandas to_sql()
engine = create_engine(
    'postgresql://neondb_owner:a9Am7Yy5r9_T7h4OF2GN@'
    'ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb'
)

try:
    print("Inserting data into nuzhat_amna_sat_scores...")
    
    df_final.to_sql(
        name='nuzhat_amna_sat_scores',
        con=engine,
        if_exists='append',  # Use 'replace' if you want to drop/recreate table
        index=False,
        method='multi',
        chunksize=100
    )
    
    print(f"✅ Successfully inserted {len(df_final)} rows!")
    
except Exception as e:
    print(f"❌ Error during insertion: {e}")
    raise

finally:
    engine.dispose()
    conn.close()
    print("✅ Database connection closed")

Fetching existing DBNs from nyc_schools.high_school_directory...


  existing_dbns = pd.read_sql(existing_dbns_query, conn)['dbn'].tolist()


Found 435 schools in nyc_schools.high_school_directory

Filtering SAT data to match existing schools...
Inserting data into nuzhat_amna_sat_scores...
✅ Successfully inserted 369 rows!
✅ Database connection closed


# Test query: SAT scores by borough

In [16]:
# Test query: SAT scores by borough
test_query = """
SELECT 
    hsd.borough,
    COUNT(*) as num_schools,
    ROUND(AVG(sat.sat_math_avg_score), 1) as avg_math_score,
    ROUND(AVG(sat.sat_critical_reading_avg_score), 1) as avg_reading_score
FROM nuzhat_amna_sat_scores sat
INNER JOIN nyc_schools.high_school_directory hsd ON sat.dbn = hsd.dbn
GROUP BY hsd.borough
ORDER BY avg_math_score DESC;
"""

result = pd.read_sql(test_query, engine)
print("\nSAT Scores by Borough:")
print(result)


SAT Scores by Borough:
         borough  num_schools  avg_math_score  avg_reading_score
0  Staten Island           10           472.5              457.5
1         Queens           65           452.1              423.9
2      Manhattan           91           442.7              425.4
3       Brooklyn          108           403.2              390.4
4          Bronx           95           391.9              382.5


In [19]:
# save 
df_final.to_csv("cleaned_sat_results.csv", index=False)


In [22]:
# load df_loaded = pd.read_csv("cleaned_sat_results.csv")
df_loaded = pd.read_csv("cleaned_sat_results.csv")
print(df_loaded.head())
print(df_loaded.shape)

      dbn                                    school_name  \
0  01M292  HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES   
1  01M448            UNIVERSITY NEIGHBORHOOD HIGH SCHOOL   
2  01M450                     EAST SIDE COMMUNITY SCHOOL   
3  01M458                      FORSYTH SATELLITE ACADEMY   
4  01M509                        MARTA VALLE HIGH SCHOOL   

   num_of_sat_test_takers  sat_critical_reading_avg_score  sat_math_avg_score  \
0                    29.0                           355.0               404.0   
1                    91.0                           383.0               423.0   
2                    70.0                           377.0               402.0   
3                     7.0                           414.0               401.0   
4                    44.0                           390.0               433.0   

   sat_writing_avg_score  pct_students_tested  academic_tier_rating  
0                  363.0                 78.0                   2.0  
1           