## 1. Setup and load Dataset

In [191]:
# Import necessary libraries
import os

import pandas as pd
from sqlalchemy import create_engine

In [192]:
# Read the raw SAT results export and preview the first rows
raw_csv_path = "sat-results.csv"
df = pd.read_csv(raw_csv_path)
df.head(5)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [193]:
# Summarize column dtypes and non-null counts of the raw frame
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


## 2. Clean Column Names

In [194]:
# Normalize headers to snake_case: trim, lowercase, spaces -> underscores,
# then drop every character that is not a letter, digit or underscore
normalized_cols = df.columns.str.strip().str.lower()
normalized_cols = normalized_cols.str.replace(" ", "_", regex=False)
df.columns = normalized_cols.str.replace("[^0-9a-zA-Z_]", "", regex=True)

In [195]:
# Preview the frame to confirm the standardized column names
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## 3. Drop Unnecessary Columns

In [196]:
# Remove columns that carry no analytical value
cols_to_drop = [
    "sat_critical_readng_avg_score",  # misspelled duplicate of the reading score
    "internal_school_id",             # redundant: DBN is the key
    "contact_extension",              # phone extension, many nulls, not useful
]
df = df.drop(columns=cols_to_drop)

## 4. Convert Data Types

In [197]:
# Parse the test-taker counts; non-numeric entries become missing values,
# and the nullable Int64 dtype keeps the remaining values as integers
test_takers = pd.to_numeric(df["num_of_sat_test_takers"], errors="coerce")
df["num_of_sat_test_takers"] = test_takers.astype("Int64")

In [198]:
# Parse the three section-score columns as numbers; anything that is not
# numeric is coerced to NaN
score_cols = [
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
]
for score_col in score_cols:
    df[score_col] = pd.to_numeric(df[score_col], errors="coerce")

## 5. Validate and Clean Scores

In [199]:
# A section score must lie in [200, 800]; blank out anything outside that range
for score_col in score_cols:
    section_scores = df[score_col]
    out_of_range = section_scores.lt(200) | section_scores.gt(800)
    df.loc[out_of_range, score_col] = None

In [200]:
# Parse percentage strings such as "78%" into floats.
# pd.to_numeric(errors="coerce") is used, as for the other numeric columns, so
# that "N/A" and any other non-numeric placeholder become NaN instead of making
# the conversion raise. Casting to str first keeps this cell re-runnable after
# the column has already been converted: missing values turn into the text
# "nan", which parses back to NaN.
pct_text = (
    df["pct_students_tested"]
    .astype(str)
    .str.replace("%", "", regex=False)
    .str.strip()
)
# The trailing astype(float) guarantees float64 even when every value is whole.
df["pct_students_tested"] = pd.to_numeric(pct_text, errors="coerce").astype(float)

In [201]:
# Performance tier (scale 1–4): parse as a number, then store as nullable
# integer so that missing ratings stay missing
tier_rating = pd.to_numeric(df["academic_tier_rating"], errors="coerce")
df["academic_tier_rating"] = tier_rating.astype("Int64")

## 6. Additional column

In [202]:
# Additional column: total of the three section averages.
# min_count requires all three section scores to be present; otherwise the
# total is NaN. With the pandas default (min_count=0) a school with no valid
# scores would get a total of 0 and a school with one missing section would
# get a partial sum, both of which look like real (very low) results.
df["sat_total_avg_score"] = df[score_cols].sum(
    axis=1, skipna=True, min_count=len(score_cols)
)

In [203]:

# Final check: resulting dtypes followed by the first rows
print(df.dtypes, df.head(), sep="\n")

dbn                                object
school_name                        object
num_of_sat_test_takers              Int64
sat_critical_reading_avg_score    float64
sat_math_avg_score                float64
sat_writing_avg_score             float64
pct_students_tested               float64
academic_tier_rating                Int64
sat_total_avg_score               float64
dtype: object
      dbn                                    school_name  \
0  01M292  HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES   
1  01M448            UNIVERSITY NEIGHBORHOOD HIGH SCHOOL   
2  01M450                     EAST SIDE COMMUNITY SCHOOL   
3  01M458                      FORSYTH SATELLITE ACADEMY   
4  01M509                        MARTA VALLE HIGH SCHOOL   

   num_of_sat_test_takers  sat_critical_reading_avg_score  sat_math_avg_score  \
0                      29                           355.0               404.0   
1                      91                           383.0               423.0   
2   

## 7. Save Cleaned CSV and Database Integration

In [204]:
# Write the cleaned frame to disk without the positional index
cleaned_csv_path = "cleaned_sat_results.csv"
df.to_csv(cleaned_csv_path, index=False)

In [205]:
# SQLAlchemy connection string format:
#   postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>?sslmode=require
#
# The URL is read from the DATABASE_URL environment variable so that
# credentials are never stored in the notebook or committed to version control.
# NOTE(review): a password was previously hardcoded in this cell; it should be
# treated as leaked and rotated.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Export the PostgreSQL connection string "
        "before running this cell."
    )

# Create engine (the connection itself is opened lazily on first use)
engine = create_engine(DATABASE_URL)

In [206]:
# Upload the cleaned frame; if_exists="replace" swaps out any existing table
target_table = "serhii_sotnichenko_sat_scores"
df.to_sql(name=target_table, con=engine, if_exists="replace", index=False)
print("Data uploaded")

Data uploaded


# Day 4 – NYC SAT Results Data Cleaning & Integration

## Cleaning Logic
- Standardized column names: lowercase, underscores instead of spaces, removed special characters.  
- Dropped unnecessary columns:
  - `sat_critical_readng_avg_score` (duplicate with typo)  
  - `internal_school_id` (DBN already unique identifier)  
  - `contact_extension` (phone extension with many nulls, not relevant)  
- Converted numeric columns:
  - `num_of_sat_test_takers` → integer (`Int64`)  
  - SAT score columns (`sat_critical_reading_avg_score`, `sat_math_avg_score`, `sat_writing_avg_score`) → float  
- Set invalid SAT scores (<200 or >800) to NaN.  
- Converted `pct_students_tested` from strings with "%" to float, handling "N/A" as NaN.  
- Converted `academic_tier_rating` to integer (`Int64`).  
- Added `sat_total_avg_score` as the sum of individual SAT score columns.

## Challenges
- Some columns contained non-numeric values like `"s"` or `"N/A"` which required coercion to numeric and replacement with NaN.  
- Handling duplicates and typos in column names to avoid conflicts in analysis.  
- Ensuring correct data types for integration with PostgreSQL (integer vs float).

## SQL Schema / Integration Notes
- Created table: `serhii_sotnichenko_sat_scores`  
```sql
CREATE TABLE serhii_sotnichenko_sat_scores (
    dbn TEXT PRIMARY KEY,
    school_name TEXT,
    num_of_sat_test_takers INT,
    sat_critical_reading_avg_score FLOAT,
    sat_math_avg_score FLOAT,
    sat_writing_avg_score FLOAT,
    sat_total_avg_score FLOAT,
    pct_students_tested FLOAT,
    academic_tier_rating INT
);
```

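- Data inserted using pandas `DataFrame.to_sql` (through a SQLAlchemy engine) with `if_exists="replace"`. Note that `replace` drops and recreates the table from the DataFrame's dtypes, so the `PRIMARY KEY` constraint shown above has to be added separately after the upload.  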
- DBN used as primary key for relational integrity with existing tables like `high_school_directory`, `school_safety_report` and `school_demographics`.  
- Scores cleaned and validated to ensure accurate analytics and reporting.