## 🧮 Day 4 – Data Integration & Schema Design


🎯 Objective
Learn how to evaluate, clean, and integrate a real-world dataset into an existing PostgreSQL schema. 

You'll inspect the dataset, identify relational keys, clean inconsistencies, and write a Python-based script to append the data into the database.

### Data Inspection & Cleaning

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
##Psycopg2 connector

In [3]:
# SQLAlchemy connection string format:
# postgresql+psycopg2://user:password@host:port/dbname

DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
)

# Create engine and establish connection
engine = create_engine(DATABASE_URL)

In [4]:
# Read the CSV file into a DataFrame
df = pd.read_csv('/Users/root/Onbording_Tasks/sat-results.csv')
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [5]:
# Display the columns of the DataFrame
df.columns

Index(['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers',
       'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score',
       'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score',
       'internal_school_id', 'contact_extension', 'pct_students_tested',
       'academic_tier_rating'],
      dtype='object')

In [6]:
# Clean the column names by removing special characters and converting to lowercase
df.columns = [
    re.sub(r'[^a-zA-Z0-9_]', '', col.lower().strip().replace(' ', '_'))
    for col in df.columns
]
df.columns

Index(['dbn', 'school_name', 'num_of_sat_test_takers',
       'sat_critical_reading_avg_score', 'sat_math_avg_score',
       'sat_writing_avg_score', 'sat_critical_readng_avg_score',
       'internal_school_id', 'contact_extension', 'pct_students_tested',
       'academic_tier_rating'],
      dtype='object')

In [7]:
# Display the DataFrame information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             493 non-null    object 
 1   school_name                     493 non-null    object 
 2   num_of_sat_test_takers          493 non-null    object 
 3   sat_critical_reading_avg_score  493 non-null    object 
 4   sat_math_avg_score              493 non-null    object 
 5   sat_writing_avg_score           493 non-null    object 
 6   sat_critical_readng_avg_score   493 non-null    object 
 7   internal_school_id              493 non-null    int64  
 8   contact_extension               388 non-null    object 
 9   pct_students_tested             376 non-null    object 
 10  academic_tier_rating            402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


In [8]:
# Display number of duplicate rows in the DataFrame
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 15


In [9]:
# Display duplicate rows in the DataFrame
df[df.duplicated]

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
478,14K685,EL PUENTE ACADEMY FOR PEACE AND JUSTICE,28,359,335,341,359,688392,x345,92%,4.0
479,13K605,GEORGE WESTINGHOUSE CAREER AND TECHNICAL EDUCA...,85,406,391,392,406,937579,x234,,
480,27Q480,JOHN ADAMS HIGH SCHOOL,403,391,409,392,391,863765,,92%,1.0
481,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,364,277389,x345,92%,
482,19K420,FRANKLIN K. LANE HIGH SCHOOL,s,s,s,s,s,134918,,78%,3.0
483,09X525,BRONX LEADERSHIP ACADEMY HIGH SCHOOL,114,384,394,388,384,906925,x234,92%,1.0
484,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
485,17K543,"SCIENCE, TECHNOLOGY AND RESEARCH EARLY COLLEGE...",69,457,462,441,457,999398,x345,85%,
486,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,390,166135,x123,78%,2.0
487,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0


In [10]:
# Drop duplicate rows from the DataFrame
df = df.drop_duplicates()
df

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0
...,...,...,...,...,...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s,s,733698,x234,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s,s,976034,x345,,4.0
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s,s,556924,x123,85%,1.0
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426,496,544514,x234,92%,2.0


In [11]:
# Drop unnecessary columns from the DataFrame
df.drop(['sat_critical_readng_avg_score','contact_extension', 'internal_school_id'], axis=1, inplace=True)

In [12]:
# Convert all SAT columns to numeric
cols = ['sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score','num_of_sat_test_takers']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [13]:
# Investigate the sat score columns for invalid values not in the range 200-800
df[
    ~df['sat_critical_reading_avg_score'].between(200, 800) |
    ~df['sat_math_avg_score'].between(200, 800) |
    ~df['sat_writing_avg_score'].between(200, 800)
]

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
22,02M392,MANHATTAN BUSINESS ACADEMY,,,,,85%,
23,02M393,BUSINESS OF SPORTS SCHOOL,,,,,,2.0
25,02M399,THE HIGH SCHOOL FOR LANGUAGE AND DIPLOMACY,,,,,85%,
38,02M427,MANHATTAN ACADEMY FOR ARTS & LANGUAGE,,,,,,4.0
40,02M437,HUDSON HIGH SCHOOL OF LEARNING TECHNOLOGIES,,,,,,2.0
...,...,...,...,...,...,...,...,...
470,75Q256,P.S. Q256,,,,,85%,1.0
472,75R025,SOUTH RICHMOND HIGH SCHOOL I.S./P.S. 25,,,,,78%,1.0
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,,,,,92%,1.0
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,,,,,,4.0


In [14]:
# Clipping the SAT math average score to valid range as negative values or values above 800 are not valid
df['sat_math_avg_score'] = df['sat_math_avg_score'].clip(lower=0)
df['sat_math_avg_score'] = df['sat_math_avg_score'].clip(upper=800)

In [15]:
# Clean the 'pct_students_tested' column by removing '%' character and converting to float
df['pct_students_tested'] = (
    df['pct_students_tested']
    .str.replace('%', '', regex=False)   # remove '%' character
    .replace(['N/A', 'NA', '', None], np.nan)  # convert invalids to NaN
    .astype(float)                       # convert to float
)

In [16]:
# Fill NaN values in df with 0
df = df.fillna(0)

In [17]:
# Display basic statistics of the DataFrame
df.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
count,478.0,478.0,478.0,478.0,478.0,478.0
mean,97.165272,353.050209,366.763598,347.004184,64.242678,2.115063
std,150.270071,140.542605,153.374853,139.155359,36.532971,1.423575
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,358.25,362.25,351.0,78.0,1.0
50%,56.5,384.0,387.5,376.0,78.0,2.0
75%,89.0,411.75,432.0,403.0,85.0,3.0
max,1277.0,679.0,800.0,682.0,92.0,4.0


In [18]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('/Users/root/Onbording_Tasks/cleaned_sat-results.csv', index=False)

### Design the Schema

In [19]:
# Display the DataFrame information to understand the data types 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          478 non-null    float64
 3   sat_critical_reading_avg_score  478 non-null    float64
 4   sat_math_avg_score              478 non-null    float64
 5   sat_writing_avg_score           478 non-null    float64
 6   pct_students_tested             478 non-null    float64
 7   academic_tier_rating            478 non-null    float64
dtypes: float64(6), object(2)
memory usage: 33.6+ KB


In [None]:
# SQL Query to create the table in DBeaver (optional)
"""
CREATE TABLE sat_scores (
    dbn VARCHAR(20) PRIMARY KEY,
    school_name VARCHAR(255),
    num_of_sat_test_takers FLOAT,
    sat_critical_reading_avg_score FLOAT,
    sat_math_avg_score FLOAT,
    sat_writing_avg_score FLOAT,
    pct_students_tested FLOAT,
    academic_tier_rating FLOAT
);
"""

'\nCREATE TABLE sat_scores (\n    dbn VARCHAR(20) PRIMARY KEY,\n    school_name VARCHAR(255),\n    num_of_sat_test_takers FLOAT,\n    sat_critical_reading_avg_score FLOAT,\n    sat_math_avg_score FLOAT,\n    sat_writing_avg_score FLOAT,\n    pct_students_tested FLOAT,\n    academic_tier_rating FLOAT\n);\n'

### Upload Data to DBeaver

### Code to upload data to engine

In [21]:
# Save the DataFrame to the PostgreSQL database
df.to_sql(
    name='jyoti_sat_results',
    con=engine,     
    schema='nyc_schools',
    if_exists='replace',    
    index=False            
)

478