# Onboarding Day 4 - Data Cleaning & Exporting

In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")

## Data Connection & Loading

In [2]:
# Database URL format: dialect+driver://username:password@host:port/dbname
DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:a9Am7Yy5r9_T7h4OF2GN@"                  # database postgresql with psycopg2 driver + username and password + @ (separate host and port)
    "ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"    # host + port + /database name
    "?sslmode=require"                                                          # SSL mode
)

# Create SQLAlchemy engine
engine = create_engine(DATABASE_URL)

In [3]:
df = pd.read_csv('day_4_datasets/sat-results.csv')

## Data Assessment 

First we will have a look at the data to assess the quality and identify areas for cleaning. We will have a look at:

* The shape of the data

* Column names and values

* Data types

* Missing values

* Duplicates (rows & columns)

In [4]:
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


### Shape of the data

In [5]:
print("Rows and columns:", df.shape)
print("Column names:", df.columns.tolist())

Rows and columns: (493, 11)
Column names: ['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


🧹 **Rows & Columns:**

* Column names need to be normalized

* There is a duplicate column: `SAT Critical Reading Avg. Score` & `SAT Critical Readng Avg. Score`

In [6]:
# Identify columns related to reading scores
[col for col in df.columns if "reading" in col.lower()]

['SAT Critical Reading Avg. Score']

In [7]:
# Check if columns are identical
df[["SAT Critical Reading Avg. Score", "SAT Critical Readng Avg. Score"]].info()
df[["SAT Critical Reading Avg. Score", "SAT Critical Readng Avg. Score"]].head()
print("Number of identical values in both columns:", (df["SAT Critical Reading Avg. Score"] == df["SAT Critical Readng Avg. Score"]).sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   SAT Critical Reading Avg. Score  493 non-null    object
 1   SAT Critical Readng Avg. Score   493 non-null    object
dtypes: object(2)
memory usage: 7.8+ KB
Number of identical values in both columns: 493


🧹 **Duplicate Column**: We can safely drop the duplicate column: `SAT Critical Readng Avg. Score`

### Value Assesment

In [8]:
cat_cols = df.select_dtypes(include=["object", "string"]).columns
for col in cat_cols:
    print(f"\n--- {col} ---")
    print(df[col].nunique(), "unique values")
    print(df[col].unique()[:10])  # show sample


--- DBN ---
478 unique values
['01M292' '01M448' '01M450' '01M458' '01M509' '01M515' '01M539' '01M650'
 '01M696' '02M047']

--- SCHOOL NAME ---
478 unique values
['HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES'
 'UNIVERSITY NEIGHBORHOOD HIGH SCHOOL' 'EAST SIDE COMMUNITY SCHOOL'
 'FORSYTH SATELLITE ACADEMY' 'MARTA VALLE HIGH SCHOOL'
 'LOWER EAST SIDE PREPARATORY HIGH SCHOOL'
 'NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND MATH HIGH SCHOOL'
 'CASCADES HIGH SCHOOL' 'BARD HIGH SCHOOL EARLY COLLEGE'
 '47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECONDARY SCHOOL']

--- Num of SAT Test Takers ---
175 unique values
['29' '91' '70' '7' '44' '112' '159' '18' '130' '16']

--- SAT Critical Reading Avg. Score ---
164 unique values
['355' '383' '377' '414' '390' '332' '522' '417' '624' '395']

--- SAT Math Avg. Score ---
177 unique values
['404' '423' '402' '401' '433' '557' '574' '418' '604' '400']

--- SAT Writing Avg. Score ---
163 unique values
['363' '366' '370' '359' '384' '316' '525' '411' '62

🧹 **Value Assesment:**

* `DBN` and `SCHOOL NAME` each have 478 unique values — no duplicates, both clean and consistent values.

* `contact_extension` has only 3 distinct formats and is non-analytical — can be removed.

* `pct_students_tested` uses percentage strings ("78%", "92%", "85%") and includes missing values — will need to be cleaned and converted to numeric (e.g., 0.78, 0.92, 0.85).


### Data Types

In [9]:
print("Data types overview:\n", df.dtypes)
print("\nUnique values per column:")
print(df.nunique())

Data types overview:
 DBN                                 object
SCHOOL NAME                         object
Num of SAT Test Takers              object
SAT Critical Reading Avg. Score     object
SAT Math Avg. Score                 object
SAT Writing Avg. Score              object
SAT Critical Readng Avg. Score      object
internal_school_id                   int64
contact_extension                   object
pct_students_tested                 object
academic_tier_rating               float64
dtype: object

Unique values per column:
DBN                                478
SCHOOL NAME                        478
Num of SAT Test Takers             175
SAT Critical Reading Avg. Score    164
SAT Math Avg. Score                177
SAT Writing Avg. Score             163
SAT Critical Readng Avg. Score     164
internal_school_id                 478
contact_extension                    3
pct_students_tested                  3
academic_tier_rating                 4
dtype: int64


🧹 **Data Types:**

* **Convert to Numeric**: 

  * `Num of SAT Test Takers` → Int64 (nullable integer)

  * `SAT Critical Reading Avg. Score` → Int64 (or float64)

  * `SAT Math Avg. Score` → Int64 (or float64)

  * `SAT Writing Avg. Score` → Int64 (or float64)

  * `pct_students_tested` (values like 78%, Missing value) → float64 after stripping % and coercing

* **Keep as String (pandas StringDtype)**:

  * `DBN` → string (preserves leading zeros)

  * `SCHOOL NAME` → string

  * `contact_extension` (values like x345) → string

### Missing values

In [10]:
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 DBN                                  0
SCHOOL NAME                          0
Num of SAT Test Takers               0
SAT Critical Reading Avg. Score      0
SAT Math Avg. Score                  0
SAT Writing Avg. Score               0
SAT Critical Readng Avg. Score       0
internal_school_id                   0
contact_extension                  105
pct_students_tested                117
academic_tier_rating                91
dtype: int64


🧹 **Missing values**: Three columns have missing data - `contact_extension` (105),  `pct_students_tested` (117), and `academic_tier_rating` (91).


In [11]:
# Summary of missing values in percentage
missing_summary = (
    df.isnull().mean().sort_values(ascending=False) * 100
).round(2)
missing_summary

pct_students_tested                23.73
contact_extension                  21.30
academic_tier_rating               18.46
DBN                                 0.00
SCHOOL NAME                         0.00
Num of SAT Test Takers              0.00
SAT Critical Reading Avg. Score     0.00
SAT Math Avg. Score                 0.00
SAT Writing Avg. Score              0.00
SAT Critical Readng Avg. Score      0.00
internal_school_id                  0.00
dtype: float64

🧹**Missing values**: We will drop the missing values when cleaning since they represent a small portion of the dataset and are not essential for the main SAT analysis.

### Duplicate Values

In [12]:
print("Duplicate rows:", df.duplicated().sum())

Duplicate rows: 15


In [13]:
# Display duplicate rows
df[df.duplicated(keep=False)].sort_values("DBN").head(10)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
35,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,390,166135,x123,78%,2.0
486,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,390,166135,x123,78%,2.0
52,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
484,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
99,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
487,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
481,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,364,277389,x345,92%,
492,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,364,277389,x345,92%,


🧹 **Duplicates**: There are 15 duplicate rows that can safely be dropped from the dataset.

## Data Cleaning

* Normalize column names

* Drop duplicate column: `SAT Critical Readng Avg. Score`

* Drop non analytical column: `contact_extension`

* Convert columns stored as object to appropriate data type (numeric or string)

* Drop rows with missing values in columns: `pct_students_tested` and `academic_tier_rating`

* Drop 15 duplicate rows (they contain identical data for the same DBN and School Name)

* Ensure all SAT-related numeric values fall within expected ranges (200-800)

* Confirm all categorical text data is normalized (no extra whitespace)

In [14]:
# Create a copy for cleaning
df_clean = df.copy()

### Normalize column names

➡️ Remove whitespace, lowercase, replace space with underscore, and remove punctuation. 

In [15]:
df_clean.columns = df_clean.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(".", "", regex=False)
df_clean.columns.tolist()

['dbn',
 'school_name',
 'num_of_sat_test_takers',
 'sat_critical_reading_avg_score',
 'sat_math_avg_score',
 'sat_writing_avg_score',
 'sat_critical_readng_avg_score',
 'internal_school_id',
 'contact_extension',
 'pct_students_tested',
 'academic_tier_rating']

### Drop Columns

➡️ Duplicate column sat_critical_readng_avg_score and non-analytical column contact_extension.

In [16]:
# Drop duplicate column
df_clean = df_clean.drop(columns=["sat_critical_readng_avg_score"])

# Drop non-analytical column
df_clean = df_clean.drop(columns=["contact_extension"])

### Convert columns to appropriate data types

In [17]:
# Text Columns
for c in ["dbn", "school_name"]:
    if c in df_clean.columns:
        df_clean[c] = df_clean[c].astype("string").str.strip().str.replace(r"\s+", " ", regex=True)

# Numeric Columns
num_int_cols = [
    "num_of_sat_test_takers",
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
]
for c in num_int_cols:
    if c in df_clean.columns:
        df_clean[c] = pd.to_numeric(df_clean[c], errors="coerce").astype("Int64")

# Percentage Column
df_clean["pct_students_tested"] = (df_clean["pct_students_tested"].astype("string").str.strip().str.rstrip("%"))
df_clean["pct_students_tested"] = pd.to_numeric(df_clean["pct_students_tested"], errors="coerce") / 100.0

In [18]:
df_clean['academic_tier_rating']

0      2.0
1      3.0
2      3.0
3      4.0
4      2.0
      ... 
488    1.0
489    NaN
490    2.0
491    2.0
492    NaN
Name: academic_tier_rating, Length: 493, dtype: float64

### Drop rows with duplicate or missing values

In [19]:
# Drop duplicates
df_clean = df_clean.drop_duplicates()

# Drop missing values
df_clean = df_clean.dropna()

### Identify and handle invalid SAT scores (200-800)

➡️ We will check the range for the SAT score columns ans make sure they are valid, within range 200 to 800. Invalid rows will be dropped from the dataset.

In [20]:
score_cols = ["sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score"]

for col in score_cols:
    invalid_mask = (df_clean[col] < 200) | (df_clean[col] > 800)
    invalid_count = invalid_mask.sum()

    if invalid_count > 0:
        print(f"⚠️ {invalid_count} invalid values found in '{col}' — outside 200–800 range.")
        display(df_clean.loc[invalid_mask, ["dbn", "school_name", col]])
        df_clean = df_clean.loc[~invalid_mask]
    else:
        print(f"✅ All values in '{col}' are within the valid 200–800 range.")

✅ All values in 'sat_critical_reading_avg_score' are within the valid 200–800 range.
⚠️ 3 invalid values found in 'sat_math_avg_score' — outside 200–800 range.


Unnamed: 0,dbn,school_name,sat_math_avg_score
80,03M415,WADLEIGH SECONDARY SCHOOL FOR THE PERFORMING &...,850
422,28Q470,JAMAICA HIGH SCHOOL,999
434,29Q283,PREPARATORY ACADEMY FOR WRITERS: A COLLEGE BOA...,1100


✅ All values in 'sat_writing_avg_score' are within the valid 200–800 range.


🧹 **Invalid SAT scores:** Three invalid SAT Math scores were detected (values: 850, 999, 1100), which fall outside the valid SAT range of 200–800. These rows were removed to ensure all score values reflect realistic test results and maintain data integrity.

In [21]:
# Validation check SAT score ranges
for col in ["sat_critical_reading_avg_score", "sat_math_avg_score", "sat_writing_avg_score"]:
    print(f"{col}: min={df_clean[col].min()}, max={df_clean[col].max()}")

sat_critical_reading_avg_score: min=279, max=621
sat_math_avg_score: min=312, max=660
sat_writing_avg_score: min=286, max=638


### Identify and handle outliers

➡️ We will look for outliers, inconsistencies, and negative values to ensure all values in the data are plausible. 

In [22]:
# Summary statistics of numeric columns after cleaning
desc = df_clean[["num_of_sat_test_takers"] + score_cols].describe().T
print("\nSummary of numeric columns after cleaning:")
display(desc)



Summary of numeric columns after cleaning:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_of_sat_test_takers,266.0,105.981203,135.899226,6.0,37.25,61.0,99.0,888.0
sat_critical_reading_avg_score,266.0,399.203008,53.871301,279.0,368.0,390.0,414.0,621.0
sat_math_avg_score,266.0,412.409774,61.329249,312.0,371.0,398.0,436.0,660.0
sat_writing_avg_score,266.0,391.883459,55.749283,286.0,359.0,380.0,408.0,638.0


In [23]:
# Identify and handle outliers in 'num_of_sat_test_takers'
invalid_takers = df_clean.loc[(df_clean["num_of_sat_test_takers"] <= 0) | (df_clean["num_of_sat_test_takers"] > 1000)]
if not invalid_takers.empty:
    print(f"⚠️ Found {len(invalid_takers)} implausible 'num_of_sat_test_takers' values. Inspect:")
    display(invalid_takers)
    # Drop them if clearly erroneous
    df_clean = df_clean.loc[(df_clean["num_of_sat_test_takers"] > 0) & (df_clean["num_of_sat_test_takers"] <= 1000)]
else:
    print("✅ All 'num_of_sat_test_takers' values are plausible (1–1000).")

✅ All 'num_of_sat_test_takers' values are plausible (1–1000).


### Final Check

➡️ Make sure the data has been cleaned properly and is ready for export to the database. 

In [24]:
print("\nFinal cleaned dataset shape:", df_clean.shape)
print("\nRemaining missing values per column:\n", df_clean.isna().sum())
print("Duplicate rows after cleaning:", df_clean.duplicated().sum())
print("\nFinal data types overview:\n", df_clean.dtypes)
print("\nCleaned dataset preview:")
display(df_clean.head())


Final cleaned dataset shape: (266, 9)

Remaining missing values per column:
 dbn                               0
school_name                       0
num_of_sat_test_takers            0
sat_critical_reading_avg_score    0
sat_math_avg_score                0
sat_writing_avg_score             0
internal_school_id                0
pct_students_tested               0
academic_tier_rating              0
dtype: int64
Duplicate rows after cleaning: 0

Final data types overview:
 dbn                               string[python]
school_name                       string[python]
num_of_sat_test_takers                     Int64
sat_critical_reading_avg_score             Int64
sat_math_avg_score                         Int64
sat_writing_avg_score                      Int64
internal_school_id                         int64
pct_students_tested                      Float64
academic_tier_rating                     float64
dtype: object

Cleaned dataset preview:


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,218160,0.78,2.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,427826,0.92,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,672714,0.92,2.0
6,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159,522,574,525,697107,0.78,2.0
7,01M650,CASCADES HIGH SCHOOL,18,417,418,411,297600,0.92,4.0


## Schema

➡️ Final cleaned data that will be exported to database. 

<table>
  <thead>
    <tr>
      <th>Column</th>
      <th>Type</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr><td><code>dbn</code></td><td><code>TEXT</code></td><td>unique school identifier (primary key)</td></tr>
    <tr><td><code>school_name</code></td><td><code>TEXT</code></td><td>name of the school</td></tr>
    <tr><td><code>num_of_sat_test_takers</code></td><td><code>INTEGER</code></td><td>number of students taking SAT</td></tr>
    <tr><td><code>sat_critical_reading_avg_score</code></td><td><code>INTEGER</code></td><td>average reading score</td></tr>
    <tr><td><code>sat_math_avg_score</code></td><td><code>INTEGER</code></td><td>average math score</td></tr>
    <tr><td><code>sat_writing_avg_score</code></td><td><code>INTEGER</code></td><td>average writing score</td></tr>
    <tr><td><code>pct_students_tested</code></td><td><code>DOUBLE PRECISION</code></td><td>proportion of students tested (0–1)</td></tr>
    <tr><td><code>academic_tier_rating</code></td><td><code>DOUBLE PRECISION</code></td><td>school tier rating</td></tr>
    <tr><td><code>internal_school_id</code></td><td><code>BIGINT</code></td><td>internal identifier</td></tr>
  </tbody>
</table>

## Export clean data

In [None]:
# Export clean data to the database
df_clean.to_sql(
    name='dido_sat_results',       
    con=engine,     
    schema='nyc_schools',
    if_exists='replace',    
    index=False            
)

266

In [27]:
# Export clean data to CSV
df_clean.to_csv('day_4_datasets/cleaned_sat_results.csv', index=False)