In [21]:
# Import necessary libraries
import pandas as pd

# Read the raw SAT results file into a DataFrame
raw_csv_path = 'sat-results.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [22]:
# Summarise each column's dtype and non-null count to spot columns that need cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


In [23]:
# Tally the null entries per column and print them under a heading
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
DBN                                  0
SCHOOL NAME                          0
Num of SAT Test Takers               0
SAT Critical Reading Avg. Score      0
SAT Math Avg. Score                  0
SAT Writing Avg. Score               0
SAT Critical Readng Avg. Score       0
internal_school_id                   0
contact_extension                  105
pct_students_tested                117
academic_tier_rating                91
dtype: int64


In [24]:
# Normalise column labels: trim surrounding whitespace, lowercase,
# turn spaces into underscores and drop periods
df.columns = [
    label.strip().lower().replace(' ', '_').replace('.', '')
    for label in df.columns
]

# Check new column names
df.columns

Index(['dbn', 'school_name', 'num_of_sat_test_takers',
       'sat_critical_reading_avg_score', 'sat_math_avg_score',
       'sat_writing_avg_score', 'sat_critical_readng_avg_score',
       'internal_school_id', 'contact_extension', 'pct_students_tested',
       'academic_tier_rating'],
      dtype='object')

In [25]:
# Discard the misspelled "readng" copy of the critical reading score column
df = df.drop(labels=['sat_critical_readng_avg_score'], axis='columns')

# Check
df.columns

Index(['dbn', 'school_name', 'num_of_sat_test_takers',
       'sat_critical_reading_avg_score', 'sat_math_avg_score',
       'sat_writing_avg_score', 'internal_school_id', 'contact_extension',
       'pct_students_tested', 'academic_tier_rating'],
      dtype='object')

In [26]:
# Convert SAT-related columns to numeric values
cols_to_numeric = [
    'num_of_sat_test_takers',
    'sat_critical_reading_avg_score',
    'sat_math_avg_score',
    'sat_writing_avg_score',
]

# Apply pd.to_numeric column by column; entries that cannot be parsed
# (e.g. the placeholder 's') are coerced to NaN instead of raising
df[cols_to_numeric] = df[cols_to_numeric].apply(pd.to_numeric, errors='coerce')

# check
df[cols_to_numeric].dtypes

Unnamed: 0,0
num_of_sat_test_takers,float64
sat_critical_reading_avg_score,float64
sat_math_avg_score,float64
sat_writing_avg_score,float64


In [27]:
# Adjust percentages and convert to float
# Step 1: remove the percent sign from the text values
pct_without_sign = df['pct_students_tested'].str.replace('%', '')
# Step 2: cast the remaining text to float (missing entries stay NaN)
df['pct_students_tested'] = pct_without_sign.astype(float)

# check
df['pct_students_tested'].head()

Unnamed: 0,pct_students_tested
0,78.0
1,
2,
3,92.0
4,92.0


In [28]:
# Remove columns that are not part of the target schema
df = df.drop(
    labels=['contact_extension', 'internal_school_id', 'academic_tier_rating'],
    axis='columns',
)

# check
df.columns

Index(['dbn', 'school_name', 'num_of_sat_test_takers',
       'sat_critical_reading_avg_score', 'sat_math_avg_score',
       'sat_writing_avg_score', 'pct_students_tested'],
      dtype='object')

In [29]:
# Preview the first rows of the frame after the cleaning steps above
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0


In [30]:
# Save the cleaned data set as CSV; index=False leaves the DataFrame's row index out of the file
df.to_csv('cleaned_sat_results.csv', index=False)

### SQL Schema Design

| Column Name                     | SQL Name                  | Type     | Description                            |
|--------------------------------|---------------------------|----------|----------------------------------------|
| dbn                            | dbn                       | TEXT     | Unique school code                     |
| school_name                    | school_name               | TEXT     | Full name of the high school           |
| num_of_sat_test_takers         | num_of_sat_test_takers    | INTEGER  | Number of students who took the SAT    |
| sat_critical_reading_avg_score | reading_score             | INTEGER  | Avg. score in Critical Reading         |
| sat_math_avg_score             | math_score                | INTEGER  | Avg. score in Math                     |
| sat_writing_avg_score          | writing_score             | INTEGER  | Avg. score in Writing                  |
| pct_students_tested            | pct_students_tested       | FLOAT    | % of students tested (0–100)           |

In [31]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Connecting to Neon
# The connection string contains the database password, so it must never be
# written into the notebook. It is read from the DATABASE_URL environment
# variable instead. Expected format:
#   postgresql+psycopg2://<user>:<password>@<host>/<database>?sslmode=require
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Export the Neon connection string as the "
        "DATABASE_URL environment variable before running this cell."
    )

# Load the cleaned CSV from the local working directory
df = pd.read_csv("cleaned_sat_results.csv")

# read_csv infers float64 for numeric columns that contain missing values.
# Cast the count/score columns to pandas' nullable integer dtype so they are
# uploaded as integer columns (matching the schema design) and missing
# values are stored as NULL.
integer_columns = [
    "num_of_sat_test_takers",
    "sat_critical_reading_avg_score",
    "sat_math_avg_score",
    "sat_writing_avg_score",
]
df = df.astype({column: "Int64" for column in integer_columns})

# Creating a connection
engine = create_engine(DATABASE_URL)

# Uploading to a table; if_exists="replace" drops and recreates sat_scores,
# so re-running the cell does not append duplicate rows
df.to_sql("sat_scores", con=engine, if_exists="replace", index=False)

493

## NYC SAT Dataset – Cleaning & Database Ingestion


### Cleaning Logic

- Converted column names to `snake_case` (lowercase, underscores, no special characters)
- Removed duplicate column `sat_critical_readng_avg_score` (typo version of another column)
- Converted SAT scores and test taker counts from strings to numeric values (non-numeric placeholders such as `s` are coerced to missing values)
- Cleaned `%` symbols from `pct_students_tested` and converted to float
- Dropped unused columns: `internal_school_id`, `contact_extension`, and `academic_tier_rating`

---

### Challenges

- SAT scores were stored as strings and needed conversion
- `%` characters in percentage columns required stripping
- Redundant or irrelevant columns needed to be excluded from the database schema


## Cloud Deployment via Neon

### SQLAlchemy connection string:
```python
from sqlalchemy import create_engine
import pandas as pd

# Placeholders only: never commit the real user, password, or host
DATABASE_URL = (
    "postgresql+psycopg2://<user>:<password>"
    "@<neon-host>/neondb?sslmode=require"
)

engine = create_engine(DATABASE_URL)
df = pd.read_csv("cleaned_sat_results.csv")
df.to_sql("sat_scores", con=engine, if_exists="replace", index=False)