# 📆 Day 4 - Sat Modeling

## 🔌 Load the data

In [91]:
# import libraries

import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy

In [None]:
# import csv from github
#
# The URL previously carried a `?token=...` query parameter. An access token is
# a credential and must not live in the notebook, so it is taken from the
# GITHUB_TOKEN environment variable instead (leave it unset if the file is
# publicly readable).

url = 'https://raw.githubusercontent.com/webeet-io/_onboarding_data/refs/heads/main/daily_tasks/day_4/day_4_datasets/sat-results.csv'

github_token = os.environ.get('GITHUB_TOKEN')

# for http(s) URLs pandas forwards storage_options as request headers
request_headers = {'Authorization': f'token {github_token}'} if github_token else None

sat_df_original = pd.read_csv(url, storage_options=request_headers)

In [11]:
# import additional csv for comparison
#
# As with any credential, the access token is not hard-coded in the URL; it is
# read from the GITHUB_TOKEN environment variable (leave it unset if the file
# is publicly readable).

url_hsd = 'https://raw.githubusercontent.com/webeet-io/_onboarding_data/refs/heads/main/daily_tasks/day_2/day_2_datasets/high-school-directory.csv'

github_token = os.environ.get('GITHUB_TOKEN')

# for http(s) URLs pandas forwards storage_options as request headers
request_headers = {'Authorization': f'token {github_token}'} if github_token else None

high_school_dir_df = pd.read_csv(url_hsd, storage_options=request_headers)

In [4]:
# preview the first rows of the raw SAT table
sat_df_original.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## 🛁 Clean the data

In [5]:
# column overview of the raw table: dtypes and non-null counts
sat_df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


### 🗑️ Dropping ambiguous or unnecessary columns

#### Ambiguous columns
Apparently the 'SAT Critical Reading Avg. Score' column is there twice (the second copy is spelled 'SAT Critical Readng Avg. Score'). So I'll check whether or not the two columns match and, if they do, remove one instance.

In [None]:
# start the cleaned frame as an independent copy so the raw data stays untouched

sat_df_cleaned = sat_df_original.copy(deep=True)

In [8]:
# show every row where the two reading-score columns disagree
# (an empty result means the columns are identical)

reading_cols_differ = sat_df_cleaned['SAT Critical Reading Avg. Score'].ne(
    sat_df_cleaned['SAT Critical Readng Avg. Score']
)
sat_df_cleaned[reading_cols_differ]

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating


In [9]:
# the two columns hold the same value in every row, so the misspelled copy is removed

sat_df_cleaned.drop(columns=['SAT Critical Readng Avg. Score'], inplace=True)

The school name column might be ambiguous, since the school name is already in the high school directory table. But if there are schools in the SAT table that don't appear in the directory table, I'll have to keep the column.

In [44]:
# subset of the high school directory: dbn, school name and total students

dbn_name_list = high_school_dir_df.loc[:, ['dbn', 'school_name', 'total_students']]

In [45]:
# outer-merge the directory subset with the SAT table on the DBN so that
# schools present in only one of the two tables are kept as well

df_merged = pd.merge(
    dbn_name_list,
    sat_df_cleaned,
    how='outer',
    left_on='dbn',
    right_on='DBN',
)

# rows where both sides carry the same DBN, i.e. schools found in both tables;
# the gaps in the index are the schools without a match -> unfortunately not every school is present
df_merged.loc[df_merged['dbn'].eq(df_merged['DBN'])]

Unnamed: 0,dbn,school_name,total_students,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,Henry Street School for International Studies,323.0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,218160.0,x345,78%,2.0
1,01M448,University Neighborhood High School,299.0,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,268547.0,x234,,3.0
2,01M450,East Side Community School,649.0,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,236446.0,x123,,3.0
4,01M509,Marta Valle High School,401.0,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,672714.0,x123,92%,2.0
6,01M539,"New Explorations into Science, Technology and ...",1725.0,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159,522,574,525,697107.0,,78%,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,32K545,EBC High School for Public Service - Bushwick,513.0,32K545,EBC HIGH SCHOOL FOR PUBLIC SERVICE–BUSHWICK,88,384,409,361,221431.0,,85%,3.0
529,32K549,Bushwick School for Social Justice,416.0,32K549,BUSHWICK SCHOOL FOR SOCIAL JUSTICE,75,348,353,334,954491.0,x234,85%,3.0
530,32K552,Academy of Urban Planning,313.0,32K552,ACADEMY OF URBAN PLANNING,67,342,364,354,695572.0,x234,92%,3.0
531,32K554,All City Leadership Secondary School,333.0,32K554,ALL CITY LEADERSHIP SECONDARY SCHOOL,39,428,465,422,587220.0,,,2.0


#### Unnecessary columns

The following columns are irrelevant
* internal school id: there's already a unique identifier (the DBN), and the internal school id is generated by the DOE only for their internal use

* contact extension: a phone number extension, not relevant for any analysis

* pct_students_tested: this column can easily be calculated from the total number of students and the number of students who were tested

* academic tier rating: the rating doesn't follow an apparent pattern. Since there is no official unified school ranking in NYC or the US in general, and it's not clear what this rating is based on, it's getting dropped

In [22]:
# column overview before dropping the unnecessary columns
sat_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   internal_school_id               493 non-null    int64  
 7   contact_extension                388 non-null    object 
 8   pct_students_tested              376 non-null    object 
 9   academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(8)
memory usage: 38.6+ KB


In [82]:
# remove the columns judged irrelevant for the analysis

list_of_cols_to_drop = [
    'internal_school_id',
    'contact_extension',
    'pct_students_tested',
    'academic_tier_rating',
]

sat_df_cleaned.drop(columns=list_of_cols_to_drop, inplace=True)

### ♻️ Handling duplicates and missing values

**Handling duplicates**

In [24]:
# check for duplicates: the assertion passes only if at least one row
# is an exact repeat of an earlier row

assert sat_df_cleaned.duplicated(keep='first').any()

In [38]:
# boolean mask flagging every row whose (DBN, SCHOOL NAME) pair occurs more than once;
# keep=False marks all occurrences, not only the repeats

sat_duplicated = sat_df_cleaned.duplicated(subset=['DBN', 'SCHOOL NAME'], keep=False)

# keep only the flagged rows so the duplicates can be inspected side by side
sat_duplicated_df = sat_df_cleaned.loc[sat_duplicated]

# sorting by DBN puts the copies of one school next to each other
sat_duplicated_df.sort_values(by='DBN').head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
35,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,166135,x123,78%,2.0
486,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,166135,x123,78%,2.0
52,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,892839,,92%,2.0
484,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,892839,,92%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,892839,,92%,2.0


In [None]:
# remove exact duplicate rows, keeping the first occurrence of each

sat_df_cleaned.drop_duplicates(inplace=True)

In [41]:
# confirm that no fully identical rows remain
sat_df_cleaned.duplicated().any()

np.False_

**Handling missing values.**
The challenge is that the SAT columns contain missing values, but they are marked with 's'.
Therefore the columns have the wrong data type and cannot be used properly.
The approach is to replace all 's' values with NaN.

In [67]:
# function that replaces the 's' marker with NaN in all affected columns

def replace_s_with_nan(df, columns=None, placeholder='s'):
    """Replace a placeholder value with NaN in the given columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable of column labels, optional
        Columns to scan. Defaults to the columns at positions 2 to 5
        (third to sixth column), which is where the SAT measures sit in
        this notebook.
    placeholder : str, default 's'
        Cell value that stands for a missing measurement. Only cells that
        are exactly equal to it are replaced.

    Returns
    -------
    None
    """
    if columns is None:
        columns = df.columns[2:6]

    for column in columns:
        is_placeholder = df[column] == placeholder
        # columns without any placeholder are left completely untouched
        if is_placeholder.any():
            # works for any number of rows; no hard-coded row count
            df.loc[is_placeholder, column] = np.nan

In [68]:
# apply the replacement to the cleaned frame (modifies sat_df_cleaned in place)

replace_s_with_nan(sat_df_cleaned)

In [77]:
# rows whose test-taker count is missing after the replacement ...worked!
sat_df_cleaned.loc[sat_df_cleaned['Num of SAT Test Takers'].isna()].head(3)

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
22,02M392,MANHATTAN BUSINESS ACADEMY,,,,,501072,,85%,
23,02M393,BUSINESS OF SPORTS SCHOOL,,,,,143204,x234,,2.0
25,02M399,THE HIGH SCHOOL FOR LANGUAGE AND DIPLOMACY,,,,,371710,x123,85%,


### 💾 Update data types

Update data types of measure columns

In [83]:
# dtypes before the conversion of the measure columns
sat_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           421 non-null    object
 3   SAT Critical Reading Avg. Score  421 non-null    object
 4   SAT Math Avg. Score              421 non-null    object
 5   SAT Writing Avg. Score           421 non-null    object
dtypes: object(6)
memory usage: 42.3+ KB


In [None]:
# convert the affected measure columns to float (NaN requires a float dtype here)

measure_columns = [
    'Num of SAT Test Takers',
    'SAT Critical Reading Avg. Score',
    'SAT Math Avg. Score',
    'SAT Writing Avg. Score',
]

sat_df_cleaned[measure_columns] = sat_df_cleaned[measure_columns].astype('float')

In [86]:
# verify the new dtypes of the measure columns
sat_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              478 non-null    object 
 1   SCHOOL NAME                      478 non-null    object 
 2   Num of SAT Test Takers           421 non-null    float64
 3   SAT Critical Reading Avg. Score  421 non-null    float64
 4   SAT Math Avg. Score              421 non-null    float64
 5   SAT Writing Avg. Score           421 non-null    float64
dtypes: float64(4), object(2)
memory usage: 42.3+ KB


### 📝 rename columns in correct format

In [92]:
# create function to clean column names

def clean_columns(x):
    """Turn a column label into a lower-case, underscore-separated name.

    Parameters
    ----------
    x : str
        Original column label, e.g. 'SAT Math Avg. Score'.

    Returns
    -------
    str
        The label with spaces replaced by underscores, lower-cased, and with
        every character that is not a word character removed,
        e.g. 'sat_math_avg_score'.
    """
    x = x.replace(' ', '_')  # replacing whitespaces with underscores

    x = x.lower()  # making the strings lower

    # raw string: '\w' inside a normal string literal is an invalid escape sequence
    x = re.sub(r'[^\w]', '', x)  # removing special characters

    return x

In [93]:
# apply clean_columns to every column label of the cleaned frame

sat_df_cleaned.rename(columns=clean_columns, inplace=True)

## 📤 Upload cleaned dataset

In [88]:
# establish database connection
#
# The connection string contains the database password, so it is read from the
# DATABASE_URL environment variable instead of being hard-coded here.
# Expected format:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/<database>?sslmode=require
# NOTE(review): the password that was previously committed in this cell should be rotated.

DATABASE_URL = os.environ.get('DATABASE_URL')

if not DATABASE_URL:
    raise RuntimeError('Set the DATABASE_URL environment variable before running this cell.')

In [89]:
# build the SQLAlchemy engine from DATABASE_URL
engine = sqlalchemy.create_engine(DATABASE_URL)

In [95]:
# upload sat_df_cleaned to database
# (an existing table of the same name is replaced; the DataFrame index is not written)

sat_df_cleaned.to_sql(
    'sebastian_sat_results',
    con=engine,
    index=False,
    if_exists='replace',
    schema='nyc_schools',
)

478

Save the CSV locally

In [96]:
# Write the CSV relative to a configurable output directory instead of an
# absolute path that only exists on one machine. By default the file lands
# in the current working directory; set SAT_OUTPUT_DIR to redirect it.
output_dir = Path(os.environ.get('SAT_OUTPUT_DIR', '.'))
output_dir.mkdir(parents=True, exist_ok=True)

local_url = str(output_dir / 'sebastian_sat_results.csv')

sat_df_cleaned.to_csv(local_url, index=False)