# 📆 Day 4 - Sat Modeling

## 🔌 Load the data

In [2]:
# import libraries

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
import re

In [5]:
# import csv from github

url = 'https://raw.githubusercontent.com/SebastianNic/data_host/refs/heads/main/sat-results.csv'

sat_df_original = pd.read_csv(url)

In [6]:
# import additional csv for comparison

url_hsd = 'https://raw.githubusercontent.com/SebastianNic/data_host/refs/heads/main/high-school-directory.csv'
high_school_dir_df = pd.read_csv(url_hsd)

In [7]:
sat_df_original.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## 🛁 Clean the data

In [8]:
sat_df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


### 🗑️ Dropping abiguous columns or unnecessary columns

#### Ambigous columns
Appearently the 'SAT Critical Reading column is there twice. So I'll check wether or not the two columns match and if they do remove one instance.

In [9]:
#initiate cleaned df

sat_df_cleaned = sat_df_original.copy()

In [10]:
# check for match

sat_df_cleaned[sat_df_cleaned['SAT Critical Reading Avg. Score'] != sat_df_cleaned['SAT Critical Readng Avg. Score']]

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating


In [11]:
# both columns are identical in each single row, so I'll drop one

sat_df_cleaned.drop('SAT Critical Readng Avg. Score', axis=1, inplace=True)

The school name column might be ambigous since the school name is already in the high school dorectory table. But if there's schools in the sat table that dont appear in the hsd table i'll have to keep the column.

In [12]:
# creating a a df with dbn and name from high_school_directory table

dbn_name_list = high_school_dir_df[['dbn', 'school_name', 'total_students']]

In [13]:
# merging with sat table to see if every school is present

df_merged = dbn_name_list.merge(sat_df_cleaned, how='outer', left_on='dbn', right_on='DBN')

df_merged[df_merged['dbn'] == df_merged['DBN']] # unfortunately not

Unnamed: 0,dbn,school_name,total_students,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,Henry Street School for International Studies,323.0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,218160.0,x345,78%,2.0
1,01M448,University Neighborhood High School,299.0,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,268547.0,x234,,3.0
2,01M450,East Side Community School,649.0,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,236446.0,x123,,3.0
4,01M509,Marta Valle High School,401.0,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,672714.0,x123,92%,2.0
6,01M539,"New Explorations into Science, Technology and ...",1725.0,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159,522,574,525,697107.0,,78%,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,32K545,EBC High School for Public Service - Bushwick,513.0,32K545,EBC HIGH SCHOOL FOR PUBLIC SERVICE–BUSHWICK,88,384,409,361,221431.0,,85%,3.0
544,32K549,Bushwick School for Social Justice,416.0,32K549,BUSHWICK SCHOOL FOR SOCIAL JUSTICE,75,348,353,334,954491.0,x234,85%,3.0
545,32K552,Academy of Urban Planning,313.0,32K552,ACADEMY OF URBAN PLANNING,67,342,364,354,695572.0,x234,92%,3.0
546,32K554,All City Leadership Secondary School,333.0,32K554,ALL CITY LEADERSHIP SECONDARY SCHOOL,39,428,465,422,587220.0,,,2.0


#### Unnecessary columns

The following columns are irrelevant
* internal school id: since there's already an unique identifier with the dbn and the internal school id is generated by the DOE only for there internal use

* contact extension: is a phone number extension. not relevant for any analysis

* pct_students_tested: this column can easily be calculated with the number of total students and the number of students who got tested

* academic tear rating: the rating doesn't use an appearent pattern. Since there is no official unified school ranking in nyc or the us in general and its not comprehensive what this rating is based on, its getting dropped

In [14]:
sat_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   internal_school_id               493 non-null    int64  
 7   contact_extension                388 non-null    object 
 8   pct_students_tested              376 non-null    object 
 9   academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(8)
memory usage: 38.6+ KB


In [15]:
# drop all the according columns

list_of_cols_to_drop = ['internal_school_id', 'contact_extension', 
                        'pct_students_tested', 'academic_tier_rating']

sat_df_cleaned.drop(list_of_cols_to_drop, axis=1, inplace=True)

### ♻️ Handling duplicates and missing values

**Handling duplicates**

In [16]:
# check for duplicates

assert sat_df_cleaned.duplicated().any()

In [17]:
# create series with duplicate indexes

sat_duplicated = sat_df_cleaned.duplicated(['DBN', 'SCHOOL NAME'],keep=False) # create series with duplicate indexes

sat_duplicated_df = sat_df_cleaned[sat_duplicated] # subsetting df with series to see duplicate rows

sat_duplicated_df.sort_values('DBN').head() # checking

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
35,02M419,LANDMARK HIGH SCHOOL,62,390,399,381
486,02M419,LANDMARK HIGH SCHOOL,62,390,399,381
52,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393
484,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393


In [18]:
# time to remove duplicates

sat_df_cleaned.drop_duplicates(keep='first', inplace=True)

In [19]:
sat_df_cleaned.duplicated().any()

np.False_

**Handling missing values.**
The challenge is, that in the SAT columns there is missing values but they are marked with 's'.
Therefore the columns have the wrong data type and can not be used properly.
The approach is to replace all 's'-values with nan.

In [20]:
# create a function that will iterate thru all effected columns and replace s with nan

def replace_s_with_nan(df):
    for i in range(478):
        for n in range(2,6):
            if df.iloc[i,n] == 's':
                df.iloc[i,n] = np.nan
            else:
                continue

In [21]:
# run function and hope that it works

replace_s_with_nan(sat_df_cleaned)

In [22]:
sat_df_cleaned[sat_df_cleaned['Num of SAT Test Takers'].isnull()].head(3) # ...worked!

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
22,02M392,MANHATTAN BUSINESS ACADEMY,,,,
23,02M393,BUSINESS OF SPORTS SCHOOL,,,,
25,02M399,THE HIGH SCHOOL FOR LANGUAGE AND DIPLOMACY,,,,


### 💾 Update data types

Update data types of measure columns

In [23]:
sat_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           421 non-null    object
 3   SAT Critical Reading Avg. Score  421 non-null    object
 4   SAT Math Avg. Score              421 non-null    object
 5   SAT Writing Avg. Score           421 non-null    object
dtypes: object(6)
memory usage: 26.1+ KB


In [24]:
# selecting effected columns and changing data type to float

sat_df_cleaned[['Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 
                'SAT Math Avg. Score', 'SAT Writing Avg. Score']] = sat_df_cleaned[['Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 
                'SAT Math Avg. Score', 'SAT Writing Avg. Score']].astype('float')

In [25]:
sat_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              478 non-null    object 
 1   SCHOOL NAME                      478 non-null    object 
 2   Num of SAT Test Takers           421 non-null    float64
 3   SAT Critical Reading Avg. Score  421 non-null    float64
 4   SAT Math Avg. Score              421 non-null    float64
 5   SAT Writing Avg. Score           421 non-null    float64
dtypes: float64(4), object(2)
memory usage: 26.1+ KB


### 📝 rename columns in correct format

In [26]:
# create function to clean column names 

def clean_columns(x):

  x = x.replace(' ', '_') # replacing whitespaces with underscores

  x = x.lower() # making the strings lower

  x = re.sub('[^\w]', '', x) # removing special characters

  return x

In [27]:
# call function to clean column names 

sat_df_cleaned.rename(clean_columns, axis='columns', inplace=True)

### 📐 Outliers and Ranges

**Checking for realistic test result values.**
SAT results range from 200 to 800 points per field. Therefore some of the avg scores are simply impossible. The approach here is to correct every value below 200 to exaclty 200 hundred and the other way around for the maximum possible score.

In [54]:
sat_df_cleaned.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
count,421.0,421.0,421.0,421.0
mean,110.320665,400.850356,416.895487,393.985748
std,155.534254,56.802783,75.415096,58.635109
min,6.0,279.0,200.0,286.0
25%,41.0,368.0,372.0,360.0
50%,62.0,391.0,395.0,381.0
75%,95.0,416.0,438.0,411.0
max,1277.0,679.0,800.0,682.0


In [29]:
# create a function that iterates thru the df and corrects all values that fall out of the possible range

def correct_sat_results(y):
    
    for i in range(3,6): # iterates thru columns
        for n in range(0,478): # iterates thru rows

            if y.iloc[n,i] < 200: # checks for minimum
                y.iloc[n,i] = 200

            elif y.iloc[n,i] > 800: # checks for maximum
                y.iloc[n,i] = 800

            else:
                continue

In [30]:
# call the function

correct_sat_results(sat_df_cleaned)

In [31]:
sat_df_cleaned.describe()

Unnamed: 0,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
count,421.0,421.0,421.0,421.0
mean,110.320665,400.850356,416.895487,393.985748
std,155.534254,56.802783,75.415096,58.635109
min,6.0,279.0,200.0,286.0
25%,41.0,368.0,372.0,360.0
50%,62.0,391.0,395.0,381.0
75%,95.0,416.0,438.0,411.0
max,1277.0,679.0,800.0,682.0


**Checking for impossible numbers of test takers.** There is a total_students column in the high school dir dataframe. I intend to use it to check for realistic test takers numbers.

In [None]:
# merge with high_school_dir_df to get total count of students for each school
# and check wether or not the num of test takers exceeds that

sat_test_takers_check = sat_df_cleaned.merge(high_school_dir_df, 
                                             how='inner', on='dbn')[['dbn', 'total_students', 'num_of_sat_test_takers']]

In [52]:
# create a boolean array where every row with a higher number of test takers than total students is 
# marked as true

more_tests_than_students = np.where(
    sat_test_takers_check['total_students'] 
         >= sat_test_takers_check['num_of_sat_test_takers'], 
         False, True)

In [None]:
more_tests_than_students.sum()

np.int64(30)

So there are a few entries with incorrect num of test takers. Thinking about it however, it can't be asured that both numbers (total students and num of test takers) were recorded in the same year. Therefore I'll leave it as is.

## 📤 Upload cleaned dataset

In [55]:
# establish database connection

DATABASE_URL = (
    "postgresql+psycopg2://neondb_owner:npg_CeS9fJg2azZD"
    "@ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech:5432/neondb"
    "?sslmode=require"
    )

In [56]:
engine = sqlalchemy.create_engine(DATABASE_URL)

In [57]:
# upload sat_df_cleaned to database

sat_df_cleaned.to_sql(
    name = 'sebastian_sat_results',
    con=engine,
    schema='nyc_schools',
    if_exists='replace',
    index=False
)

478

Save csv on local

In [58]:
local_url = '/Users/bastianlenkers/Documents/Masterschool/Webeet/Onboarding_D4/sebastian_sat_results.csv'

sat_df_cleaned.to_csv(local_url, index=False)