In [3]:
!pip install psycopg2-binary


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
#importing libraries
import pandas as pd
import seaborn as sns  #load dataset
import numpy as np #statistical calculations
import matplotlib.pyplot as plt # for visualization
import psycopg2

In [5]:
df = pd.read_csv("sat-results.csv")

In [6]:
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


## Handling the sat data

### 1. Rename Columns for Consistency

In [9]:
df.columns = (
    df.columns.str.lower()                                   # Make lowercase
              .str.replace(r'[^a-z0-9\s_]', '', regex=True)  # Remove special characters
              .str.replace(r'\s+', '_', regex=True)          # Replace spaces with underscores
)
df.columns

Index(['dbn', 'school_name', 'num_of_sat_test_takers',
       'sat_critical_reading_avg_score', 'sat_math_avg_score',
       'sat_writing_avg_score', 'sat_critical_readng_avg_score',
       'internal_school_id', 'contact_extension', 'pct_students_tested',
       'academic_tier_rating'],
      dtype='object')

In [11]:
## renaming column names
df.rename(columns={
   
    'num_of_sat_test_takers': 'num_sat_test_takers',
    'sat_critical_reading_avg_score': 'sat_critical_reading',
    'sat_math_avg_score': 'sat_math',
    'sat_writing_avg_score': 'sat_writing',
    'sat_critical_readng_avg_score': 'sat_critical_reading_alt',  # dropped later
    'contact_extension': 'contact_ext',
    'pct_students_tested': 'pct_tested',
    'academic_tier_rating': 'academic_tier'
}, inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   dbn                       493 non-null    object 
 1   school_name               493 non-null    object 
 2   num_sat_test_takers       493 non-null    object 
 3   sat_critical_reading      493 non-null    object 
 4   sat_math                  493 non-null    object 
 5   sat_writing               493 non-null    object 
 6   sat_critical_reading_alt  493 non-null    object 
 7   internal_school_id        493 non-null    int64  
 8   contact_ext               388 non-null    object 
 9   pct_tested                376 non-null    object 
 10  academic_tier             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


#### Drop Redundant or Duplicate Columns

In [13]:
df.drop(columns=['sat_critical_reading_alt'], inplace=True)


In [17]:
## Converting Relevant Columns to Numeric

cols_to_numeric = ['num_sat_test_takers', 'sat_critical_reading', 'sat_math', 'sat_writing', 'pct_tested']
for col in cols_to_numeric:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    
cols_to_numeric


['num_sat_test_takers',
 'sat_critical_reading',
 'sat_math',
 'sat_writing',
 'pct_tested']

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dbn                   493 non-null    object 
 1   school_name           493 non-null    object 
 2   num_sat_test_takers   435 non-null    float64
 3   sat_critical_reading  435 non-null    float64
 4   sat_math              435 non-null    float64
 5   sat_writing           435 non-null    float64
 6   internal_school_id    493 non-null    int64  
 7   contact_ext           388 non-null    object 
 8   pct_tested            0 non-null      float64
 9   academic_tier         402 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 38.6+ KB


### Checking and handling missing values

In [20]:
missing = df.isnull().sum()  #to check the null values in the dataset and their %
missing_percentage = (missing / len(df)) * 100
print(pd.concat([missing, missing_percentage.round(2)], axis=1, keys=["Missing", "%"]).sort_values(by="%", ascending=False))


                      Missing       %
pct_tested                493  100.00
contact_ext               105   21.30
academic_tier              91   18.46
num_sat_test_takers        58   11.76
sat_critical_reading       58   11.76
sat_math                   58   11.76
sat_writing                58   11.76
dbn                         0    0.00
school_name                 0    0.00
internal_school_id          0    0.00


In [21]:
# i drop the column entirely since it’s completely missing 
df.drop(columns=['pct_tested'], inplace=True)


In [22]:
# i replace these column with unknown and Keep for reference.

df['contact_ext'] = df['contact_ext'].fillna("Unknown")


In [23]:
# I keep this column and replace  missing with the median  to avoid skewing the dataset.

df['academic_tier'] = df['academic_tier'].fillna(df['academic_tier'].median())


In [24]:
# i observe that if num_sat_test_takers is missing → also SAT scores likely missing.

# so  drop these rows for the purpose of SAT analysis only.

df = df.dropna(subset=['num_sat_test_takers', 'sat_critical_reading', 'sat_math', 'sat_writing'])


In [25]:
df.isnull().sum()

dbn                     0
school_name             0
num_sat_test_takers     0
sat_critical_reading    0
sat_math                0
sat_writing             0
internal_school_id      0
contact_ext             0
academic_tier           0
dtype: int64

In [26]:
df.to_csv("sat_cleaned_results.csv", index=False)
