### **Import Libraries**

In [30]:
import os

import pandas as pd
from sqlalchemy import create_engine

### **Load Dataset**

In [14]:
# Read the raw SAT results file into a DataFrame and preview the first rows.
raw_csv_path = 'sat-results.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


### **Explore the dataset**

In [15]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


In [16]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

DBN                                  0
SCHOOL NAME                          0
Num of SAT Test Takers               0
SAT Critical Reading Avg. Score      0
SAT Math Avg. Score                  0
SAT Writing Avg. Score               0
SAT Critical Readng Avg. Score       0
internal_school_id                   0
contact_extension                  105
pct_students_tested                117
academic_tier_rating                91
dtype: int64

### **Clean The Dataset**

##### Normalize column headings

In [17]:
# Standardize the headers step by step: trim surrounding whitespace, lowercase,
# turn spaces into underscores, then remove every character that is not a
# letter, a digit or an underscore.
normalized_headers = df.columns.str.strip().str.lower()
normalized_headers = normalized_headers.str.replace(" ", "_")
normalized_headers = normalized_headers.str.replace("[^0-9a-zA-Z_]", "", regex=True)
df.columns = normalized_headers

In [18]:
# Preview the frame to confirm the renamed headers.
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,sat_critical_readng_avg_score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


##### Column Duplication Issue

We identified that the columns 'sat_critical_readng_avg_score' and 'sat_critical_reading_avg_score' are identical. The column 'sat_critical_readng_avg_score' contains a typo and will be removed to avoid redundancy.

In [19]:
# Confirm that the misspelled "readng" column is an exact copy of the correctly
# spelled reading column before discarding it.
reading_col = 'sat_critical_reading_avg_score'
typo_col = 'sat_critical_readng_avg_score'

if not {reading_col, typo_col}.issubset(df.columns):
    print("One or both columns are missing. No action taken.")
elif df[reading_col].equals(df[typo_col]):
    print("Columns are identical. Dropping the duplicate column.")
    df = df.drop(columns=[typo_col])
else:
    print("Warning: Columns have different values.")


Columns are identical. Dropping the duplicate column.


In [20]:
# Columns that are not needed for the SAT score analysis.
columns_to_drop = [
    "sat_critical_readng_avg_score",  # misspelled duplicate of the reading score column
    "internal_school_id",             # dbn is kept as the school identifier instead
    "contact_extension",              # not used in the analysis (388 of 493 values are populated)
]

# errors="ignore" skips names that are already absent, so the cell can be re-run.
df = df.drop(columns=columns_to_drop, errors="ignore")

##### Data Type Adjustments

Some columns are stored as text and need to be converted to numeric types for accurate analysis. Entries that cannot be parsed as numbers are coerced to NaN; they are not replaced with 0.

In [21]:
# Trim any whitespace around the column names and show the resulting headers.
df.columns = df.columns.str.strip()
print(df.columns)

# Score and test-taker columns that were loaded as text.
columns_to_change = ['sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score', 'num_of_sat_test_takers']

# Convert one column at a time; errors='coerce' turns unparseable entries into NaN.
for column in columns_to_change:
    df[column] = pd.to_numeric(df[column], errors='coerce').astype('float64')

df.dtypes

Index(['dbn', 'school_name', 'num_of_sat_test_takers',
       'sat_critical_reading_avg_score', 'sat_math_avg_score',
       'sat_writing_avg_score', 'pct_students_tested', 'academic_tier_rating'],
      dtype='object')


dbn                                object
school_name                        object
num_of_sat_test_takers            float64
sat_critical_reading_avg_score    float64
sat_math_avg_score                float64
sat_writing_avg_score             float64
pct_students_tested                object
academic_tier_rating              float64
dtype: object

#### Handling Percentage Values

The column 'pct_students_tested' contains percentage signs ('%') that need to be removed for accurate numerical analysis. The '%' sign is stripped and the values are converted to a numeric type (float). Missing values are left as NaN at this step.

In [22]:
# Remove the '%' characters, then cast the remaining text to float.
# Missing entries stay NaN.
pct_as_text = df['pct_students_tested'].str.replace('%', '')
df['pct_students_tested'] = pct_as_text.astype(float)

# Preview the converted column
df['pct_students_tested'].head()

0    78.0
1     NaN
2     NaN
3    92.0
4    92.0
Name: pct_students_tested, dtype: float64

##### Handle Duplicates

In [23]:
# Fail fast with a clear message if `df` is not a DataFrame at this point.
# (The previous guard called df.drop_duplicates() only when df was None or not
# a DataFrame, which could never succeed.)
if not isinstance(df, pd.DataFrame):
    raise TypeError(f"Expected df to be a pandas DataFrame, got {type(df).__name__}")

# Report how many rows are exact repeats of an earlier row, and the current shape.
# This cell only reports; it does not remove any rows.
print(df.duplicated().sum())
print(df.shape)

# Display the first few rows of the DataFrame
df.head()

15
(493, 8)


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0,2.0


In [24]:
# Keep the first occurrence of each fully identical row and discard the repeats;
# the original index labels are preserved.
df = df.drop_duplicates(keep="first", ignore_index=False)
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0,2.0


In [25]:
# Count the missing values that remain in each column after type conversion.
df.isna().sum()

dbn                                 0
school_name                         0
num_of_sat_test_takers             57
sat_critical_reading_avg_score     57
sat_math_avg_score                 57
sat_writing_avg_score              57
pct_students_tested               115
academic_tier_rating               86
dtype: int64

In [26]:
# Print dtypes and non-null counts again to verify the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             478 non-null    object 
 1   school_name                     478 non-null    object 
 2   num_of_sat_test_takers          421 non-null    float64
 3   sat_critical_reading_avg_score  421 non-null    float64
 4   sat_math_avg_score              421 non-null    float64
 5   sat_writing_avg_score           421 non-null    float64
 6   pct_students_tested             363 non-null    float64
 7   academic_tier_rating            392 non-null    float64
dtypes: float64(6), object(2)
memory usage: 33.6+ KB


##### Handle Outliers
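Filter invalid SAT scores: keep only rows where every SAT section score is within the valid range (200–800). Rows with a missing score in any section are removed as well, because a missing value cannot satisfy the range check.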


In [27]:
# Inclusive lower and upper bounds for a valid SAT section score.
valid_score_range = (200, 800)

# SAT section columns that must all fall inside the valid range.
sat_cols = ['sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score']

# Build one row mask: True only when every section score lies within the bounds.
# NaN never satisfies `between`, so rows with a missing score are removed too.
min_score, max_score = valid_score_range
in_range = df[sat_cols].apply(lambda scores: scores.between(min_score, max_score)).all(axis=1)

# .copy() makes the filtered result an independent frame, so later column
# assignments do not act on a slice of the unfiltered data.
df = df[in_range].copy()

# Verify the filtered dataset. info() prints its own report and returns None,
# so it is called directly rather than wrapped in print().
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 416 entries, 0 to 477
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             416 non-null    object 
 1   school_name                     416 non-null    object 
 2   num_of_sat_test_takers          416 non-null    float64
 3   sat_critical_reading_avg_score  416 non-null    float64
 4   sat_math_avg_score              416 non-null    float64
 5   sat_writing_avg_score           416 non-null    float64
 6   pct_students_tested             313 non-null    float64
 7   academic_tier_rating            349 non-null    float64
dtypes: float64(6), object(2)
memory usage: 29.2+ KB
None
      dbn                                    school_name  \
0  01M292  HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES   
1  01M448            UNIVERSITY NEIGHBORHOOD HIGH SCHOOL   
2  01M450                     EAST SIDE COMMUNITY SCH

##### Create Total SAT Score Column

In [28]:
# Add the section averages row by row; min_count=1 yields NaN instead of 0
# for a row in which every section score is missing.
section_scores = df[sat_cols]
df["sat_total_avg_score"] = section_scores.sum(axis=1, min_count=1)
df.head()

Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,pct_students_tested,academic_tier_rating,sat_total_avg_score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,78.0,2.0,1122.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,,3.0,1172.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,,3.0,1149.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,92.0,4.0,1174.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,92.0,2.0,1207.0


#### Save Cleaned Data to CSV

In [29]:
# Write the cleaned frame to disk without the index column, then confirm.
df.to_csv(path_or_buf="cleaned_sat_results.csv", index=False)
print("✅ Cleaned data saved to cleaned_sat_results.csv")


✅ Cleaned data saved to cleaned_sat_results.csv


In [31]:
# Read the SQLAlchemy connection string from the environment so that no
# credentials are stored in the notebook.
# Expected format:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/<database>?sslmode=require
# NOTE: the password that was previously hard-coded in this cell should be
# treated as leaked and rotated.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Export the PostgreSQL connection string "
        "as the DATABASE_URL environment variable before running this cell."
    )

# Create the engine used by later cells to write to the database.
engine = create_engine(DATABASE_URL)

In [34]:
# Write the cleaned frame to the nyc_schools schema. if_exists="replace" drops
# and recreates the table if it already exists; the index is not stored.
df.to_sql(
    name="abida_sultana_sat_scores",
    con=engine,
    schema='nyc_schools',
    if_exists="replace",
    index=False,
)
print("Data uploaded")

Data uploaded
