Step 1: SAT Dataset: Load & Inspect

In [74]:
import pandas as pd

csv_path = "/Users/hebifou/Desktop/_onboarding_data/daily_tasks/day_4/day_4_datasets/sat-results.csv"

df = pd.read_csv(csv_path)

print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head()


Shape: (493, 11)
Columns: ['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


Step 2 – Basic Structure & Columns

In [76]:
#shape: number of rows & columns
print("Shape:", df.shape)

#column names
print("\nColumns:")
print(list(df.columns))

#data types & non-null counts
print("\nInfo:")
df.info()


Shape: (493, 11)

Columns:
['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'SAT Critical Readng Avg. Score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-

Step 2.1 – Missing Values & Duplicates

In [77]:

#missing values count
print("Missing values per column:")
print(df.isna().sum().sort_values(ascending=False))

#duplicate DBN check
dupe_count = df.duplicated(subset=["DBN"]).sum()
print(f"\nDuplicate DBN count: {dupe_count}")

if dupe_count > 0:
    print("\nDuplicate DBNs found:")
    display(df[df.duplicated(subset=["DBN"], keep=False)].sort_values("DBN"))


Missing values per column:
pct_students_tested                117
contact_extension                  105
academic_tier_rating                91
DBN                                  0
SCHOOL NAME                          0
Num of SAT Test Takers               0
SAT Critical Reading Avg. Score      0
SAT Math Avg. Score                  0
SAT Writing Avg. Score               0
SAT Critical Readng Avg. Score       0
internal_school_id                   0
dtype: int64

Duplicate DBN count: 15

Duplicate DBNs found:


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
35,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,390,166135,x123,78%,2.0
486,02M419,LANDMARK HIGH SCHOOL,62,390,399,381,390,166135,x123,78%,2.0
52,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
484,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
491,02M520,MURRY BERGTRAUM HIGH SCHOOL FOR BUSINESS CAREERS,264,407,440,393,407,892839,,92%,2.0
99,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
490,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
487,05M304,MOTT HALL HIGH SCHOOL,54,413,399,398,413,296405,x123,78%,2.0
481,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,364,277389,x345,92%,
492,07X221,SOUTH BRONX PREPARATORY: A COLLEGE BOARD SCHOOL,65,364,378,348,364,277389,x345,92%,


2.3 - Numeric Columns Check

In [78]:

score_cols = [
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score"
]
count_col = "Num of SAT Test Takers"

for col in [count_col] + score_cols:
    print(f"\n--- {col} ---")
    s = df[col].astype(str).str.strip()
    non_numeric = s[~s.str.match(r"^\d+$", na=False)]
    print(f"Non-numeric values sample: {non_numeric.unique()[:5]}")
    
    nums = pd.to_numeric(s, errors="coerce")
    print(f"Min: {nums.min()}, Max: {nums.max()}")
    
    if "SAT" in col:
        bad_range = df[(nums.notna()) & ~nums.between(200, 800)]
        print(f"Rows out of range (200–800): {len(bad_range)}")



--- Num of SAT Test Takers ---
Non-numeric values sample: ['s']
Min: 6.0, Max: 1277.0
Rows out of range (200–800): 387

--- SAT Critical Reading Avg. Score ---
Non-numeric values sample: ['s']
Min: 279.0, Max: 679.0
Rows out of range (200–800): 0

--- SAT Math Avg. Score ---
Non-numeric values sample: ['s' '-10']
Min: -10.0, Max: 1100.0
Rows out of range (200–800): 5

--- SAT Writing Avg. Score ---
Non-numeric values sample: ['s']
Min: 286.0, Max: 682.0
Rows out of range (200–800): 0


 Step 2.4 – Percentage Column Analysis

In [80]:
#converting to string for uniform processing
pct = df["pct_students_tested"].astype(str).str.strip()

print("=== Top 10 raw values ===")
print(pct.value_counts(dropna=False).head(10))

#checking valid percentage format
pct_is_pct = pct.str.match(r"^\d{1,3}%$")  # 0% to 999%, realistically < 100%
print(f"\nValid % format count: {pct_is_pct.sum()}")

#checking invalid or non-percentage values
pct_invalid = ~pct_is_pct & ~pct.isna()
print(f"Invalid/non-percent entries count: {pct_invalid.sum()}")

#sample invalid values
print("\nSample invalid values:")
print(pct[pct_invalid].unique()[:10])

#distribution for valid % values (converting to float and describe)
pct_numeric = pd.to_numeric(pct[pct_is_pct].str.replace("%", ""), errors="coerce")
print("\nValid % values statistics:")
print(pct_numeric.describe())


=== Top 10 raw values ===
pct_students_tested
78%    134
85%    125
nan    117
92%    117
Name: count, dtype: int64

Valid % format count: 376
Invalid/non-percent entries count: 117

Sample invalid values:
['nan']

Valid % values statistics:
count    376.000000
mean      84.683511
std        5.718120
min       78.000000
25%       78.000000
50%       85.000000
75%       92.000000
max       92.000000
Name: pct_students_tested, dtype: float64


Step 2.5 – Categorical Columns Analysis

In [81]:
#contact_extension analysis
print("=== contact_extension ===")
print(f"Non-null count: {df['contact_extension'].notna().sum()}")
print("Unique values (up to 10):")
print(df["contact_extension"].dropna().unique()[:10])
print("\nValue counts (top 10):")
print(df["contact_extension"].value_counts().head(10))

#academic_tier_rating analysis
print("\n=== academic_tier_rating ===")
print(f"Non-null count: {df['academic_tier_rating'].notna().sum()}")
print("Unique values:")
print(df["academic_tier_rating"].dropna().unique())
print("\nValue counts (including NaN):")
print(df["academic_tier_rating"].value_counts(dropna=False).sort_index())


=== contact_extension ===
Non-null count: 388
Unique values (up to 10):
['x345' 'x234' 'x123']

Value counts (top 10):
contact_extension
x345    144
x234    123
x123    121
Name: count, dtype: int64

=== academic_tier_rating ===
Non-null count: 402
Unique values:
[2. 3. 4. 1.]

Value counts (including NaN):
academic_tier_rating
1.0     93
2.0    101
3.0     96
4.0    112
NaN     91
Name: count, dtype: int64


Step 3 – Cleaning & Normalization 

In [82]:
#droping irrelevant columns
df_clean = df.drop(columns=["SAT Critical Readng Avg. Score", "contact_extension"])

#removing exact duplicate rows (keep first occurrence)
df_clean = df_clean.drop_duplicates()

#converting numeric columns to numeric type, invalid → NaN
num_cols = [
    "Num of SAT Test Takers",
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score"
]

for col in num_cols:
    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")

#SAT score range check (200–800)
score_cols = [
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score"
]
for col in score_cols:
    df_clean.loc[~df_clean[col].between(200, 800, inclusive="both"), col] = np.nan

#converting pct_students_tested to decimal
def pct_to_float(x):
    if pd.isna(x):
        return np.nan
    try:
        return float(str(x).replace("%", "").strip()) / 100
    except:
        return np.nan

df_clean["pct_students_tested"] = df_clean["pct_students_tested"].apply(pct_to_float)

#normalizing column names to snake_case
df_clean.columns = (
    df_clean.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace(".", "", regex=False)
)

#preview cleaned data
df_clean.head()


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score,internal_school_id,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29.0,355.0,404.0,363.0,218160,0.78,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91.0,383.0,423.0,366.0,268547,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70.0,377.0,402.0,370.0,236446,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7.0,414.0,401.0,359.0,427826,0.92,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44.0,390.0,433.0,384.0,672714,0.92,2.0


In [83]:
output_path = "cleaned_sat_results.csv"
df_clean.to_csv(output_path, index=False)
print(f"Cleaned CSV saved to: {output_path}")


Cleaned CSV saved to: cleaned_sat_results.csv


In [84]:
print(df_clean.columns.tolist())


['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score', 'internal_school_id', 'pct_students_tested', 'academic_tier_rating']


In [None]:
import psycopg2

#connecting to PostgreSQL on Render
conn = psycopg2.connect(
    host="dpg-d2e5uuer433s73d38hbg-a.frankfurt-postgres.render.com",
    port="5432",
    dbname="sat_scores_db",
    user="sat_scores_db_user",
    password="H1TSGu65ibUgm1YjymgaotoB0hR7CRtG"
)
cur = conn.cursor()

#dropping table if exists (only if re-creating)
cur.execute("DROP TABLE IF EXISTS cleaned_sat_results;")

#creating table with TEXT columns
create_table_query = """
CREATE TABLE IF NOT EXISTS cleaned_sat_results (
    dbn TEXT PRIMARY KEY,
    school_name TEXT,
    num_of_sat_test_takers TEXT,
    sat_critical_reading_avg_score TEXT,
    sat_math_avg_score TEXT,
    sat_writing_avg_score TEXT,
    internal_school_id TEXT,
    pct_students_tested TEXT,
    academic_tier_rating TEXT
);
"""
cur.execute(create_table_query)
conn.commit()

print("Table 'cleaned_sat_results' created with all columns as TEXT!")

#closing connection
cur.close()
conn.close()


Table 'cleaned_sat_results' created with all columns as TEXT!


In [85]:


#loading the cleaned CSV file
df = pd.read_csv("cleaned_sat_results.csv")

#connecting to PostgreSQL database on Render
conn = psycopg2.connect(
    host="dpg-d2e5uuer433s73d38hbg-a.frankfurt-postgres.render.com",
    port="5432",
    dbname="sat_scores_db",
    user="sat_scores_db_user",
    password="H1TSGu65ibUgm1YjymgaotoB0hR7CRtG"
)
cur = conn.cursor()

#inserting data into the 'cleaned_sat_results' table
for _, row in df.iterrows():
    cur.execute("""
        INSERT INTO cleaned_sat_results (
            dbn, school_name, num_of_sat_test_takers, 
            sat_critical_reading_avg_score, sat_math_avg_score, sat_writing_avg_score,
            internal_school_id, pct_students_tested, academic_tier_rating
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON CONFLICT (dbn) DO NOTHING;
    """, tuple(row))

conn.commit()
print("Data from cleaned_sat_results.csv successfully inserted into 'cleaned_sat_results'!")

#preview first 5 rows from DB
cur.execute("SELECT * FROM cleaned_sat_results LIMIT 5;")
for row in cur.fetchall():
    print(row)

#closing the database connection
cur.close()
conn.close()


Data from cleaned_sat_results.csv successfully inserted into 'cleaned_sat_results'!
('01M292', 'HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES', '29.0', '355.0', '404.0', '363.0', '218160', '0.78', '2.0')
('01M448', 'UNIVERSITY NEIGHBORHOOD HIGH SCHOOL', '91.0', '383.0', '423.0', '366.0', '268547', 'NaN', '3.0')
('01M450', 'EAST SIDE COMMUNITY SCHOOL', '70.0', '377.0', '402.0', '370.0', '236446', 'NaN', '3.0')
('01M458', 'FORSYTH SATELLITE ACADEMY', '7.0', '414.0', '401.0', '359.0', '427826', '0.92', '4.0')
('01M509', 'MARTA VALLE HIGH SCHOOL', '44.0', '390.0', '433.0', '384.0', '672714', '0.92', '2.0')


Note: 

In my PostgreSQL table "cleaned_sat_results", I decided to store all columns as "TEXT" to avoid
the "NumericValueOutOfRange" error I was getting when trying to insert certain large values.  
The local file "cleaned_sat_results.csv" still keeps proper numeric types (integers, floats) so I
can run calculations in Pandas without any issues.  
If I need to do math directly in SQL later, I’ll just cast the values to numbers in the query.
