In [12]:

# Import necessary libraries
import pandas as pd
from sqlalchemy import create_engine


## Step 1: Load the Dataset

In [13]:

# Load the CSV file
df = pd.read_csv(r"C:\Users\oytun\OneDrive\Masaüstü\Intership Fıles\sat-results.csv")
df.head()


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,SAT Critical Readng Avg. Score,internal_school_id,contact_extension,pct_students_tested,academic_tier_rating
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363,355,218160,x345,78%,2.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366,383,268547,x234,,3.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370,377,236446,x123,,3.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359,414,427826,x123,92%,4.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384,390,672714,x123,92%,2.0


## Step 2: Explore the Data

In [14]:

# View data types and basic statistics
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              493 non-null    object 
 1   SCHOOL NAME                      493 non-null    object 
 2   Num of SAT Test Takers           493 non-null    object 
 3   SAT Critical Reading Avg. Score  493 non-null    object 
 4   SAT Math Avg. Score              493 non-null    object 
 5   SAT Writing Avg. Score           493 non-null    object 
 6   SAT Critical Readng Avg. Score   493 non-null    object 
 7   internal_school_id               493 non-null    int64  
 8   contact_extension                388 non-null    object 
 9   pct_students_tested              376 non-null    object 
 10  academic_tier_rating             402 non-null    float64
dtypes: float64(1), int64(1), object(9)
memory usage: 42.5+ KB


Unnamed: 0,internal_school_id,academic_tier_rating
count,493.0,402.0
mean,562172.943205,2.564677
std,262138.627055,1.126443
min,101855.0,1.0
25%,332013.0,2.0
50%,587220.0,3.0
75%,782993.0,4.0
max,999398.0,4.0


In [15]:
# After you run your normalization…
df.columns = df.columns.str.strip() \
                       .str.lower() \
                       .str.replace(' ', '_') \
                       .str.replace(r'[^0-9a-z_]+', '', regex=True)

# Now inspect
print(df.columns.tolist())


['dbn', 'school_name', 'num_of_sat_test_takers', 'sat_critical_reading_avg_score', 'sat_math_avg_score', 'sat_writing_avg_score', 'sat_critical_readng_avg_score', 'internal_school_id', 'contact_extension', 'pct_students_tested', 'academic_tier_rating']


In [16]:
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
dbn                                 0
school_name                         0
num_of_sat_test_takers              0
sat_critical_reading_avg_score      0
sat_math_avg_score                  0
sat_writing_avg_score               0
sat_critical_readng_avg_score       0
internal_school_id                  0
contact_extension                 105
pct_students_tested               117
academic_tier_rating               91
dtype: int64


## Step 3: Clean the Data

In [17]:

# Convert SAT scores to numeric, coerce invalid entries to NaN
df['sat_math_avg_score'] = pd.to_numeric(df['sat_math_avg_score'], errors='coerce')
df['sat_critical_readng_avg_score'] = pd.to_numeric(df['sat_critical_readng_avg_score'], errors='coerce')
df['sat_writing_avg_score'] = pd.to_numeric(df['sat_writing_avg_score'], errors='coerce')

# Normalize column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Select relevant columns
selected_columns = [
    'dbn',
    'school_name',
    'sat_math_avg_score',
    'sat_critical_reading_avg_score',
    'sat_writing_avg_score'
]
cleaned_df = df[selected_columns]

# Preview cleaned data
cleaned_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dbn                             493 non-null    object 
 1   school_name                     493 non-null    object 
 2   sat_math_avg_score              435 non-null    float64
 3   sat_critical_reading_avg_score  493 non-null    object 
 4   sat_writing_avg_score           435 non-null    float64
dtypes: float64(2), object(3)
memory usage: 19.4+ KB


In [18]:
# Select relevant columns
selected_columns = [
    'dbn',
    'school_name',
    'sat_math_avg_score',
    'sat_critical_reading_avg_score',
    'sat_writing_avg_score'
]
cleaned_df = df[selected_columns]


## Step 4: Save Cleaned Data to CSV

In [19]:

# Save cleaned data to CSV
cleaned_df.to_csv('cleaned_sat_results.csv', index=False)


## Step 5: Insert Cleaned Data into PostgreSQL

In [None]:

# Connect to PostgreSQL and insert data
# Replace with your actual credentials
engine = create_engine('postgresql://oytuncevre@localhost:8888/sat_data')

# Append data to 'sat_scores' table
cleaned_df.to_sql('sat_scores', engine, if_exists='append', index=False)
