<h1>Data Integration</h1>
Merge data files into one, based off of schools listed in the Forbes Rankings. Ensure that all data has been properly transferred to one sheet.

In [None]:
import pandas as pd
import json
import numpy as np
import requests
import time
from collections import defaultdict

In [144]:
# file names
SCHOOL_FED = "federal_college_data.csv"
SCHOOL_FORBES = "forbes_rankings.csv"
SCHOOL_RATINGS = "myplan_rankings.csv"
CITY_ZIPS = "us_cities_zip_county.csv"
CITY_ZIPS_JSON = "us_cities_zip_county.json"
CITY_CRIME = "us_cities_crime.csv"

# merged files
SCHOOL_ALL = "schools_merged.csv"
SCHOOL_COMPLETE = "schools_complete.csv"
CITY_CRIME_ZIPS = "us_cities_crime_zips.csv"
CITY_ZIPS_JSON_UPDATED = "updated_zips.json" # city state "xx" zip_code"xxxxx"
FINAL_CITY_CRIME_ZIPS = "us_cities_crime_zips2.csv"

<h3>Make List of All Unique School Names</h3>

In [8]:
# make a list of all schools to begin the standardization process and to refer to when merging

ALL_SCHOOLS = set()

def make_schools_list(files_dict: dict):
    for file, column in files_dict.items():
        df = pd.read_csv(file)
        df.columns = df.columns.str.lower().str.strip()
        column = column.lower().strip()

        if column not in df.columns:
            print(f"Column '{column}' not found in {file}")
            continue

        df[column] = df[column].astype(str).str.lower().str.strip()
        ALL_SCHOOLS.update(df[column].unique())

    return ALL_SCHOOLS

In [14]:
school_files_columns = {SCHOOL_FED: "School Name" , SCHOOL_FORBES: "name" , SCHOOL_RATINGS: "School" }
make_schools_list(school_files_columns)
ALL_SCHOOLS_LIST = list(ALL_SCHOOLS)
print(len(ALL_SCHOOLS_LIST))

6256


In [16]:
# quality check - make sure that ALL_SCHOOLS_LIST has no duplicates - passed!
school_set = set()
dups = []

for school in ALL_SCHOOLS_LIST:
    if school in school_set:
        dups.append(school)
    else:
        school_set.add(school)

print(dups)

[]


<h3>Label School Type</h3>

In [173]:
def label_school_type(filename):
    df = pd.read_csv(filename)
    df['Type'] = None # add new column

    # label as "Public" or "Private"
    df.loc[df['Average Net Price (Public)'].notna(), 'Type'] = 'Public'
    df.loc[df['Average Net Price (Private)'].notna(), 'Type'] = 'Private'

    df.to_csv(filename, index=False)

In [175]:
label_school_type(SCHOOL_FED)

<h3>Merge All School Files Into New File</h3>

In [26]:
def merge_school_data(file_column_dict, new_filename):
    dfs = []

    for i, (file, column_name) in enumerate(file_column_dict.items()):
        # read without changing integer types to float
        df = pd.read_csv(file, dtype=str)  # read all columns as strings to preserve exact values
        df = df.rename(columns={column_name: 'school'})
        df = df.rename(columns={col: f"{col}_{i+1}" for col in df.columns if col != 'school'})
        dfs.append(df)

    # merge all DataFrames on 'school' using outer join
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = pd.merge(merged_df, df, on='school', how='outer')

    merged_df.to_csv(new_filename, index=False)

    return merged_df

In [28]:
merge_school_data(school_files_columns, SCHOOL_ALL)

Unnamed: 0,Zip Code_1,school,City_1,State_1,Locale_1,Average Faculty Salary_1,Average SAT Score_1,Admission Rate_1,4-Year Completion Rate_1,Average Net Price (Public)_1,...,percentOfStudentsFinAid_2,percentOfStudentsGrant_2,Prestige_3,Satisfaction_3,Resources & Facilities_3,Safety_3,Teacher Support_3,School Administration_3,Campus Setting_3,Average Score_3
0,87110,a better u beauty barber academy,albuquerque,NM,11.0,,,,,,...,,,,,,,,,,
1,63501,a t still university of health sciences,kirksville,MO,33.0,10244.0,,,,,...,,,,,,,,,,
2,59526,aaniiih nakoda college,harlem,MT,43.0,4725.0,,,,11811.0,...,,,,,,,,,,
3,90703,abc adult school,cerritos,CA,21.0,,,,,16096.0,...,,,,,,,,,,
4,90703,abc adult school cabrillo lane,cerritos,CA,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6576,43701,zane state college,zanesville,OH,32.0,6001.0,,,0.281800391389,7423.0,...,,,,,,,,,,
6577,60201,zen shiatsu chicago,evanston,IL,13.0,,,,,,...,,,,,,,,,,
6578,84770,zion massage college,st. george,UT,12.0,,,,,,...,,,,,,,,,,
6579,90033,zms academy,los angeles,CA,11.0,,,,0.796296296296,,...,,,,,,,,,,


<h3>Make a New File With Schools that Have All Columns Filled</h3>

In [224]:
def get_complete_schools(filename, new_filename):
    df = pd.read_csv(filename, dtype=str)
    df = df.replace(r'^\s*$', np.nan, regex=True)

    allowed_empty = [
        'Average Net Price (Public)_1',
        'Average Net Price (Private)_1',
        'institutionType_2',
        'grade_2',
        'description_2',
        'uri_2'
    ]

    required_cols = [col for col in df.columns if col not in allowed_empty]

    df_complete = df.dropna(subset=required_cols).copy()

    df_complete.loc[:, 'Average Net Price_1'] = df_complete['Average Net Price (Public)_1'].combine_first(
        df_complete['Average Net Price (Private)_1']
    )

    df_complete.drop(columns=['Average Net Price (Public)_1', 'Average Net Price (Private)_1'], inplace=True)

    df_complete.to_csv(new_filename, index=False)

In [226]:
get_complete_schools(SCHOOL_ALL, SCHOOL_COMPLETE)

<h3>Add Zip Codes To Crime Doc</h3>

In [162]:
def add_zips(csv_file, json_file, output_file):
    
    df = pd.read_csv(csv_file)
    with open(json_file, 'r') as f: #load JSON zips
        zip_records = json.load(f)

    # create dict from JSON values {(city, state): postal_code}
    zip_lookup = {}
    for entry in zip_records:
        city = entry['City'].strip().lower()
        state = entry['State'].strip().lower()
        zip_code = entry['Postal Code']
        key = (city, state)
        if key not in zip_lookup:
            zip_lookup[key] = zip_code  # only take first if multiple

    def get_zip(row):
        city = str(row["City"])
        state = str(row["State"])
        return zip_lookup.get((city, state), None)

    df["Zip Code"] = df.apply(get_zip, axis=1) #axis 1 applies func to each row, axis 0 applies func to each column
    df.to_csv(output_file, index=False)

In [164]:
#add_zips(CITY_CRIME, CITY_ZIPS_JSON, CITY_CRIME_ZIPS)
add_zips(CITY_CRIME_ZIPS, CITY_ZIPS_JSON_UPDATED, FINAL_CITY_CRIME_ZIPS)

In [146]:
# check -- count amount of empty zip codes in file
def count_missing_zips(filename):
    df = pd.read_csv(filename)
    
    # count rows where Zip Code is missing or blank
    missing_count = df["Zip Code"].isna().sum() + (df["Zip Code"].astype(str).str.strip() == "").sum()
    
    print(f"Missing Zip Code entries: {missing_count}")
    return missing_count

In [148]:
count_missing_zips(CITY_CRIME_ZIPS)
count_missing_zips(FINAL_CITY_CRIME_ZIPS) # zip code api added 1,090 new zip codes

Missing Zip Code entries: 6010
Missing Zip Code entries: 4920


4920

In [154]:
# not all zip codes were added
# use zip code api to fill remaining zips

zips_dict = {}

# query Zippopotam.us API and return the first ZIP code if found
# example request http://api.zippopotam.us/us/ny/new%20york

def get_zip_codes_from_api(state: str, city: str):

    try:
        url = f"http://api.zippopotam.us/us/{state.lower()}/{city.lower().replace(' ', '%20')}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            return data["places"][0]["post code"]  # Return the first ZIP code
    except Exception as e:
        print(f"API error for {city}, {state}: {e}")
    return None

def fill_missing_zips(filename: str):
    
    df = pd.read_csv(filename)

    if "Zip Code" not in df.columns: #check for zip code column
        df["Zip Code"] = None

    # iterate through rows where Zip Code is missing
    for i, row in df[df["Zip Code"].isna()].iterrows(): # add city and state to a dict if zip code is missing
        city = str(row["City"]).strip()
        state = str(row["State"]).strip()
        zips_dict[city] = state

    return f"Zip Dictionary complete. {len(zips_dict)} entries."

def add_zips_dict(cities_dict: dict):
    updated_dict = {}

    for city, state in cities_dict.items():
        zip_code = get_zip_codes_from_api(state, city)
        if zip_code:
            updated_dict[city] = {
                'state': state,
                'zip_code': zip_code
            }
            print(f"{city.title()}, {state.upper()} → ZIP: {zip_code or 'N/A'}")
        time.sleep(1) # delay to avoid rate-limiting

    return updated_dict  


In [160]:
print(fill_missing_zips(CITY_CRIME_ZIPS))
test_dict = {'abbeville': 'sc', 'addison': 'al', 'andalusia': 'al', 'anniston': 'al'}
#print(zips_dict)
#city_zip_dict = add_zips_dict(zips_dict)
print(len(city_zip_dict))

Zip Dictionary complete. 5051 entries.
3445


In [116]:
# save new dict to json file

def save_json(cities_dict, filename):
    formatted = [] # convert nested dict to list of dicts
    for city, info in cities_dict.items():
        zip_code = info.get("zip_code")
        state = info.get("state")
        
        if zip_code and state:
            formatted.append({
                "City": city.lower(),
                "Postal Code": zip_code,
                "State": state.lower()
            })
    
    # Save as a JSON file
    with open(filename, "w") as f:
        json.dump(formatted, f, indent=2)

In [118]:
save_json(city_zip_dict, CITY_ZIPS_JSON_UPDATED)

<h3>Add Aggregated Crime Metrics</h3>

In [287]:
def get_sum_crime_rates(filename):
    df = pd.read_csv(filename)
    df.drop(columns=['Unnamed: 13'], inplace=True) #drop unnecessary column
    df.columns = df.columns.str.replace('\n', ' ')
    print(list(df))

    cols_to_sum = [
        'Violent crime', 'Murder and nonnegligent manslaughter', 'Rape', 
        'Robbery', 'Aggravated assault', 'Property crime', 'Burglary', 
        'Larceny- theft', 'Motor vehicle theft', 'Arson1'
    ]

    # sum Crime per City
    df[cols_to_sum] = df[cols_to_sum].apply(pd.to_numeric, errors='coerce')
    df['Crime Sum'] = df[cols_to_sum].sum(axis=1)

    # calculate crime ratio based on sum & population
    df['Population'] = df['Population'].astype(str).str.replace(',', '')
    df['Population'] = pd.to_numeric(df['Population'], errors='coerce')
    df = df[df['Population'] > 0]
    
    df['Crime Ratio'] = df['Crime Sum'] / df['Population']

    df.to_csv(filename, index=False)
    return df



In [289]:
get_sum_crime_rates(FINAL_CITY_CRIME_ZIPS)

['State', 'City', 'Population', 'Violent crime', 'Murder and nonnegligent manslaughter', 'Rape', 'Robbery', 'Aggravated assault', 'Property crime', 'Burglary', 'Larceny- theft', 'Motor vehicle theft', 'Arson1', 'Zip Code']


Unnamed: 0,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson1,Zip Code,Crime Sum,Crime Ratio
0,al,abbeville,2371,6.0,0,0.0,0.0,6.0,27.0,6.0,21.0,0.0,0.0,,66.0,0.027836
1,al,adamsville,4158,17.0,0,1.0,5.0,11.0,201.0,23.0,158.0,20.0,0.0,,436.0,0.104858
2,al,addison,674,3.0,0,1.0,0.0,2.0,14.0,2.0,10.0,2.0,0.0,35540.0,34.0,0.050445
3,al,alabaster,34120,37.0,1,1.0,5.0,30.0,551.0,16.0,518.0,17.0,2.0,,1178.0,0.034525
4,al,albertville,22887,68.0,0,11.0,3.0,54.0,415.0,40.0,315.0,60.0,6.0,,972.0,0.042470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8360,wy,sheridan,19472,19.0,2,2.0,1.0,14.0,220.0,20.0,191.0,9.0,1.0,,479.0,0.024599
8361,wy,thermopolis,2692,1.0,0,0.0,1.0,0.0,17.0,5.0,12.0,0.0,0.0,,36.0,0.013373
8362,wy,torrington,6148,12.0,0,6.0,0.0,6.0,96.0,14.0,72.0,10.0,0.0,,216.0,0.035133
8363,wy,upton,904,1.0,0,0.0,0.0,1.0,14.0,0.0,13.0,1.0,0.0,,30.0,0.033186
