<h1>Data Extraction</h1>

In [None]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
from dotenv import load_dotenv
import os
import json

In [565]:
# file names
# Output files produced (or consumed) by the extraction steps in this notebook.
SCHOOL_FED = "federal_college_data.csv"  # passed to get_data_to_csv
SCHOOL_FORBES = "forbes_rankings.csv"  # passed to get_forbes_data
SCHOOL_RATINGS = "myplan_rankings.csv"  # passed to merge_and_save_data
CITY_ZIPS = "us_cities_zip_county.csv"  # passed to get_city_info
CITY_ZIPS_JSON = "us_cities_zip_county.json"  # JSON copy of CITY_ZIPS made by csv_to_json
CITY_CRIME = "us_cities_crime.csv"  # not referenced in the cells shown here
CITY_POP = "us_cities_pop.csv" # not used in final analysis
# Zippopotam API used in the data_integration code to match zips to cities in the CITY_CRIME file

In [235]:
# Maps lowercase state names (plus the District of Columbia) to their
# two-letter postal abbreviations. Both the key and the value are used to
# build per-state URLs when this mapping is iterated for get_city_info.
STATE_ABBREVS = {
    "alabama": "AL",
    "alaska": "AK",
    "arizona": "AZ",
    "arkansas": "AR",
    "california": "CA",
    "colorado": "CO",
    "connecticut": "CT",
    "delaware": "DE",
    "florida": "FL",
    "georgia": "GA",
    "hawaii": "HI",
    "idaho": "ID",
    "illinois": "IL",
    "indiana": "IN",
    "iowa": "IA",
    "kansas": "KS",
    "kentucky": "KY",
    "louisiana": "LA",
    "maine": "ME",
    "maryland": "MD",
    "massachusetts": "MA",
    "michigan": "MI",
    "minnesota": "MN",
    "mississippi": "MS",
    "missouri": "MO",
    "montana": "MT",
    "nebraska": "NE",
    "nevada": "NV",
    "new hampshire": "NH",
    "new jersey": "NJ",
    "new mexico": "NM",
    "new york": "NY",
    "north carolina": "NC",
    "north dakota": "ND",
    "ohio": "OH",
    "oklahoma": "OK",
    "oregon": "OR",
    "pennsylvania": "PA",
    "rhode island": "RI",
    "south carolina": "SC",
    "south dakota": "SD",
    "tennessee": "TN",
    "texas": "TX",
    "utah": "UT",
    "vermont": "VT",
    "virginia": "VA",
    "washington": "WA",
    "west virginia": "WV",
    "wisconsin": "WI",
    "wyoming": "WY",
    "district of columbia": "DC"
}

<h2>Validate CSV File Name</h2>
All data will be saved to CSV files upon initial extraction. The function below is used when scraping all three sources to verify that the input filename is valid.

In [80]:
def validate_filename(filename: str) -> bool:
    """Check that ``filename`` is an acceptable CSV file name.

    A name is accepted when it ends with ``.csv`` (case-insensitive), contains
    none of the characters ``< > : " / \\ | ? * '``, and has at least one
    non-whitespace character before the extension. Because ``/`` and ``\\``
    are rejected, only bare file names (no directory components) pass.

    Args:
        filename: Candidate file name.

    Returns:
        True if the name is valid; otherwise False, after printing the reason.
    """
    extension = ".csv"

    # check for correct file extension (case-insensitive)
    if not filename.lower().endswith(extension):
        print("Error: Filename must end with .csv")
        return False

    # check for invalid characters
    invalid_characters = r'[<>:"/\\|?*\']'
    if re.search(invalid_characters, filename):
        print("Error: Filename contains invalid characters.")
        return False

    # check for empty or whitespace-only name; slicing the extension off
    # (instead of comparing to the literal ".csv") keeps this check
    # case-insensitive too, so ".CSV" is rejected like ".csv"
    stem = filename[:-len(extension)]
    if not stem.strip():
        print("Error: Filename cannot be empty or just whitespace.")
        return False

    return True

In [145]:
# test file name verification function
# the first three calls print an error; only the last call's return value is displayed by the cell
validate_filename(".csv")  # nothing before the extension
validate_filename("colleges")  # missing .csv extension
validate_filename("colleges?.csv")  # "?" is in the invalid-character set
validate_filename("colleges.CSV")  # extension check is case-insensitive

Error: Filename cannot be empty or just whitespace.
Error: Filename must end with .csv
Error: Filename contains invalid characters.


True

<h2>Forbes Rankings</h2>
<h3>Variables of Interest</h3>
Institution Name, Rank, State, Average Grade, Median Base Salary, Student Population, Campus Setting, School Size, Description, Institution Type, Carnegie Classification, Student to Faculty Ratio, Total Grant Aid, Percent of Students Receive Financial Aid, Percent of Students Receive Grants

In [569]:
def get_forbes_data(filename: str):
    """Download the Forbes top-colleges list and save it to a CSV file.

    Args:
        filename: Name of the CSV file to write; checked with
            ``validate_filename`` before any request is made.

    Returns:
        A status string on success or when the filename is invalid, or
        ``None`` when the request fails or the response does not have the
        expected structure (the reason is printed).
    """
    # JSON API endpoint url -- obtained from browser’s Network tab in Developer Tools
    # original data url - https://www.forbes.com/top-colleges/
    FORBES_URL = "https://www.forbes.com/forbesapi/org/top-colleges/2025/rank/true.json?fields=organizationName,academics,state,financialAid,rank,medianBaseSalary,campusSetting,studentPopulation,squareImage,uri,description,grade,schoolSize&limit=500&start=0"
    headers = {'User-Agent': 'Mozilla/5.0'}

    # validate filename to ensure that the data can be downloaded into a CSV file
    if not validate_filename(filename):
        return f"Invalid filename: {filename}. Please check your filename and try again."

    # call page and show error if unable to make the request
    data = None
    try:
        # timeout keeps the cell from hanging forever on a stalled connection
        response = requests.get(FORBES_URL, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        schools = data['organizationList']['organizationsLists']
    except requests.exceptions.RequestException as e:
        # helps diagnose issues with HTTP request (connection, timeout, bad status)
        print(f"Request error: {e}")
        return None
    except ValueError as e:
        # body was not valid JSON (older requests versions raise a plain ValueError)
        print(f"Invalid JSON in response: {e}")
        return None
    except (KeyError, TypeError) as e:
        # helps diagnose API structure changes
        print(f"Key error: {e}")
        if isinstance(data, dict):
            print("Available keys in response:", data.keys())
        return None

    # loop through schools and collect data
    universities = []
    for school in schools:
        # "or {}" also covers a key that is present but null in the JSON,
        # which .get(key, {}) would return as None
        academics = school.get('academics') or {}
        financial_aid = school.get('financialAid') or {}

        # key order determines the CSV column order
        universities.append({
            'rank': school.get('rank'),
            'name': school.get('organizationName'),
            'state': school.get('state'),
            'grade': school.get('grade'),
            'medianBaseSalary': school.get('medianBaseSalary'),
            'studentPopulation': school.get('studentPopulation'),
            'campusSetting': school.get('campusSetting'),
            'schoolSize': school.get('schoolSize'),
            'description': school.get('description'),
            'uri': school.get('uri'),
            'institutionType': academics.get('type'),
            'carnegieClassification': academics.get('carnegieClassification'),
            'studentFacultyRatio': academics.get('studentFacultyRatio'),
            'totalGrantAid': financial_aid.get('totalGrantAid'),
            'percentOfStudentsFinAid': financial_aid.get('percentOfStudentsFinAid'),
            'percentOfStudentsGrant': financial_aid.get('percentOfStudentsGrant'),
        })

    # create dataframe from university data and extract into a CSV file
    df = pd.DataFrame(universities)
    df.to_csv(filename, index=False)

    return f"{len(universities)} universities added to {filename}."

In [571]:
# download the Forbes list into SCHOOL_FORBES; the returned status string is shown as the cell output
get_forbes_data(SCHOOL_FORBES)

'500 universities added to forbes_rankings.csv.'

<h2>myPlan Data</h2>
<h3>Variables of Interest</h3>
Institution Name, Prestige, Satisfaction, Resources & Facilities, Personal Safety, Teacher Support and Involvement, School Administration, Campus Setting, Aggregate score of all variables (Average Score)

In [573]:
def get_myplan_data(url: str, entries: int):
    """Scrape one myPlan ranking variable across all of its result pages.

    The listing is paged 100 schools at a time through an ``offset`` query
    value appended to ``url``: offset 0 lists schools 1-100, offset 400 lists
    schools 401-500, and so on. ``entries`` is supplied by the caller so the
    number of pages follows the current size of the listing (e.g. 596 entries
    -> offsets 0 through 500).

    Offsets are computed arithmetically rather than from the digit count of
    ``entries``, so exact multiples of 100 do not request an extra empty page
    and counts of 10,000 or more are paged fully.

    Args:
        url: Ranking page URL ending in ``offset=``.
        entries: Total number of schools listed for this variable.

    Returns:
        List of ``(school name, score)`` tuples, with the score kept as the
        scraped string. Rows whose score is not numeric are skipped.
    """
    PAGE_SIZE = 100
    # always request at least the first page, even when entries is 0
    offsets = [str(start) for start in range(0, max(int(entries), 1), PAGE_SIZE)]
    headers = {"User-Agent": "Mozilla/5.0"}  # Prevents blocking

    university_rankings = []

    for offset in offsets:
        page_url = url + offset

        # access url; a failed page is reported and skipped so the remaining
        # pages are still collected (the total printed below shows any shortfall)
        try:
            response = requests.get(page_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Request error for offset {offset}: {e}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # locate part of the HTML structure that contains university data
        td_element = soup.find("td", {"background": "../../images/career_details_panel_bg_long.gif"})
        if not td_element:
            print(f"No data found for offset {offset}")
            continue

        for row in td_element.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 3:
                continue

            name_tag = cells[1].find("a", class_="ratings_list")
            score_tag = cells[2].find("div", align="right")
            if not (name_tag and score_tag):
                continue

            name = name_tag.get_text(strip=True)
            score = score_tag.get_text(strip=True)
            try:  # skip invalid scores
                float(score)
            except ValueError:
                continue
            university_rankings.append((name, score))

    print(f"{len(university_rankings)} total entries were found and documented.")
    return university_rankings


# merge data from all school variables into a csv sheet
# calculate avg score based on all variables

def merge_and_save_data(variables_list:list, filename:str):
    """Merge per-variable rankings into one row per school and save as CSV.

    Args:
        variables_list: One list of ``(school name, score)`` tuples per
            variable, in the same order as the score columns of ``HEADERS``.
        filename: Name of the CSV file to write (overwritten if it exists);
            checked with ``validate_filename`` first.

    Returns:
        An error string when the filename is invalid, otherwise ``None``.
        Schools missing from a variable get an empty cell for it, and the
        average is taken over the scores that are present.
    """
    # define headers at the top of function for increased extensibility
    HEADERS = [
        "School", "Prestige", "Satisfaction", "Resources & Facilities",
        "Safety", "Teacher Support", "School Administration", "Campus Setting", "Average Score"
    ]

    # validate CSV file name
    if not validate_filename(filename):
        return f"Invalid filename: {filename}. Please check your filename and try again."

    num_variables = len(variables_list)
    expected_variables = len(HEADERS) - 2  # all columns except "School" and "Average Score"
    if num_variables != expected_variables:
        print(f"Warning: expected {expected_variables} variables but received {num_variables}; "
              "columns will not line up with the header row.")

    # build {school name: [score per variable]}, None marking a missing value
    merged = {}
    for idx, ranking_list in enumerate(variables_list):
        for name, score in ranking_list:
            merged.setdefault(name, [None] * num_variables)[idx] = float(score)

    # convert to list of tuples: (School Name, score1, ..., scoreN, average)
    merged_list = []
    for name, scores in merged.items():
        present = [score for score in scores if score is not None]
        # round only when there is something to average; round(None) would raise
        avg_score = round(sum(present) / len(present), 2) if present else None
        merged_list.append((name, *scores, avg_score))

    # save as csv (csv.writer writes None as an empty cell)
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(HEADERS)
        writer.writerows(merged_list)

    print(f"Saved to {filename}")


In [575]:
# get myPlan ranking data from unique page per variable

# each tuple: (variable key, ranking URL ending in "offset=", number of entries for that variable)
# the entry count decides how many 100-school pages get_myplan_data requests
variable_sources = [
    ("prestige", "https://www.myplan.com/education/colleges/college_rankings_8.php?sort=1&offset=", 611),
    ("satisfaction", "https://www.myplan.com/education/colleges/college_rankings_1.php?sort=1&offset=", 613),
    ("resources", "https://www.myplan.com/education/colleges/college_rankings_4.php?sort=1&offset=", 611),
    ("safety", "https://www.myplan.com/education/colleges/college_rankings_5.php?sort=1&offset=", 612),
    ("teacher_support", "https://www.myplan.com/education/colleges/college_rankings_6.php?sort=1&offset=", 611),
    ("school_admin", "https://www.myplan.com/education/colleges/college_rankings_7.php?sort=1&offset=", 610),
    ("campus", "https://www.myplan.com/education/colleges/college_rankings_2.php?sort=1&offset=", 613)
]

# variable key -> list of (school name, score) tuples returned by get_myplan_data
rankings = {}

for name, url, entries in variable_sources:
    rankings[name] = get_myplan_data(url, entries)

611 total entries were found and documented.
613 total entries were found and documented.
611 total entries were found and documented.
612 total entries were found and documented.
611 total entries were found and documented.
610 total entries were found and documented.
613 total entries were found and documented.


In [576]:
# merge data and save to csv file
# 611 entries total
# the list order must match the score columns in merge_and_save_data's header row:
# Prestige, Satisfaction, Resources & Facilities, Safety, Teacher Support, School Administration, Campus Setting
merge_and_save_data(
    [rankings["prestige"], rankings["satisfaction"], rankings["resources"],
     rankings["safety"], rankings["teacher_support"], rankings["school_admin"],
     rankings["campus"]], 
    SCHOOL_RATINGS
)

Saved to myplan_rankings.csv


<h2>Department of Education API</h2>

<h3>Variables Of Interest</h3>
Name - name - INSTNM (TEXT) <br>
State - school.state (TEXT) <br>
Admission rate - admission_rate.overall - ADM_RATE (FLOAT) <br>  
SAT Scores - admissions.sat_scores.average.overall <br>
Enrollment of all undergraduate students - enrollment.all - UG (INT)  <br> 
Average net price for Title IV institutions (public institutions) - avg_net_price.public - NPT4_PUB (INT) <br>  
Average net price for Title IV institutions (private for-profit and nonprofit institutions) - avg_net_price.private - NPT4_PRIV (INT)<br> 
Average cost of attendance (academic year institutions) - attendance.academic_year - COSTT4_A (INT)<br> 
The median debt for students who have completed - median_debt.completers.overall - GRAD_DEBT_MDN (FLOAT)<br> 
Median earnings of students working and not enrolled 8 years after entry - 8_yrs_after_entry.median_earnings - MD_EARN_WNE_P8 (FLOAT) <br>  
Mean earnings of students working and not enrolled 8 years after entry - 8_yrs_after_entry.mean_earnings - MN_EARN_WNE_P8 (FLOAT)  <br> 

In [None]:
# example request
# https://api.data.gov/ed/collegescorecard/v1/schools?api_key=YOUR_API_KEY&fields=id,school.name,latest.cost.tuition.in_state,latest.completion.rate

# to request API key, visit: https://collegescorecard.ed.gov/data/api-documentation
# API default rate limit - 1,000 requests per IP address per hour
load_dotenv()  # Load from .env
# the key is read from the API_KEY environment variable; os.getenv returns None when it is not set
secret_key = os.getenv("API_KEY")
API_KEY = secret_key
BASE_URL = "https://api.data.gov/ed/collegescorecard/v1/schools"

# variables of interest [add here to include additional variables in research]
# these names are sent as the "fields" request parameter and used to look up each value in the results
FIELDS = [
    "school.zip",
    "school.name",
    "school.city",
    "school.state",
    "latest.school.locale",
    "latest.school.faculty_salary",
    "latest.admissions.sat_scores.average.overall",
    "latest.admissions.admission_rate.overall",
    "latest.completion.title_iv.completed_by.4yrs",
    "latest.cost.avg_net_price.public",
    "latest.cost.avg_net_price.private",
    "latest.cost.attendance.academic_year",
    "latest.aid.median_debt.completers.overall",
    "latest.earnings.8_yrs_after_entry.median_earnings",
    "latest.earnings.8_yrs_after_entry.mean_earnings"
]

# human-readable CSV column names; must stay in the same order and length as FIELDS,
# because each data row is built in FIELDS order
HEADERS = [
    "Zip Code",
    "School Name",
    "City",
    "State",
    "Locale",
    "Average Faculty Salary",
    "Average SAT Score",
    "Admission Rate",
    "4-Year Completion Rate",
    "Average Net Price (Public)",
    "Average Net Price (Private)",
    "Cost of Attendance (Academic Year)",
    "Median Debt of Completers",
    "Median Earnings (8 Years After Entry)",
    "Mean Earnings (8 Years After Entry)"
]

def get_data_to_csv(start_page, end_page, filename):
    """Fetch a range of College Scorecard result pages and append them to a CSV.

    Requests pages ``start_page`` up to but not including ``end_page``
    (100 schools per page) and appends one row per school to ``filename``.
    The header row is written only when the file does not exist yet, so the
    function can be called repeatedly for consecutive page ranges.

    Args:
        start_page: First page number to request.
        end_page: Page number to stop before (exclusive).
        filename: CSV file to append to; checked with ``validate_filename``.

    Returns:
        An error string when the filename is invalid, otherwise ``None``.
        If a request fails, fetching stops and the rows collected so far are
        still saved.
    """
    # validate CSV file name
    if not validate_filename(filename):
        return f"Invalid filename: {filename}. Please check your file name and try again."

    def safe_get(school, field):
        # missing or null values are written as "N/A"
        value = school.get(field)
        return value if value is not None else "N/A"

    all_data = []

    for page in range(start_page, end_page):
        params = {
            "api_key": API_KEY,
            "fields": ",".join(FIELDS),
            "per_page": 100,
            "page": page
        }

        try:
            # timeout keeps a stalled connection from hanging the whole batch
            response = requests.get(BASE_URL, params=params, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            break

        results = response.json().get("results", [])
        all_data.extend([safe_get(school, field) for field in FIELDS] for school in results)

    # after fetching all data, save to CSV with human readable headers
    # there are thousands of institutions in this API, so we will extract data in portions to check for error
    # use append mode to add data at the end of the CSV file to prevent overwriting previous entries
    file_exists = os.path.isfile(filename) # use os to prevent rewriting headers
    with open(filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        if not file_exists:
            writer.writerow(HEADERS)
        writer.writerows(all_data)

    print(f"Data from pages {start_page} to {end_page} saved to '{filename}' - {len(all_data)} records")

In [581]:
# run in batches to check for errors and prevent overloading API
# 6384 school entries total
# NOTE(review): the recorded output of this cell adds up to 6,429 records - confirm the count above

# each tuple is (start_page, end_page); end_page is exclusive and every page holds up to 100 schools
intervals = [(0,10), (10,20), (20,30), (30,40), (40,50), (50,60), (60,65)]

for i in intervals:
    get_data_to_csv(i[0], i[1], SCHOOL_FED)

Data from pages 0 to 10 saved to 'federal_college_data.csv - 1000 records
Data from pages 10 to 20 saved to 'federal_college_data.csv - 1000 records
Data from pages 20 to 30 saved to 'federal_college_data.csv - 1000 records
Data from pages 30 to 40 saved to 'federal_college_data.csv - 1000 records
Data from pages 40 to 50 saved to 'federal_college_data.csv - 1000 records
Data from pages 50 to 60 saved to 'federal_college_data.csv - 1000 records
Data from pages 60 to 65 saved to 'federal_college_data.csv - 429 records


<h2>US City Population Data</h2>
US Cities by State and their population

In [122]:
def get_city_info(state, abbrev, filename):
    """Scrape one state's postal-code table from geonames.org and append it to a CSV.

    Args:
        state: State name as used in the geonames URL path.
        abbrev: Two-letter state abbreviation used in the URL path.
        filename: CSV file to append to; checked with ``validate_filename``.
            The header row is written only when the file does not exist yet.

    Returns:
        A status string describing success or the reason nothing was written.
        Status messages name the ``state`` argument.
    """
    # validate CSV file name
    if not validate_filename(filename):
        return f"Invalid filename: {filename}. Please check your file name and try again."

    FIELDNAMES = ["City", "Postal Code", "State", "County"]
    url = f"https://www.geonames.org/postal-codes/US/{abbrev}/{state}.html"

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Unable to add {state} data to {filename}. Error: {e}"

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find("table", {"class": "restable"})
    if table is None:
        return f"No data table found for {state}."

    data = []
    for row in table.find_all("tr")[1:]:  # Skip header row
        cells = row.find_all("td")
        # cells[5] is read below, so a row needs at least six cells
        if len(cells) >= 6:
            data.append({
                "City": cells[1].get_text(strip=True),
                "Postal Code": cells[2].get_text(strip=True),
                # kept separate from the `state` argument so messages report the requested state
                "State": cells[4].get_text(strip=True),
                "County": cells[5].get_text(strip=True),
            })

    if not data:
        return f"No data rows found for {state}."

    try:
        file_exists = os.path.isfile(filename)

        with open(filename, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)

            if not file_exists:
                writer.writeheader()

            writer.writerows(data)
        return f"Data from {state} has been added to {filename}!"
    except OSError as e:
        return f"Unable to add {state} data to {filename}. Error: {e}"

In [551]:
# get data as csv
# every state's rows are appended to the same CITY_ZIPS file; the returned status strings are not displayed
# NOTE(review): multi-word names such as "new hampshire" are put into the URL as-is - confirm geonames resolves them
for state, abbrev in STATE_ABBREVS.items():
    get_city_info(state, abbrev, CITY_ZIPS)

In [556]:
# convert CITY_ZIPS file to JSON for quicker access when merging data sets
def csv_to_json(csv_filepath, json_filepath):
    """Convert a CSV file with a header row into a JSON array of row objects.

    Args:
        csv_filepath: Path of the CSV file to read. It is read as UTF-8, the
            encoding used when the CSV files in this notebook are written.
        json_filepath: Path of the JSON file to write (overwritten if it exists).

    Each CSV row becomes one object keyed by the header names, with every
    value kept as a string.
    """
    # newline='' lets the csv module handle line endings and quoted newlines itself
    with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
        data = list(csv.DictReader(csvfile))

    with open(json_filepath, 'w', encoding='utf-8') as jsonfile:
        json.dump(data, jsonfile, indent=4)

In [None]:
# write a JSON copy of the scraped zip/county CSV
csv_to_json(CITY_ZIPS, CITY_ZIPS_JSON)

In [110]:
# This source was obtained but not used, because my crime statistics data source also includes population data.
# It was redundant for my research, so I excluded it from my analysis.

def get_city_pop(state, filename):
    """Scrape one state's city population table from city-data.com and append it to a CSV.

    Args:
        state: State name as used in the city-data.com URL path.
        filename: CSV file to append to; checked with ``validate_filename``.
            The header row is written only when the file does not exist yet.

    Returns:
        A status string describing success or the reason nothing was written.
    """
    # validate CSV file name
    if not validate_filename(filename):
        return f"Invalid filename: {filename}. Please check your file name and try again."

    # make custom state URL and get data
    url = f"https://www.city-data.com/city/{state}.html"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Unable to add {state} data to {filename}. Error: {e}"

    # access table from custom url
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", id="cityTAB")

    if not table:
        return f"No data table found for {state}."

    data = []
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) >= 3:
            city_name = cells[1].get_text(strip=True)
            data.append({
                "State": state,
                "City": city_name.split(",")[0],  # keep only the text before the first comma
                "Population": cells[2].get_text(strip=True).replace(',', ''),  # Remove commas
            })

    # appending an empty frame would create the file without column names,
    # and later calls would then skip the header because the file exists
    if not data:
        return f"No city rows found for {state}."

    try:
        df = pd.DataFrame(data)
        file_exists = os.path.isfile(filename)
        df.to_csv(filename, mode='a', header=not file_exists, index=False)
        return f"{state} data has been added to {filename}!"
    except Exception as e:
        return f"Unable to add {state} data to {filename}. Error: {e}"

In [112]:
#US states based on url naming convention for this source
# multi-word names are hyphenated because each name is placed directly into the city-data.com URL
us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", 
    "District-of-Columbia", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", 
    "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", 
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", 
    "New-Hampshire", "New-Jersey", "New-Mexico", "New-York", "North-Carolina", 
    "North-Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode-Island", 
    "South-Carolina", "South-Dakota", "Tennessee", "Texas", "Utah", "Vermont", 
    "Virginia", "Washington", "West-Virginia", "Wisconsin", "Wyoming"
]

# every state's rows are appended to the same CITY_POP file
for state in us_states:
    get_city_pop(state, CITY_POP)

Alabama data has been added to us_cities_pop.csv!
Alaska data has been added to us_cities_pop.csv!
Arizona data has been added to us_cities_pop.csv!
Arkansas data has been added to us_cities_pop.csv!
California data has been added to us_cities_pop.csv!
Colorado data has been added to us_cities_pop.csv!
Connecticut data has been added to us_cities_pop.csv!
District-of-Columbia data has been added to us_cities_pop.csv!
Delaware data has been added to us_cities_pop.csv!
Florida data has been added to us_cities_pop.csv!
Georgia data has been added to us_cities_pop.csv!
Hawaii data has been added to us_cities_pop.csv!
Idaho data has been added to us_cities_pop.csv!
Illinois data has been added to us_cities_pop.csv!
Indiana data has been added to us_cities_pop.csv!
Iowa data has been added to us_cities_pop.csv!
Kansas data has been added to us_cities_pop.csv!
Kentucky data has been added to us_cities_pop.csv!
Louisiana data has been added to us_cities_pop.csv!
Maine data has been added to us