<h1>Data Cleaning</h1>
Clean data and prepare them for merging. This includes normalizing numbers, lowercasing school names, removing any footnotes, and finding similar school names by fuzzy keyword matching to ensure that all names are standardized across all files

In [2]:
import pandas as pd
from collections import defaultdict
import csv
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [3]:
# file names
SCHOOL_FED = "federal_college_data.csv"
SCHOOL_FORBES = "forbes_rankings.csv"
SCHOOL_RATINGS = "myplan_rankings.csv"
CITY_ZIPS = "us_cities_zip_county.csv"
CITY_ZIPS_JSON = "us_cities_zip_county.json"
CITY_CRIME = "us_cities_crime.csv"
UNI_CRIME = "us_universities_crime.csv"
SCHOOLS_MISSING_MATCHES = "schools_missing_matches.csv"

In [4]:
STATE_ABBREVS = {
    "alabama": "AL",
    "alaska": "AK",
    "arizona": "AZ",
    "arkansas": "AR",
    "california": "CA",
    "colorado": "CO",
    "connecticut": "CT",
    "delaware": "DE",
    "florida": "FL",
    "georgia": "GA",
    "hawaii": "HI",
    "idaho": "ID",
    "illinois": "IL",
    "indiana": "IN",
    "iowa": "IA",
    "kansas": "KS",
    "kentucky": "KY",
    "louisiana": "LA",
    "maine": "ME",
    "maryland": "MD",
    "massachusetts": "MA",
    "michigan": "MI",
    "minnesota": "MN",
    "mississippi": "MS",
    "missouri": "MO",
    "montana": "MT",
    "nebraska": "NE",
    "nevada": "NV",
    "new hampshire": "NH",
    "new jersey": "NJ",
    "new mexico": "NM",
    "new york": "NY",
    "north carolina": "NC",
    "north dakota": "ND",
    "ohio": "OH",
    "oklahoma": "OK",
    "oregon": "OR",
    "pennsylvania": "PA",
    "rhode island": "RI",
    "south carolina": "SC",
    "south dakota": "SD",
    "tennessee": "TN",
    "texas": "TX",
    "utah": "UT",
    "vermont": "VT",
    "virginia": "VA",
    "washington": "WA",
    "west virginia": "WV",
    "wisconsin": "WI",
    "wyoming": "WY",
    "district of columbia": "DC"
}

<h4>Lowercase and Strip Key Variables</h4>
School Names, City, Full State Names (not abbreviations)

In [8]:
# lowercase multiple columns to begin the normalization process
def lower_columns(file_name, *column_names):
    file_path = f"../data/{file_name}"
    try:
        df = pd.read_csv(file_path)
        for col in column_names:
            if col in df.columns:
                df[col] = df[col].str.lower().str.strip() # lowercase, remove trailing spaces
            else:
                print(f"Column '{col}' not found in file - data was not lowercased.")
        df.to_csv(file_path, index=False)
        return df[list(column_names)]
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
#lower case school names and city of each file
lower_columns(SCHOOL_FED, "Name", "City")
lower_columns(CITY_CRIME, "State", "City")
lower_columns(UNI_CRIME, "State","School","Campus")
lower_columns(SCHOOL_RATINGS, "school name")
lower_columns(SCHOOL_FORBES, "name")

Unnamed: 0,name
0,princeton university
1,stanford university
2,massachusetts institute of technology
3,yale university
4,"university of california, berkeley"
...,...
495,molloy college
496,"university of colorado, denver"
497,saint mary's university of minnesota
498,thomas jefferson university


<h3>State Name Cleaning</h3>

In [12]:
# convert full state names to abbreviations
def get_state_abbrev(file_name, column_name):
    try:
        file_path = f"../data/{file_name}"
        df = pd.read_csv(file_path)
        
        if column_name in df.columns:
            df[column_name] = df[column_name].replace(STATE_ABBREVS)
            df.to_csv(file_path, index=False)
            return df[column_name]
        else:
            print(f"Column '{column_name}' not found in file - data not replaced.")
            return None
    except KeyError:
        print(f"Error: Column '{column_name}' not found.")
        return None
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except Exception as e:
       print(f"An error occurred: {e}")
       return None

In [13]:
get_state_abbrev(CITY_CRIME, "State")
get_state_abbrev(UNI_CRIME, "State")

0                                                     AL
1                                                     AL
2                                                     AL
3                                                     AL
4                                                     AL
                             ...                        
615                                                   WI
616                                                   WY
617    1 the student enrollment figures provided by t...
618     2 student enrollment figures were not available.
619    note:  caution should be exercised in making a...
Name: State, Length: 620, dtype: object

<h3>Zip Code Cleaning</h3>

In [14]:
# strip 9 digit zip codes to 5 digits
# zip code formats = #####, #####-####, #########
def strip_zip(file_name, column_name):
    try:
        file_path = f"../data/{file_name}"
        df = pd.read_csv(file_path)
        
        if column_name in df.columns:
            df[column_name] = df[column_name].astype(str)
            df[column_name] = df[column_name].str.split("-").str[0] # ZIP format (#####-####)
            df[column_name] = df[column_name].str[:5] # ZIP format (#########)
            df.to_csv(file_path, index=False)
            return df[column_name]
        else:
            raise ValueError(f"Column '{column_name}' not found in CSV.")
    except Exception as e:
        print(f"Error: {e}")
        return None

In [15]:
strip_zip(SCHOOL_FED, "Zip Code")

0       35762
1       35294
2       36117
3       35899
4       36104
        ...  
6424    02840
6425    44333
6426    85326
6427    85027
6428    08055
Name: Zip Code, Length: 6429, dtype: object

<h3>School Name Cleaning</h3>

In [None]:
# rename school name col in all files

# Sample DataFrame
df = pd.DataFrame({'old_col_name': [1, 2, 3], 'B': [4, 5, 6]})

df.rename(columns={'old_col_name': 'new_col_name'}, inplace=True) 

print(df)

In [5]:
# schools in us territories will not be included in analysis
# remove by State FIPS code 
# American Samoa is 60, Guam is 66, Northern Mariana Islands is 69, Puerto Rico is 72, and the U.S. Virgin Islands is 78

def remove_us_terr_schools(filename):
    filepath = f"../data/{filename}"
    df = pd.read_csv(filepath)
    df = df[~df["State FIPS"].isin([60, 66, 69, 72, 78])]
    df.to_csv(filepath, index=False)

remove_us_terr_schools(SCHOOL_FED)

In [10]:
# merge school and campus in UNI_CRIME data

def merge_cols(filename):
    filepath = f"../data/{filename}"
    df = pd.read_csv(filepath)

    # create 'school name', handle missing vals
    df["school name"] = df["School"].fillna('') + ' ' + df["Campus"].fillna('')
    df["school name"] = df["school name"].str.strip()

    df = df.drop(["School", "Campus"], axis=1) # drop old cols

    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index("school name")))
    df = df[cols]

    df.to_csv(filepath, index=False)

merge_cols(UNI_CRIME)

In [None]:
# clean school names (remove and normalize special chars - commas, dashes, parenthesis)
def strip_name(file_name, column_name):
    try:
        file_path = f"../data/{file_name}"
        df = pd.read_csv(file_path)
        
        if column_name in df.columns:
            df[column_name] = df[column_name].astype(str)
            df[column_name] = df[column_name].str.replace("-", " ", regex=False)
            df[column_name] = df[column_name].str.replace(",", "", regex=False)
            df[column_name] = df[column_name].str.replace(":", "", regex=False)
            df[column_name] = df[column_name].str.replace(".", "", regex=False)
            df[column_name] = df[column_name].str.replace("'", "", regex=False)
            df[column_name] = df[column_name].str.replace(" & ", "&", regex=False)
            df[column_name] = df[column_name].str.replace(" at ", " ", regex=False)
            df[column_name] = df[column_name].str.replace("the ", "", regex=False)
            df[column_name] = df[column_name].str.replace(" in ", " ", regex=False)
            df[column_name] = df[column_name].str.replace("2", "", regex=False) # remove subscript
            df[column_name] = df[column_name].str.replace(" ", " ", regex=False)
            df[column_name] = df[column_name].astype(str).str.split("(").str[0]
            df[column_name] = df[column_name].astype(str).str.strip()
            
            df.rename(columns={column_name: 'school'}, inplace=True) # rename col as "school" so all file cols match
            df.to_csv(file_path, index=False)

            return f"School Names have been cleaned for {file_name}"
        else:
            print(f"Column '{column_name}' not found for {file_name}.")
            return None
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except Exception as e:
       print(f"An error occurred: {e}")
       return None

In [None]:
strip_name(SCHOOL_FORBES, "name")
strip_name(SCHOOL_FED, "Name")
strip_name(SCHOOL_RATINGS, "school name")
strip_name(UNI_CRIME, "school name")

In [None]:
# to run after initial strip_name function run, AFTER renaming school col
strip_name(SCHOOL_FORBES, "school")
strip_name(SCHOOL_FED, "school")
strip_name(SCHOOL_RATINGS, "school")
strip_name(UNI_CRIME, "school")

'School Names have been cleaned for us_universities_crime.csv'

<h3>Match School Names</h3>

In [None]:
# audit check to make sure that all school names across three documents are normalized
def check_school_name(school: str, file_list):
    school = school.lower().strip()
    matches = []

    for file in file_list:
        df = pd.read_csv(f"../data/{file}")
        df.columns = df.columns.str.lower().str.strip()

        if "school" not in df.columns:
            print(f"Column 'school' not found in {file}")
            continue

        # standardize column values for matching
        df["school"] = df["school"].astype(str).str.lower().str.strip()

        matched_rows = df[df["school"] == school]

        if not matched_rows.empty:
            matches.append((file, matched_rows["school"].iloc[0]))

    return f"{len(matches)} match for {school} - {matches}" if matches else f"{school} not found in any file."

In [21]:
school_test = ("fitchburg state college" , "university of southern california", 
               "university of minnesota twin cities", "empire beauty school laconia",
               "university of california santa cruz", "franklin w olin college of engineering",
               "columbia university in the city of new york", "columbia university",
               "cuny brooklyn college", "cuny baruch college", "cuny bernard m baruch college"
              )

for test in school_test:
    print(check_school_name(test, [SCHOOL_FED, SCHOOL_FORBES, SCHOOL_RATINGS, UNI_CRIME]))

# based on this test, it was found that the myPlan and federal dataset are aligned in school names
# however, the school names in the forbes dataset does not align to other files

1 match for fitchburg state college - [('myplan_rankings.csv', 'fitchburg state college')]
3 match for university of southern california - [('federal_college_data.csv', 'university of southern california'), ('forbes_rankings.csv', 'university of southern california'), ('myplan_rankings.csv', 'university of southern california')]
4 match for university of minnesota twin cities - [('federal_college_data.csv', 'university of minnesota twin cities'), ('forbes_rankings.csv', 'university of minnesota twin cities'), ('myplan_rankings.csv', 'university of minnesota twin cities'), ('us_universities_crime.csv', 'university of minnesota twin cities')]
1 match for empire beauty school laconia - [('federal_college_data.csv', 'empire beauty school laconia')]
4 match for university of california santa cruz - [('federal_college_data.csv', 'university of california santa cruz'), ('forbes_rankings.csv', 'university of california santa cruz'), ('myplan_rankings.csv', 'university of california santa cruz'

In [None]:
# find schools where matches are not found between different files
# ex - a school is only found in the forbes data, but not in myPlan or federal data
# this will help us look at what is causing school name mismatches


def find_unmatched_schools(file_list):
    school_sources = defaultdict(set)  # maps school name -> set of files it appears in

    # gather normalized school names and file presence
    for file in file_list:
        df = pd.read_csv(f"../data/{file}")
        df.columns = df.columns.str.lower().str.strip()

        if "school" not in df.columns:
            print(f"Column 'school' not found in {file}")
            continue

        # standardize column values for matching
        df["school"] = df["school"].astype(str).str.lower().str.strip()

        for school in df["school"].unique():
            school_sources[school].add(file)

    # find schools not present in other files
    all_files = set(file_list)
    unmatched_schools = []
    unmatched_school_names = []

    for school, files_present in school_sources.items():
        if files_present != all_files:
            unmatched_schools.append((school, list(files_present)))
            unmatched_school_names.append((school))
    print(len(unmatched_school_names))
    return unmatched_school_names

In [41]:
all_school_files = [SCHOOL_FED, SCHOOL_FORBES, SCHOOL_RATINGS, UNI_CRIME]
list_unmatched_schools = find_unmatched_schools(all_school_files)
print(list_unmatched_schools)

6295
['alabama a&m university', 'amridge university', 'alabama state university', 'university of alabama', 'central alabama community college', 'athens state university', 'auburn university montgomery', 'auburn university', 'birmingham southern college', 'chattahoochee valley community college', 'south university montgomery', 'enterprise state community college', 'coastal alabama community college', 'faulkner university', 'gadsden state community college', 'new beginning college of cosmetology', 'george c wallace community college dothan', 'george c wallace state community college hanceville', 'george c wallace state community college selma', 'herzing university birmingham', 'huntingdon college', 'heritage christian university', 'j f drake state community and technical college', 'j f ingram state technical college', 'jacksonville state university', 'jefferson state community college', 'john c calhoun state community college', 'lawson state community college', 'university of west alabam

In [31]:
#download unmatched_schools to a csv to see all school names
def get_schools_missing_matches_csv(filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Schools"])
        for i in find_unmatched_schools(all_school_files):
            writer.writerow([i])
    return f"Schools added to {filename}!"

In [42]:
get_schools_missing_matches_csv(f"../data/{SCHOOLS_MISSING_MATCHES}")

6295


'Schools added to ../data/schools_missing_matches.csv!'

In [None]:
# use fuzzywuzzy to find school matches with similar names based on keywords

def match_by_keyword(filename, similarity):
    df = pd.read_csv(f"../data/{filename}") 
    df["Schools"] = df["Schools"].astype(str).str.strip()

    matches_dict = {}
    visited = set()

    school_names = list(df["Schools"].unique())
    
    for name in school_names:
        if name in visited:
            continue
    
        # RapidFuzz uses `extract` instead of `extractBests`
        similar = process.extract(
            name,
            school_names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=similarity
        )

        group = [match_name for match_name, score, _ in similar if match_name != name]
        
        if group:
            matches_dict[name] = group
            visited.update(group)
            visited.add(name)

    return matches_dict


In [None]:
fuzzymatch_98 = match_by_keyword(SCHOOLS_MISSING_MATCHES, 98)
print(fuzzymatch_98)

{'california polytechnic state university san luis obispo': ['california state polytechnic university san luis obispo'], 'lyles college of beauty': ['lyles college of  beauty'], 'clayton  state university': ['clayton state university'], 'minnesota state university mankato': ['minnesota state unversity mankato'], 'college of staten island cuny': ['cuny college of staten island'], 'suny purchase college': ['suny college purchase'], 'burlington county institute of technology adult education': ['burlington county institute of technology   adult education'], 'texas barber college   branch campus #1': ['texas barber college   branch campus #'], 'texas barber college   branch campus #5': ['texas barber college   branch campus #']}


In [65]:
# normalize Special Cases in all four files based off of mapping dictionary
# "cuny baruch college", "cuny bernard m baruch college"
# "columbia university in the city of new york", "columbia university"

def match_schools(file_list, mapping_dict: dict) -> None:
    for file in file_list:
        file_path = f"../data/{file}"
        df = pd.read_csv(file_path)
        df["school"] = df["school"].replace(mapping_dict)
        df.to_csv(file_path, index=False)

In [78]:
match_schools(all_school_files, fuzzymatch_98)

# check unmatched schools again using the code above to confirm that the files have been updated 
# repeat fuzzy keyword matching to find if to find if any schools were missed the first time around

In [67]:
get_schools_missing_matches_csv(f"../data/{SCHOOLS_MISSING_MATCHES}")

6286


'Schools added to ../data/schools_missing_matches.csv!'

In [88]:
# review similar school groups and create a mapping list
# convert list of school mismatches that refer to the same school, based on the data above
# then replace school name to one version on all necessary files -- this will standardize school names
school_matches_map = {
    "columbia university in the city of new york": "columbia university",
    "university of north carolina at chapel hill": "university of north carolina chapel hill",
    "university of virginia main campus": "university of virginia",
    "georgia institute of technology main campus": "georgia institute of technology",
    "university of illinois at urbana champaign": "university of illinois urbana champaign",
    "university of washington seattle campus": "university of washington seattle",
    "university of texas at austin": "university of texas austin",
    "college of william and mary": "william&mary",
    "barnard college": "barnard college",
    "california polytechnic state univ san luis obispo": "california polytechnic state university san luis obispo",
    "purdue university main campus": "purdue university",
    "north carolina state university at raleigh": "north carolina state university raleigh",
    "texas a&m university": "texas a&m university college station",
    "cuny bernard m baruch college": "cuny baruch college",
    "suny at binghamton": "binghamton university suny",
    "cooper union": "cooper union",
    "rutgers university new brunswick": "rutgers university",
    "virginia polytechnic institute and state univ": "virginia tech",
    "suny at stony brook": "stony brook university suny",
    "ohio state university main campus": "ohio state university",
    "university of colorado at boulder": "university of colorado boulder",
    "arizona state university main campus": "arizona state university tempe",
    "university of oklahoma norman campus": "university of oklahoma norman",
    "university of houston university park": "university of houston",
    "auburn university main campus": "auburn university",
    "city college": "cuny the city college of new york",
    "the college of new jersey": "college of new jersey",
    "university of texas at dallas": "university of texas dallas",
    "suny at buffalo": "university at buffalo",
    "louisiana state univ and ag and mech college": "louisiana state university",
    "university of tennessee": "university of tennessee knoxville",
    "university of pittsburgh main campus": "university of pittsburgh",
    "tulane university of louisiana": "tulane university",
    "university of kansas main campus": "university of kansas",
    "suny college at geneseo": "suny geneseo",
    "university of arkansas main campus": "university of arkansas",
    "oklahoma state university main campus": "oklahoma state university",
    "university of hawaii at manoa": "university of hawaii manoa",
    "manhattan college": "manhattan university",
    "bentley college": "bentley university",
    "miami university oxford": "miami university",
    "pennsylvania state university penn state main campus": "pennsylvania state university main campus",
    "university of south carolina columbia": "university of south carolina",
    "university of new hampshire main campus": "university of new hampshire",
    "university of cincinnati main campus": "university of cincinnati",
    "university of north dakota main campus": "university of north dakota",
    "university of vermont and state agricultural coll": "university of vermont",
    "texas state university san marcos": "texas state university",
    "university of st thomas": "university of saint thomas",
    "university of tulsa": "university of tulsa",
    "university of missouri rolla": "missouri university of science and technology",
    "suny maritime college": "suny maritime college",
    "university of california merced": "university of california merced",
    "colorado state university": "colorado state university fort collins",
    "suny farmingdale state college": "suny farmingdale state college",
    "california state university maritime academy": "california state university maritime academy",
    "university of puerto rico rio piedras campus": "university of puerto rico rio piedras",
    "pace university new york": "pace university",
    "california state university east bay": "california state university east bay",
    "university of mississippi main campus": "university of mississippi",
    "university of massachusetts lowell": "university of massachusetts lowell",
    "university of the pacific": "university of the pacific",
    "fashion institute of technology": "fashion institute of technology",
    "mary washington college": "university of mary washington",
    "university of new mexico": "university of new mexico",
    "university of puerto rico mayaguez": "university of puerto rico mayaguez",
    "university of florida online": "university of florida online",
    "suny college of environmental science and forestry": "suny college of environmental science and forestry",
    "university of wisconsin river falls": "university of wisconsin river falls",
    "university of maryland university college": "university of maryland global campus",
    "university of colorado denver": "university of colorado denver",
    "university of southern mississippi": "university of southern mississippi",
    "university of toledo": "university of toledo",
    "university of michigan flint": "university of michigan flint",
    "university of tampa": "university of tampa",
    "university of montana missoula": "university of montana missoula",
    "university of wisconsin milwaukee": "university of wisconsin milwaukee",
    "simmons university": "simmons",
    "siena college": "siena",
    "denison university": "denison",
    "drew university": "drew",
    "spelman college": "spelman",
    "howard university": "howard",
    "suny broome community college": "suny binghamton",
    "stony brook university": "cuny stony brook",
    "virginia polytechnic institute and state univ": "virginia polytechnic institute and state university",
    "hobart and william smith colleges": "hobart william smith colleges",
    "suny farmingdale state college": "farmingdale state college",
    "concordia college—moorhead": "concordia college moorhead",
    "st edwards university": "saint edwards university",
    "university of missouri saint louis": "university of missouri st louis",
    "jefferson state community college": "jefferson community college",
    "iowa western community college": "western iowa tech community college",
    "strayer university   northwest houston campus": "strayer university northwest houston",
    "texas barber college": "texas barber college   branch campus #",
    "texas barber college": "south texas barber college inc",
    'university of arkansas for medical sciences': 'university of arkansas medical sciences',
    'central college': 'centra college',
    'cloyds beauty school 1 inc': 'cloyds beauty school 3 inc',
    'american beauty school': 'american beauty schools',
    'community college of allegheny county': 'allegheny county community college',
    'academy for salon professionals': 'academy of salon professionals',
    'shepherds college': 'shepherd college', 
    'academy of professional cosmetology': 'professional cosmetology academy',
    'tricoci university of beauty culture chicago nw': 'tricoci university of beauty culture chicago ne'
}

In [89]:
match_schools(all_school_files, school_matches_map)
get_schools_missing_matches_csv(f"../data/{SCHOOLS_MISSING_MATCHES}")

6204


'Schools added to ../data/schools_missing_matches.csv!'

In [90]:
fuzzymatch_96 = match_by_keyword(SCHOOLS_MISSING_MATCHES, 96)
print(fuzzymatch_96)

{'lyon college': ['elyon college'], 'northwest technical institute': ['northeast technical institute'], 'bryan university': ['bryant university'], 'laney college': ['lane college'], 'southwestern illinois college': ['southeastern illinois college'], 'northeast iowa community college': ['northwest iowa community college'], 'southeastern community college': ['southwestern community college'], 'northeast mississippi community college': ['northwest mississippi community college'], 'southeast missouri state university': ['southwest missouri state university'], 'martin community college': ['marin community college'], 'southeastern baptist theological seminary': ['southwestern baptist theological seminary'], 'northwest state community college': ['northeast state community college'], 'southeastern oklahoma state university': ['southwestern oklahoma state university'], 'mount marty university': ['mount mary university'], 'southeast technical college': ['southwest technical college'], 'king univ