#### Prototype FuzzyPandas Functions using U.S. News and IPEDS Data

In [1]:
import time
import numpy as np
import pandas as pd
import difflib

**First, let's read in the school ranking data scraped from usnews.com & do a little pre-processing**

In [2]:
def read_us_news(pickle):
    df = pd.read_pickle(pickle)   
    return pd.concat([us_news, df], axis=0, ignore_index=True)

us_news = pd.DataFrame() #initialize empty data frame
us_news = read_us_news("example_data/us_news/usnews-ranking-national-universities.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-national-liberal-arts-colleges.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-midwest.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-north.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-south.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-west.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-midwest.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-north.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-south.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-west.pickle")

us_news["city"] = us_news.location.apply(lambda x: x.split(", ")[0].upper().strip()) #parse out city, to upper case
us_news["state"] = us_news.location.apply(lambda x: x.split(", ")[1].upper().strip()) #parse out state, to upper case
us_news["school"] = us_news.school.apply(lambda x: x.upper().strip()) #school name to upper case

us_news.drop("location", axis=1, inplace=True) #drop original location (now that we've split out into city and state)
us_news.head(5)

Unnamed: 0,category,school,score,city,state
0,National Universities,PRINCETON UNIVERSITY,100 out of 100.,PRINCETON,NJ
1,National Universities,HARVARD UNIVERSITY,99 out of 100.,CAMBRIDGE,MA
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY
4,National Universities,STANFORD UNIVERSITY,95 out of 100.,STANFORD,CA


**Read in IPEDS Data & Preprocess. IPEDS Data can be downloaded for free here:**
https://nces.ed.gov/ipeds/datacenter/Default.aspx

In [3]:
ipeds=pd.read_csv("example_data/ipeds/HD2014.csv")[["UNITID","INSTNM","CITY","STABBR"]]

ipeds.rename(columns={"UNITID":"unitid","INSTNM":"school","CITY":"city","STABBR":"state"},inplace=True) 
#rename columns to match U.S. News Data

ipeds["school"] = ipeds.school.apply(lambda x: x.upper().strip()) #school to upper case
ipeds["city"] = ipeds.city.apply(lambda x: x.upper().strip()) #city to upper case
ipeds["state"] = ipeds.state.apply(lambda x: x.upper().strip()) #state to upper case
ipeds.head(5)

Unnamed: 0,unitid,school,city,state
0,100636,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,AL
1,100654,ALABAMA A & M UNIVERSITY,NORMAL,AL
2,100663,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,AL
3,100690,AMRIDGE UNIVERSITY,MONTGOMERY,AL
4,100706,UNIVERSITY OF ALABAMA IN HUNTSVILLE,HUNTSVILLE,AL


**Pull Out a Sample of Schools from CT and NY**

In [4]:
samp_us_news = us_news[us_news.state.isin(["CT","NY"])].copy()
samp_ipeds = ipeds[ipeds.state.isin(["CT","NY"])].copy()

print samp_us_news.shape
print samp_ipeds.shape

(142, 5)
(574, 4)


#### Fuzzy Matching Helper Functions

In [5]:
def ratio_match(a, b, key, cutoff):
    keyA = key+"_A"
    keyB = key+"_B"
    
    new_a = pd.DataFrame(a.rename(columns = {key : keyA})[keyA].copy())
    new_a[keyB] = new_a[keyA].map(lambda x: difflib.get_close_matches(x, b[key].unique(), n=1, cutoff=cutoff)).str.get(0)

    return new_a    

In [6]:
def sorted_ratio_match(a, b, key, cutoff):
    keyA = key+"_A"
    keyB = key+"_B"
    keyASort = key+"_A_Sorted"
    keyBSort = key+"_B_Sorted"
    
    new_a = pd.DataFrame(a.rename(columns = {key : keyA})[keyA].copy())
    new_b = pd.DataFrame(b.rename(columns = {key : keyB})[keyB].copy())
    
    new_a[keyASort] = new_a[keyA].map(lambda x: " ".join(sorted(x.split())))
    new_b[keyBSort] = new_b[keyB].map(lambda x: " ".join(sorted(x.split())))
    
    new_a[keyBSort] = new_a[keyASort].map(lambda x: difflib.get_close_matches(x, new_b[keyBSort].unique(), n=1,\
                                                                              cutoff=cutoff)).str.get(0)
    
    return pd.merge(new_a, new_b, on=keyBSort, how="left").drop([keyASort, keyBSort], axis=1)

In [7]:
def fuzzy_merge(a, b, key, cutoff=0.6, how="left", match_function=ratio_match):
    a["in_a"] = 1 #Flag rows existing in DF A
    b["in_b"] = 1 #Flag rows existing in DF B
    
    merged = pd.merge(a, b, on=key, how="outer") #Perform a regular (exact) join
    
    matched = merged[(merged.in_a.notnull()) & (merged.in_b.notnull())].copy() #Sift out exact matches
    nomatch_a = merged[(merged.in_a.notnull() | (merged.in_b.isnull()))].copy() #Sift out nonmatches from DF A
    nomatch_b = merged[(merged.in_a.isnull()) | (merged.in_b.notnull())].copy() #Sift out nomathches from DF B
    
    if nomatch_a.shape[0]==0 | nomatch_b.shape[0]==0: #If no nonmatches in A or B, return matches & end
        return matched

    else: #Otherwise, proceed to fuzzy matching. Create crosswalk key file
        crosswalk = match_function(nonmatch_a, nonmatch_b, key=key, cutoff=cutoff) #Create key crosswalk
        
        fuzzy_merge1 = pd.merge(nonmatch_a, crosswalk, on=key+"_A", how="left") #Match DF A to crosswalk on Key 
        fuzzy_merge2 = pd.merge(fuzzy_merge1, nonmatch_b, on=key+"_B", how=how) #Match DF A+Xwalk to DF B on Key B

        return pd.concat([matched, fuzzy_merge_2], axis=0) #Append fuzzy matches to exact matches and return (MAKE DROPS)

In [8]:
def print_fuzzy(df, keyA, keyB):
    print df[df[keyA] != df[keyB]][[keyA, keyB]]

In [10]:
start = time.time()
test = ratio_match(us_news, ipeds, key="school", cutoff=0.6)
end = time.time()

print "Duration: ", end-start, "Seconds"
print test.columns
test.head(15)

KeyboardInterrupt: 

In [None]:
##Test Fuzzy Match
merged = fuzzy_merge()