#### Prototype FuzzyPandas Functions using U.S. News and IPEDS Data

In [5]:
import time
import functools
import numpy as np
import pandas as pd

import difflib

from collections import defaultdict
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

**First, let's read in the school ranking data scraped from usnews.com & do a little pre-processing**

In [2]:
def read_us_news(pickle):
    df = pd.read_pickle(pickle)   
    return pd.concat([us_news, df], axis=0, ignore_index=True)

us_news = pd.DataFrame() #initialize empty data frame
us_news = read_us_news("example_data/us_news/usnews-ranking-national-universities.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-national-liberal-arts-colleges.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-midwest.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-north.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-south.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-west.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-midwest.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-north.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-south.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-west.pickle")

us_news["city"] = us_news.location.apply(lambda x: x.split(", ")[0].upper().strip()) #parse out city, to upper case
us_news["state"] = us_news.location.apply(lambda x: x.split(", ")[1].upper().strip()) #parse out state, to upper case
us_news["school"] = us_news.school.apply(lambda x: x.upper().strip()) #school name to upper case

us_news.drop("location", axis=1, inplace=True) #drop original location (now that we've split out into city and state)
us_news.head(5)

Unnamed: 0,category,school,score,city,state
0,National Universities,PRINCETON UNIVERSITY,100 out of 100.,PRINCETON,NJ
1,National Universities,HARVARD UNIVERSITY,99 out of 100.,CAMBRIDGE,MA
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY
4,National Universities,STANFORD UNIVERSITY,95 out of 100.,STANFORD,CA


**Read in IPEDS Data & Preprocess. IPEDS Data can be downloaded for free here:**
https://nces.ed.gov/ipeds/datacenter/Default.aspx

In [3]:
ipeds=pd.read_csv("example_data/ipeds/HD2014.csv")[["UNITID","INSTNM","CITY","STABBR"]]

ipeds.rename(columns={"UNITID":"unitid","INSTNM":"school","CITY":"city","STABBR":"state"},inplace=True) 
#rename columns to match U.S. News Data

ipeds["school"] = ipeds.school.apply(lambda x: x.upper().strip()) #school to upper case
ipeds["city"] = ipeds.city.apply(lambda x: x.upper().strip()) #city to upper case
ipeds["state"] = ipeds.state.apply(lambda x: x.upper().strip()) #state to upper case
ipeds.head(5)

Unnamed: 0,unitid,school,city,state
0,100636,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,AL
1,100654,ALABAMA A & M UNIVERSITY,NORMAL,AL
2,100663,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,AL
3,100690,AMRIDGE UNIVERSITY,MONTGOMERY,AL
4,100706,UNIVERSITY OF ALABAMA IN HUNTSVILLE,HUNTSVILLE,AL


** Instead, let's use FuzzyPandas to match each school in the U.S. News Data to it's CLOSEST, but not necessarily EXACT, U.S. News Match**

In [4]:
test_us_news = us_news[us_news.state.isin(["CT","NY"])].copy()
test_ipeds = ipeds[ipeds.state.isin(["CT","NY"])].copy()

print us_news.shape, test_us_news.shape
print ipeds.shape, test_ipeds.shape

(1506, 5) (142, 5)
(7687, 4) (574, 4)


In [63]:
def test_times(scorer, name):
    start = time.time()
    
    print name
    print "----"
    print scorer("Columbia University", "Columbia University in the City of New York")
    end = time.time()
    print "Duration: ", (end-start)*10000, "seconds."
    print " "
    
print test_times(fuzz.ratio, "Ratio")
print test_times(fuzz.partial_ratio ,"Partial Ratio")
print test_times(fuzz.token_sort_ratio, "Token Sort")
print test_times(fuzz.token_set_ratio, "Token Set")
print test_times(fuzz.partial_token_sort_ratio, "Partial Token Sort")
print test_times(fuzz.partial_token_set_ratio, "Partial Token Set")

Ratio
----
61
Duration:  1.2993812561 seconds.
 
None
Partial Ratio
----
100
Duration:  1.44958496094 seconds.
 
None
Token Sort
----
61
Duration:  1.34944915771 seconds.
 
None
Token Set
----
100
Duration:  1.79052352905 seconds.
 
None
Partial Token Sort
----
63
Duration:  1.49011611938 seconds.
 
None
Partial Token Set
----
100
Duration:  1.74999237061 seconds.
 
None


In [69]:
def fuzzy_match_1(a, b, byvar, cutoff):
    start = time.time()
    a["matched"] = a[byvar].map(lambda x: difflib.get_close_matches(x, b[byvar].unique(), n=1, cutoff=cutoff))
    end = time.time()
    print "Duration: ", end-start, "Seconds"
    return a

fuzzy_match_1(test_us_news, test_ipeds, "school", cutoff=0.6).head(5)

Duration:  4.98256492615 Seconds


Unnamed: 0,category,school,score,city,state,constant,filter,matched
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT,X,X,[YALE UNIVERSITY]
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY,X,X,[COLGATE UNIVERSITY]
14,National Universities,CORNELL UNIVERSITY,84 out of 100.,ITHACA,NY,X,X,[CORNELL UNIVERSITY]
31,National Universities,NEW YORK UNIVERSITY,67 out of 100.,NEW YORK,NY,X,X,[NEW YORK UNIVERSITY]
32,National Universities,UNIVERSITY OF ROCHESTER,66 out of 100.,ROCHESTER,NY,X,X,[UNIVERSITY OF ROCHESTER]


In [71]:
def fuzzy_match_2(a, b, byvar, cutoff):
    start = time.time()
    a["matched"] = a[byvar].map(lambda x: process.extractOne(x, b[byvar].unique(), scorer=fuzz.WRatio))
    end = time.time()
    print "Duration: ", end-start, "Seconds."
    return a

fuzzy_match_2(test_us_news, test_ipeds, "school", cutoff=0.6).head(5)

Duration:  15.8899931908 Seconds.


Unnamed: 0,category,school,score,city,state,constant,filter,matched
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT,X,X,"(YALE UNIVERSITY, 100)"
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY,X,X,"(COLUMBIA UNIVERSITY IN THE CITY OF NEW YORK, 90)"
14,National Universities,CORNELL UNIVERSITY,84 out of 100.,ITHACA,NY,X,X,"(CORNELL UNIVERSITY, 100)"
31,National Universities,NEW YORK UNIVERSITY,67 out of 100.,NEW YORK,NY,X,X,"(NEW YORK UNIVERSITY, 100)"
32,National Universities,UNIVERSITY OF ROCHESTER,66 out of 100.,ROCHESTER,NY,X,X,"(UNIVERSITY OF ROCHESTER, 100)"


In [89]:
##Function to Run Fuzzy Match
def get_fuzzy_match(a, b, byvars, cutoff=0.6, scorer=fuzz.token_sort_ratio):
    start = time.time()
    a = a.drop_duplicates(subset=byvars)[byvars].T.to_dict().values() #DF -> List of Dicts
    b = b.drop_duplicates(subset=byvars)[byvars].T.to_dict().values() #DF -> List of Dicts
    
    for byvar in byvars[:1]:
        for row in a: #iterate through rows (list of dicts)
            max_score = 0
            match = np.nan
            
            for choice in b: #iterate through choices (simple list)
                print row[byvar], choice[byvar]
                score = scorer(row[byvar], choice[byvar])
                
                if score > max_score:
                    if score > cutoff:
                        row["matched_"+byvar] = choice[byvar]
                    else:
                        row["matched_"+byvar] = np.nan
                    
    end = time.time()
    print "Duration: ", end-start, "seconds."
    #return pd.DataFrame(a)

In [None]:
get_fuzzy_match(test_us_news, test_ipeds, ["school","city"])

YALE UNIVERSITY EXCELSIOR COLLEGE
YALE UNIVERSITY UNIVERSITY OF PHOENIX-CONNECTICUT
YALE UNIVERSITY ULSTER COUNTY COMMUNITY COLLEGE
YALE UNIVERSITY BRANFORD HALL CAREER INSTITUTE-ALBANY CAMPUS
YALE UNIVERSITY SANFORD-BROWN INSTITUTE-NEW YORK
YALE UNIVERSITY UNION COLLEGE
YALE UNIVERSITY UNION THEOLOGICAL SEMINARY IN THE CITY OF NEW YORK
YALE UNIVERSITY UNITED TALMUDICAL SEMINARY
YALE UNIVERSITY BRITTANY BEAUTY SCHOOL
YALE UNIVERSITY LINCOLN TECHNICAL INSTITUTE-WHITESTONE
YALE UNIVERSITY UNITED STATES MERCHANT MARINE ACADEMY
YALE UNIVERSITY UNITED STATES MILITARY ACADEMY
YALE UNIVERSITY ACADEMY DI CAPELLI-SCHOOL OF COSMETOLOGY
YALE UNIVERSITY UTICA COLLEGE
YALE UNIVERSITY UTICA SCHOOL OF COMMERCE
YALE UNIVERSITY CENTER FOR NATURAL WELLNESS SCHOOL OF MASSAGE THERAPY
YALE UNIVERSITY ACE COMPUTER TRAINING CENTER
YALE UNIVERSITY NEW AGE TRAINING
YALE UNIVERSITY VASSAR COLLEGE
YALE UNIVERSITY VILLA MARIA COLLEGE
YALE UNIVERSITY SCHOOL OF VISUAL ARTS
YALE UNIVERSITY WAGNER COLLEGE
YALE UNIVER

In [52]:
##Function to Crate Crosswalk
def create_xwalk(a, b, byvars, cutoffs, scorer):
    b["filter"] = b[["constant"]+byvars].apply(lambda x: "#".join(x), axis=1).str.split("#")
    
    a = a.drop_duplicates(subset=byvars)[byvars].T.to_dict().values()
    b = b.drop_duplicates(subset=byvars)[byvars+["filter"]].T.to_dict().values()
    
    for row in a:
        for i in range(0, len(byvars)):
            byvar = byvars[i]
            cutoff = cutoffs[i]
            if i==0:
                choices = [x[byvar] for x in b]
            else:
                choices = [x[byvar] for x in b if row["filter"]==x["filter"][:i]]
            
            row["matched_"+byvar] = fuzzy_match(row[byvar], choices, cutoff=cutoff, scorer=scorer)[1]
            row["filter"] = row["filter"]+[row["matched_"+byvar]]
            
    return pd.DataFrame(a).drop(["constant","filter"], axis=1)

In [53]:
start = time.time()
test = create_xwalk(test_us_news, test_ipeds, byvars=["state","city","school"], cutoffs=[0.6, 0.6, 0.6],\
                    scorer=fuzz.ratio)
end = time.time()
print "Duration: ", end-start, "seconds."

test.head(5)

ValueError: Length of values does not match length of index

In [None]:
"""
Function to Impliment Fuzzy Merge
a: Dataframe A
b: Dataframe B
byvars: By-Variable or List of By-variables; Similar to "on" in pandas merge and join
cutoffs: Match Similarity Cutoffs. Order Corresponds to By-Variable Order. Length must match number of by-variables
how: {"left", "right", "outer", "inner"}. Default left
method: {"ratio","partial ratio","sort ratio","sort partial ratio","set ratio","set partial ratio"}. Default ratio
"""

-

#### TO DO:
1. Add Single Variable Case to Crosswalk Creator
2. Review how ratios calculated in fuzzywuzzy and add to crosswalk creator without dramatically increasing runtime
3. Polish up code

#### Old Work...

In [None]:
##Function to Do a Fuzzy Match
##This on takes way to long to run; need to find a way to make more efficient
def fuzzy_merge(a, b, fuzz_on, how="left", score_cutoff=0.6):
    merged = exact_merge(a=a, b=b, exact_on=fuzz_on) #Run Exact Merge
    matched = merged[0] #Pull Out matches
    nomatch = merged[1] #Pull Out Nonmatched
    
    ##If No Nonmatched, we're done :). Just Return Matched Dataframe
    if nomatch.shape[0] == 0: #if no non-matches, just return the merged dataframe
        return matched
    
    ##Otherwise, Proceed to Fuzzy Matching Non-Matches
    else:
        nomatch_a = nomatch[nomatch.in_a==1].copy()
        nomatch_b = nomatch[nomatch.in_b==1].copy()
        
        ##If Multiple By-Variables, String them Together for Fuzzy Merge. Otherwise, Use Single By-Varaiable
        if len(fuzz_on) > 1:
            nomatch_a["byvar"] = nomatch_a[fuzz_on].apply(lambda x: " ".join(x), axis=1)
            nomatch_b["byvar"] = nomatch_b[fuzz_on].apply(lambda x: " ".join(x), axis=1)
        else:
            nomatch_a["byvar"] = nomatch_a[fuzz_on]
            nomatch_b["byvar"] = nomatch_b[fuzz_on]
        
        ##For Each Nonmatch in Dataframe A, Use FuzzyWuzzy to Match to Closest Dataframe B Nonmatched
        fuzzy_matches=[]
        for each in nomatch_a.byvar.unique():
            fuzzy_match = process.extractOne(each, nomatch_b.byvar.unique(), score_cutoff=score_cutoff)
            
            if fuzzy_match==None:
                fuzzy_matches.append({"byvar": each, "matchvar": np.nan, "fuzzy_match_score": np.nan})
            else:
                fuzzy_matches.append({"byvar": each, "matchvar": fuzzy_match[0], "fuzzy_match_score": fuzzy_match[1]})
                
        fuzzy_matches = pd.DataFrame(fuzzy_matches)
        
        ##Merge Dataframe A Nonmatches to Dataframe B Nonmatches. Append to Exact Matches and Return
        nomatch_b.rename(columns={"byvar":"matchvar"})
        fuzzy_merge1 = pd.merge(nomatch_a, fuzzy_matches, on="byvar", how="left")
        fuzzy_merge2 = pd.merge(nomatch_a, nomatch_a, on="matchvar", how=how)
        
        return pd.concat([matched, fuzzy_merge2], axis=0).drop(["byvar", "merged"])