#### Prototype FuzzyPandas Functions using U.S. News and IPEDS Data

In [1]:
import time
import functools
import numpy as np
import pandas as pd

import difflib
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

**First, let's read in the school ranking data scraped from usnews.com & do a little pre-processing**

In [2]:
def read_us_news(pickle):
    df = pd.read_pickle(pickle)   
    return pd.concat([us_news, df], axis=0, ignore_index=True)

us_news = pd.DataFrame() #initialize empty data frame
us_news = read_us_news("example_data/us_news/usnews-ranking-national-universities.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-national-liberal-arts-colleges.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-midwest.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-north.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-south.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-colleges-west.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-midwest.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-north.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-south.pickle")
us_news = read_us_news("example_data/us_news/usnews-ranking-regional-universities-west.pickle")

us_news["city"] = us_news.location.apply(lambda x: x.split(", ")[0].upper().strip()) #parse out city, to upper case
us_news["state"] = us_news.location.apply(lambda x: x.split(", ")[1].upper().strip()) #parse out state, to upper case
us_news["school"] = us_news.school.apply(lambda x: x.upper().strip()) #school name to upper case

us_news.drop("location", axis=1, inplace=True) #drop original location (now that we've split out into city and state)
us_news.head(5)

Unnamed: 0,category,school,score,city,state
0,National Universities,PRINCETON UNIVERSITY,100 out of 100.,PRINCETON,NJ
1,National Universities,HARVARD UNIVERSITY,99 out of 100.,CAMBRIDGE,MA
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY
4,National Universities,STANFORD UNIVERSITY,95 out of 100.,STANFORD,CA


**Read in IPEDS Data & Preprocess. IPEDS Data can be downloaded for free here:**
https://nces.ed.gov/ipeds/datacenter/Default.aspx

In [3]:
ipeds=pd.read_csv("example_data/ipeds/HD2014.csv")[["UNITID","INSTNM","CITY","STABBR"]]

ipeds.rename(columns={"UNITID":"unitid","INSTNM":"school","CITY":"city","STABBR":"state"},inplace=True) 
#rename columns to match U.S. News Data

ipeds["school"] = ipeds.school.apply(lambda x: x.upper().strip()) #school to upper case
ipeds["city"] = ipeds.city.apply(lambda x: x.upper().strip()) #city to upper case
ipeds["state"] = ipeds.state.apply(lambda x: x.upper().strip()) #state to upper case
ipeds.head(5)

Unnamed: 0,unitid,school,city,state
0,100636,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,AL
1,100654,ALABAMA A & M UNIVERSITY,NORMAL,AL
2,100663,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,AL
3,100690,AMRIDGE UNIVERSITY,MONTGOMERY,AL
4,100706,UNIVERSITY OF ALABAMA IN HUNTSVILLE,HUNTSVILLE,AL


** Instead, let's use FuzzyPandas to match each school in the U.S. News Data to it's CLOSEST, but not necessarily EXACT, U.S. News Match**

In [4]:
test_us_news = us_news[us_news.state.isin(["CT","NY"])].copy()
test_ipeds = ipeds[ipeds.state.isin(["CT","NY"])].copy()

print us_news.shape, test_us_news.shape
print ipeds.shape, test_ipeds.shape

print test_us_news.columns

(1506, 5) (142, 5)
(7687, 4) (574, 4)
Index([u'category', u'school', u'score', u'city', u'state'], dtype='object')


In [5]:
def get_matches(a, b, byvar):
    start = time.time()
    
    matches = pd.DataFrame(a[byvar].unique())
    matches.rename(columns={0:byvar}, inplace=True)
    try:
        matches["matched"] = matches[byvar].map(lambda x: difflib.get_close_matches(x, b[byvar].unique(), n=1, cutoff=0.6))
    except IndexError:
        matches["matched"] = np.nan
    
    end = time.time()
    print "Duration: ", end-start, "Seconds."
    
    return matches

In [None]:
def create_xwalk(a, b, byvars): 
    start=time.time()
    a = a.drop_duplicates(subset=byvars)[byvars].T.to_dict().values()
    b = b.drop_duplicates(subset=byvars)[byvars].T.to_dict().values()

    for i, row in enumerate(a):
        states = [x["state"] for x in b]
        row["matched_state"] = difflib.get_close_matches(row["state"], states, n=1, cutoff=0.6)[0]
        
        cities = [x["city"] for x in b if x["state"]==row["matched_state"]]
        row["matched_city"] = difflib.get_close_matches(row["city"], cities, n=1, cutoff=0.6)[0]
        
        schools = [x["school"] for x in b if x["state"]==row["matched_state"] and x["city"]==row["matched_city"]]
        row["matched_school"] = difflib.get_close_matches(row["school"], schools, n=1, cutoff=0.6)
            
    end = time.time()
    print "Duration: ",end-start, "Seconds."
    
    return a

In [10]:
def create_xwalk(a, b, byvars): 
    start=time.time()
    
    a["one"] = "1"
    b["one"] = "1"

    a["filter"] = a["one"]
    b["filter"] = b[["one"]+byvars].apply(lambda x: "/#/".join(x), axis=1).str.split("/#/")
    
    #a = a[byvars+["filter"]].T.to_dict.values()
    #b = b[byvars+["filter"]].T.to_dict.values()

    return a, b

test = create_xwalk(test_us_news, test_ipeds, ["state","city","school"])
print test[0].head(5)
print ""
print test[1].head(5)

ValueError: Length of values does not match length of index

In [None]:
    
    
    
    a["filter"] = "0" #initialize filter
    b["filter"] = "0" #initialize filter
    
    a = a.drop_duplicates(subset=byvars)[byvars+["filter"]].T.to_dict.values()
    b = b.drop_duplicates(subset=byvars)[byvars+["filter"]]
    
    for byvar, cutoff in zip(byvar, cutoffs):   
        for row in a:
            possible_matches = [x[byvar] for x in b if x["filter"]==row["filter"]]
            
            try:
                row["matched_"+byvar] = difflib.get_close_matches(row[byvar], possible_matches, n=1, cutoff=cutoff)[0]
            except IndexError:
                row["matched_"+byvar] = " "
                
        possible_matches = 
    
    
    
    
    
    
    for row in a[:4]:
        for byvar, cutoff in zip(byvars, cutoffs):            
            possible_matches = [x[byvar] for x in b if x["filter"]==row["filter"]]
 

            
            row["filter"] = row["filter"]+"-"+row["matched_"+byvar]
            for item in b:
                item["filter"] = item["filter"]+"-"+item[byvar] #make this less brute force later?
                
            print row

In [None]:
test_us_news.head(5)

In [None]:
##Function only implimented if multiple byvars / heirarchical problem
##Byvars = 
def create_xwalk(a, b, byvars, cutoffs):
    xwalk = get_matches(a=a, b=b, byvar=byvars[0], cutoff=cutoff[0])
    
    for states in zip(xwalk["state_A"], xwalk["state_B"]):
        slice_a
        slice_b
        xwalk = get_matches 
        
        
        

In [None]:
def create_xwalk():
    for 

In [None]:
def fuzzy_merge(a, b, fuzz_on, how="left", cutoffs=0.6):
    ##First Do a Regular Join
    a["in_a"] = 1
    b["in_b"] = 1
    
    merged = pd.merge(a, b, on=fuzz_on, how="outer")

    matched = merged[(merged.in_a.notnull()) & (merged.in_b.notnull())].copy()
    matched["match_score"] = 1.0 #Score for all exact matches is 1.0 by definition
    
    nomatch_a = merged[(merged.in_a.notnull() | (merged.in_b.isnull()))].copy()
    nomatch_b = merged[(merged.in_a.isnull()) | (merged.in_b.notnull())].copy()
    
    ##If no nomatched, we're done - return dataframe
    if nomatch_a.shape[0]==0 | nomatch_b.shape[0]==0:
        return matched
    
    ##Otherwise, Proceed to fuzzy matching. Fuzzy Merge on first by-variable, than second, etc.
    else:
        fuzzy_matches = 
        
        fuzzy_matches = pd.DataFrame() #initialize empty dataframe to hold matches
        
        i = 0 #Set Number of Iterations to 0
        slice_a = nomatch_a.copy()
        slice_b = nomatch_b.copy()
        
        while i < len(fuzz_on):
            byvar = fuzz_on[i]
            cutoff = cutoffs[i]
            
            xwalk = create_xwalk(slice_a, slice_b, byvar=fuzz_on[i], cutoff=cutoffs[i])
            
            for 
            
            list_a = slice_a[byvar].unique().tolist()
            list_b = slice_a[byvar].unique().tolist()
            
            crosswalk = 
            
            i+=1
        
        
        ##Merge Dataframe A Nonmatches to Dataframe B Nonmatches. Append to Exact Matches and Return
        nomatch_b.rename(columns={"byvar":"matchvar"})
        fuzzy_merge1 = pd.merge(nomatch_a, fuzzy_matches, on="byvar", how="left")
        fuzzy_merge2 = pd.merge(nomatch_a, nomatch_a, on="matchvar", how=how)
        
        return pd.concat([matched, fuzzy_merge2], axis=0).drop(["byvar", "merged"])
    
    
    
    
    
    

In [None]:
print test

In [None]:
In [23]: import difflib 

In [24]: difflib.get_close_matches
Out[24]: <function difflib.get_close_matches>

In [25]: df2.index = df2.index.map(lambda x: difflib.get_close_matches(x, df1.index)[0])

In [26]: df2
Out[26]: 
      letter
one        a
two        b
three      c
four       d
five       e

In [31]: df1.join(df2)
Out[31]: 
       number letter
one         1      a
two         2      b
three       3      c
four        4      d
five        5      e

In [None]:
start = time.time()

test = nyc_us_news["school"].apply(find_match(b=nyc_ipeds.school.unique(), score_cutoff=0.6))

end = time.time()
print end-start
test


In [None]:
        fuzzy_matches=[]
        for each in nomatch_a.byvar.unique():
            fuzzy_match = process.extractOne(each, nomatch_b.byvar.unique(), score_cutoff=score_cutoff)
            
            if fuzzy_match==None:
                fuzzy_matches.append({"byvar": each, "matchvar": np.nan, "fuzzy_match_score": np.nan})
            else:
                fuzzy_matches.append({"byvar": each, "matchvar": fuzzy_match[0], "fuzzy_match_score": fuzzy_match[1]})
                
        fuzzy_matches = pd.DataFrame(fuzzy_matches)

In [None]:
##Function to Do a Fuzzy Match
##This on takes way to long to run; need to find a way to make more efficient
def fuzzy_merge(a, b, fuzz_on, how="left", score_cutoff=0.6):
    merged = exact_merge(a=a, b=b, exact_on=fuzz_on) #Run Exact Merge
    matched = merged[0] #Pull Out matches
    nomatch = merged[1] #Pull Out Nonmatched
    
    ##If No Nonmatched, we're done :). Just Return Matched Dataframe
    if nomatch.shape[0] == 0: #if no non-matches, just return the merged dataframe
        return matched
    
    ##Otherwise, Proceed to Fuzzy Matching Non-Matches
    else:
        nomatch_a = nomatch[nomatch.in_a==1].copy()
        nomatch_b = nomatch[nomatch.in_b==1].copy()
        
        ##If Multiple By-Variables, String them Together for Fuzzy Merge. Otherwise, Use Single By-Varaiable
        if len(fuzz_on) > 1:
            nomatch_a["byvar"] = nomatch_a[fuzz_on].apply(lambda x: " ".join(x), axis=1)
            nomatch_b["byvar"] = nomatch_b[fuzz_on].apply(lambda x: " ".join(x), axis=1)
        else:
            nomatch_a["byvar"] = nomatch_a[fuzz_on]
            nomatch_b["byvar"] = nomatch_b[fuzz_on]
        
        ##For Each Nonmatch in Dataframe A, Use FuzzyWuzzy to Match to Closest Dataframe B Nonmatched
        fuzzy_matches=[]
        for each in nomatch_a.byvar.unique():
            fuzzy_match = process.extractOne(each, nomatch_b.byvar.unique(), score_cutoff=score_cutoff)
            
            if fuzzy_match==None:
                fuzzy_matches.append({"byvar": each, "matchvar": np.nan, "fuzzy_match_score": np.nan})
            else:
                fuzzy_matches.append({"byvar": each, "matchvar": fuzzy_match[0], "fuzzy_match_score": fuzzy_match[1]})
                
        fuzzy_matches = pd.DataFrame(fuzzy_matches)
        
        ##Merge Dataframe A Nonmatches to Dataframe B Nonmatches. Append to Exact Matches and Return
        nomatch_b.rename(columns={"byvar":"matchvar"})
        fuzzy_merge1 = pd.merge(nomatch_a, fuzzy_matches, on="byvar", how="left")
        fuzzy_merge2 = pd.merge(nomatch_a, nomatch_a, on="matchvar", how=how)
        
        return pd.concat([matched, fuzzy_merge2], axis=0).drop(["byvar", "merged"])

In [None]:
ny_us_news = us_news[us_news.state=="NY"].copy()
ny_ipeds = ipeds[ipeds.state=="NY"].copy()

print ny_us_news.shape, ny_ipeds.shape

In [None]:
ny_test = fuzzy_merge(us_news, ipeds, fuzz_on=["state","city","school"])

In [None]:
##Second Attempt :)


##Function to Create Fuzzy Matches By-Variable Crosswalk Between List of Values A and List of Values B
def create_crosswalk(list_a, list_b, byvar, cutoff):
    matches = []
    
    for item in list_a:
        match = process.extractOne(each, list_b, score_cutoff=cutoff)
        
        if fuzz_match==None:
            matches.append({"byvar": each, "matchvar": np.nan, "fuzzy_match_score": np.nan})
        else:
            matches.append({"byvar": each, "matchvar": fuzzy_match[0], "fuzzy_match_score": fuzzy_match[1]})
      
    return pd.DataFrame(matches)


##Function to Impliment the Fuzzy Merge
def fuzzy_merge(a, b, fuzz_on, how="left", cutoffs):

    ##First, Do a Regular Pandas Join & Output Matches and Nonmatches
    a["in_a"] = 1 #Set Flag for Being in DF A
    b["in_b"] = 1 #Set Flag for Being in DF B
    
    merged = pd.merge(a, b, how="outer", on=on)
    
    matched = merged[(merged.in_a.notnull()) & (merged.in_b.notnull())].copy()
    matched["fuzz_match_score"] = 1.0 #score for all matched is 1.0 by default
    
    nomatch_a = merged[(merged.in_a.notnull()) & (merged.in_b.isnull())].copy()
    nomatch_b = merged[(merged.in_a.isnull()) & (merged.in_b.notnull())].copy()
    
    ##If no nonmatches, we're done :). Return Matched Dataframe
    if nomatch_a.shape[0]==0 or nomatch_b.shape[0]==0:
        return matched
    
    ##Otherwise, Proceed to fuzzy matching. Fuzzy Merge on first by-variable, than second, etc.
    else:
        fuzzy_matches = pd.DataFrame() #initialize empty dataframe to hold matches
        
        i = 0
        slice_a = nomatch_a.copy()
        slice_b = nomatch_b.copy()
        
        while i < len(fuzz_on):
            byvar=fuzz_on[i]
            cutoff = cutoffs[i]
            
            list_a = slice_a[byvar].unique().tolist()
            list_b = slice_a[byvar].unique().tolist()
            
            crosswalk = 
            
            i+=1
        
        
        ##Merge Dataframe A Nonmatches to Dataframe B Nonmatches. Append to Exact Matches and Return
        nomatch_b.rename(columns={"byvar":"matchvar"})
        fuzzy_merge1 = pd.merge(nomatch_a, fuzzy_matches, on="byvar", how="left")
        fuzzy_merge2 = pd.merge(nomatch_a, nomatch_a, on="matchvar", how=how)
        
        return pd.concat([matched, fuzzy_merge2], axis=0).drop(["byvar", "merged"])

In [None]:
def partial_fuzzy_merge(a, b, exact_on, fuzz_on, how="left", cutoffs):
    pass