#### Prototype FuzzyPandas Functions using U.S. News and IPEDS Data

In [None]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

**First, let's read in the school ranking data scraped from usnews.com & do a little pre-processing**

In [None]:
def read_us_news(pickle):
    """Stack one pickled U.S. News ranking table under the rows gathered so far.

    The dataframe stored at `pickle` is appended below the module-level
    `us_news` dataframe and the index of the result is renumbered from zero.
    The global is only read here; callers rebind it to the returned frame.

    pickle:  path of a pickled pandas dataframe
    returns: new dataframe with the current `us_news` rows followed by the
             rows that were just loaded
    """
    #NOTE: unpickling can execute arbitrary code, so only load trusted files
    loaded = pd.read_pickle(pickle)
    frames = [us_news, loaded]
    return pd.concat(frames, axis=0, ignore_index=True)

us_news = pd.DataFrame() #start from an empty data frame

#read_us_news appends to the current global us_news, so rebind it after every file
for ranking_file in (
    "example_data/us_news/usnews-ranking-national-universities.pickle",
    "example_data/us_news/usnews-ranking-national-liberal-arts-colleges.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-midwest.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-north.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-south.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-west.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-midwest.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-north.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-south.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-west.pickle",
):
    us_news = read_us_news(ranking_file)

#split location on ", " once; the first piece is used as city and the second as state
location_parts = us_news.location.apply(lambda loc: loc.split(", "))
us_news["city"] = location_parts.apply(lambda parts: parts[0].upper().strip()) #city, upper case
us_news["state"] = location_parts.apply(lambda parts: parts[1].upper().strip()) #state, upper case
us_news["school"] = us_news.school.apply(lambda name: name.upper().strip()) #school name, upper case

#location is no longer needed now that city and state have their own columns
us_news = us_news.drop("location", axis=1)
us_news.head(5)

**Read in IPEDS Data & Preprocess. IPEDS Data can be downloaded for free here:**
https://nces.ed.gov/ipeds/datacenter/Default.aspx

In [None]:
#keep only the id, name, city and state columns of the IPEDS directory file
ipeds = pd.read_csv("example_data/ipeds/HD2014.csv")
ipeds = ipeds[["UNITID","INSTNM","CITY","STABBR"]]

#rename columns to match U.S. News Data
ipeds = ipeds.rename(columns={"UNITID":"unitid","INSTNM":"school","CITY":"city","STABBR":"state"})

#upper-case and trim the three text columns, same treatment as the U.S. News columns
for text_col in ("school", "city", "state"):
    ipeds[text_col] = ipeds[text_col].apply(lambda value: value.upper().strip())
ipeds.head(5)

**Instead of relying on an exact join alone, let's use FuzzyPandas to match each school in the U.S. News data to its CLOSEST, but not necessarily EXACT, IPEDS match**

In [None]:
##Function to Do a Regular Exact Pandas Join and Return Dataframes of Matches and Nonmatches
def exact_merge(a, b, exact_on):
    """Outer-join `a` and `b` on `exact_on` and split the result by match status.

    a, b:     dataframes to join; neither is modified
    exact_on: column name or list of column names, passed to pd.merge as `on`
    returns:  tuple (matched, nomatch)
              matched - rows whose key exists in both inputs, with an extra
                        column fuzz_match_score set to 1.0
              nomatch - rows whose key exists in only one input
              Both frames carry the flag columns in_a and in_b, which are 1
              when the row came from that input and NaN otherwise.
    """
    #flag copies instead of the inputs so the callers' dataframes stay untouched
    flagged_a = a.assign(in_a=1)
    flagged_b = b.assign(in_b=1)

    #Join and return separate dataframes for matches and non-matches
    merged = pd.merge(flagged_a, flagged_b, how="outer", on=exact_on)
    in_both = merged.in_a.notnull() & merged.in_b.notnull()
    matched = merged[in_both].copy()
    nomatch = merged[~in_both].copy()

    matched["fuzz_match_score"] = 1.0 #score for all matched is 1.0 by default

    return matched, nomatch

In [None]:
exact = exact_merge(us_news, ipeds, exact_on=["state","city","school"])

#exact[0] holds the matched rows and exact[1] the non-matched rows.
#A single pre-formatted argument prints the same text on Python 2 and Python 3.
print("%s %s" % ("Matched:     ", exact[0].shape[0]))
print("%s %s" % ("Non-Matched: ", exact[1].shape[0]))

In [None]:
##Function to Do a Fuzzy Match
##This one takes way too long to run; need to find a way to make it more efficient.
##Every unmatched key of `a` is scored against every unmatched key of `b`.
def fuzzy_merge(a, b, fuzz_on, how="left", score_cutoff=0.6):
    """Exact-merge `a` and `b` on `fuzz_on`, then fuzzy-match the leftovers.

    Rows that exact_merge cannot pair are compared on a single string made by
    joining the `fuzz_on` values with spaces. Each unmatched row of `a` is
    paired with the unmatched row(s) of `b` whose string fuzzywuzzy scores
    highest, provided the score reaches the cutoff.

    a, b:         dataframes to merge
    fuzz_on:      column name or list of column names present in both frames;
                  the values are joined as strings, so they must be strings
    how:          "left", "right", "inner" or "outer"; decides which unpaired
                  leftover rows are kept, as in pd.merge. Exact matches are
                  always kept.
    score_cutoff: minimum similarity. Values up to 1 are read as a fraction
                  (0.6 means 60 out of 100); larger values are taken as
                  already being on fuzzywuzzy's 0-100 scale.
    returns:      one dataframe with the columns of exact_merge's matched
                  frame and a fresh 0..n-1 index. fuzz_match_score is 1.0 for
                  exact matches, the fuzzywuzzy score divided by 100 for fuzzy
                  pairs and NaN for rows left unpaired. Fuzzy pairs keep the
                  key values of `a`; rows that exist only in `b` keep their
                  own key values.
    """
    #accept a single column name as well as a list of names
    key_cols = list(fuzz_on) if isinstance(fuzz_on, (list, tuple)) else [fuzz_on]

    matched, nomatch = exact_merge(a=a, b=b, exact_on=key_cols) #Run Exact Merge

    ##If No Nonmatched, we're done :). Just Return Matched Dataframe
    if nomatch.shape[0] == 0:
        return matched

    nomatch_a = nomatch[nomatch.in_a==1].copy()
    nomatch_b = nomatch[nomatch.in_b==1].copy()

    ##If Only One Side Has Nonmatches There is Nothing to Pair Up; Keep What `how` Asks For
    if nomatch_a.empty or nomatch_b.empty:
        leftovers = []
        if how in ("left", "outer"):
            leftovers.append(nomatch_a)
        if how in ("right", "outer"):
            leftovers.append(nomatch_b)
        leftovers = [rows.reindex(columns=matched.columns) for rows in leftovers if not rows.empty]
        return pd.concat([matched] + leftovers, axis=0, ignore_index=True)

    ##Work Out Which Merged Columns Came From B Only.
    ##pd.merge appends "_x" to a column of `a` whose name also exists in `b`.
    a_side = set(key_cols) | set(["in_a"])
    for col in a.columns:
        a_side.add(col if col in nomatch.columns else "%s_x" % (col,))
    b_only = [col for col in nomatch.columns if col not in a_side]

    ##String the By-Variables Together Into One Comparison String per Row
    nomatch_a["byvar"] = [" ".join(vals) for vals in nomatch_a[key_cols].values.tolist()]
    nomatch_b["matchvar"] = [" ".join(vals) for vals in nomatch_b[key_cols].values.tolist()]

    ##For Each Nonmatch in Dataframe A, Use FuzzyWuzzy to Match to Closest Dataframe B Nonmatched
    cutoff = score_cutoff * 100 if score_cutoff <= 1 else score_cutoff
    choices = nomatch_b["matchvar"].unique().tolist() #built once, not once per lookup
    fuzzy_matches = []
    for each in nomatch_a["byvar"].unique():
        fuzzy_match = process.extractOne(each, choices, score_cutoff=cutoff)
        if fuzzy_match is None:
            continue #no candidate reached the cutoff; the left merge below leaves it unpaired
        fuzzy_matches.append({"byvar": each,
                              "matchvar": fuzzy_match[0],
                              "fuzz_match_score": fuzzy_match[1] / 100.0})

    fuzzy_matches = pd.DataFrame(fuzzy_matches, columns=["byvar", "matchvar", "fuzz_match_score"])

    ##Attach the Chosen Match to Every Dataframe A Nonmatch
    left = pd.merge(nomatch_a.drop(b_only, axis=1), fuzzy_matches, on="byvar", how="left")
    left["matchvar"] = left["matchvar"].astype(object) #keep a string-compatible dtype even when all missing

    ##Merge Dataframe A Nonmatches to Dataframe B Nonmatches.
    ##B's key values travel under temporary names so they do not clash with A's.
    b_key_alias = dict((key, "_b_%s" % (key,)) for key in key_cols)
    right = nomatch_b[key_cols + b_only + ["matchvar"]].rename(columns=b_key_alias)
    fuzzy = pd.merge(left, right, on="matchvar", how=how)

    #rows that exist only in B (how="right"/"outer") have no A key values; use B's
    for key in key_cols:
        fuzzy[key] = fuzzy[key].fillna(fuzzy[b_key_alias[key]])

    ##Drop the Helper Columns, Append to Exact Matches and Return
    fuzzy = fuzzy.reindex(columns=matched.columns)
    return pd.concat([matched, fuzzy], axis=0, ignore_index=True)

In [None]:
#Keep only the New York rows of each table
ny_us_news = us_news[us_news.state=="NY"].copy()
ny_ipeds = ipeds[ipeds.state=="NY"].copy()

#one pre-formatted argument prints the same text on Python 2 and Python 3
print("%s %s" % (ny_us_news.shape, ny_ipeds.shape))

In [None]:
#run the fuzzy merge on the New York subsets rather than on the full tables
ny_test = fuzzy_merge(ny_us_news, ny_ipeds, fuzz_on=["state","city","school"])

In [None]:
##Second Attempt :)


##Function to Create Fuzzy Matches By-Variable Crosswalk Between List of Values A and List of Values B
def create_crosswalk(list_a, list_b, byvar, cutoff):
    """Pair every value of `list_a` with its closest fuzzy match in `list_b`.

    list_a:  values to look up
    list_b:  candidate values to match against
    byvar:   name of the by-variable the values belong to; not used in the
             body, kept so the signature stays the same for callers
    cutoff:  handed to fuzzywuzzy's process.extractOne as score_cutoff
    returns: dataframe with one row per item of `list_a` and the columns
             byvar (the item), matchvar (the chosen `list_b` value) and
             fuzzy_match_score (the score extractOne reported). Both are NaN
             when extractOne returns None. The three columns are present even
             when `list_a` is empty.
    """
    columns = ["byvar", "matchvar", "fuzzy_match_score"]
    matches = []

    for item in list_a:
        match = process.extractOne(item, list_b, score_cutoff=cutoff)

        if match is None:
            matches.append({"byvar": item, "matchvar": np.nan, "fuzzy_match_score": np.nan})
        else:
            matches.append({"byvar": item, "matchvar": match[0], "fuzzy_match_score": match[1]})

    return pd.DataFrame(matches, columns=columns)


##Function to Implement the Fuzzy Merge
##NOTE: this definition reuses the name of the earlier fuzzy_merge and replaces it once run.
def fuzzy_merge(a, b, fuzz_on, how="left", cutoffs=None):
    """Merge `a` and `b` exactly on `fuzz_on`, then fuzzy-pair the leftovers column by column.

    After a regular outer join, the rows found in only one input are matched
    one by-variable at a time, in the order given by `fuzz_on`. For each
    by-variable, create_crosswalk maps the values still in play on the `a`
    side to their closest value on the `b` side; rows without a match drop
    out of the candidate lists for the following by-variables. An `a` row is
    paired with a `b` row only when its matched values for all by-variables
    together form the key of an unmatched `b` row.

    a, b:    dataframes to merge; neither is modified
    fuzz_on: column name or list of column names present in both frames
    how:     "left", "right", "inner" or "outer"; decides which unpaired
             leftover rows are kept, as in pd.merge. Exact matches are always
             kept.
    cutoffs: one score cutoff per entry of `fuzz_on`, in the same order.
             Values up to 1 are read as a fraction and scaled to fuzzywuzzy's
             0-100 range; larger values are passed through unchanged. The
             default of None exists only so the signature is valid after
             `how="left"`; omitting it raises ValueError.
    returns: one dataframe with a fresh 0..n-1 index. fuzz_match_score is 1.0
             for exact matches, NaN for unpaired rows and, for fuzzy pairs,
             the lowest per-column score divided by 100 (a pair is only as
             good as its weakest column). Fuzzy pairs keep the key values of
             `a`; rows that exist only in `b` keep their own key values.
    raises:  ValueError when `cutoffs` is missing or its length differs from
             the number of by-variables
    """
    #accept a single column name as well as a list of names
    key_cols = list(fuzz_on) if isinstance(fuzz_on, (list, tuple)) else [fuzz_on]
    if cutoffs is None or len(cutoffs) != len(key_cols):
        raise ValueError("cutoffs must hold one score cutoff per column in fuzz_on")

    ##First, Do a Regular Pandas Join & Output Matches and Nonmatches
    #flag copies (assign) so the callers' dataframes do not gain in_a / in_b columns
    merged = pd.merge(a.assign(in_a=1), b.assign(in_b=1), how="outer", on=key_cols)

    from_a = merged.in_a.notnull()
    from_b = merged.in_b.notnull()

    matched = merged[from_a & from_b].copy()
    matched["fuzz_match_score"] = 1.0 #score for all matched is 1.0 by default

    nomatch_a = merged[from_a & ~from_b].copy()
    nomatch_b = merged[~from_a & from_b].copy()

    ##If either side has no nonmatches there is nothing to pair up; keep what `how` asks for
    if nomatch_a.empty or nomatch_b.empty:
        leftovers = []
        if how in ("left", "outer"):
            leftovers.append(nomatch_a)
        if how in ("right", "outer"):
            leftovers.append(nomatch_b)
        leftovers = [rows.reindex(columns=matched.columns) for rows in leftovers if not rows.empty]
        if not leftovers:
            return matched
        return pd.concat([matched] + leftovers, axis=0, ignore_index=True)

    ##Work out which merged columns came from B only.
    ##pd.merge appends "_x" to a column of `a` whose name also exists in `b`.
    a_side = set(key_cols) | set(["in_a"])
    for col in a.columns:
        a_side.add(col if col in merged.columns else "%s_x" % (col,))
    b_only = [col for col in merged.columns if col not in a_side]

    left = nomatch_a.drop(b_only, axis=1)
    right = nomatch_b[key_cols + b_only]

    #helper columns: the B value chosen for each by-variable and the score it got
    match_cols = ["_match_%s" % (key,) for key in key_cols]
    score_cols = ["_score_%s" % (key,) for key in key_cols]
    for match_col, score_col in zip(match_cols, score_cols):
        left[match_col] = pd.Series(np.nan, index=left.index, dtype=object)
        left[score_col] = np.nan

    ##Fuzzy Merge on first by-variable, then second, etc.
    slice_a = left
    slice_b = right
    for i, byvar in enumerate(key_cols):
        cutoff = cutoffs[i]
        if cutoff <= 1:
            cutoff = cutoff * 100

        list_a = slice_a[byvar].unique().tolist()
        list_b = slice_b[byvar].unique().tolist()
        if not list_a or not list_b:
            break #no candidates left on one side; remaining helper columns stay NaN

        crosswalk = create_crosswalk(list_a, list_b, byvar, cutoff)
        crosswalk = crosswalk.dropna(subset=["matchvar"]).set_index("byvar")

        #object dtype keeps the column mergeable with B's string keys even when all missing
        left[match_cols[i]] = left[byvar].map(crosswalk["matchvar"]).astype(object)
        left[score_cols[i]] = left[byvar].map(crosswalk["fuzzy_match_score"])

        #narrow both sides to the rows that are still candidates
        still_paired = left[match_cols[:i + 1]].notnull().all(axis=1)
        slice_a = left[still_paired]
        slice_b = slice_b[slice_b[byvar].isin(crosswalk["matchvar"])]

    ##Merge Dataframe A Nonmatches to Dataframe B Nonmatches on the chosen B key values
    right = right.rename(columns=dict(zip(key_cols, match_cols)))
    fuzzy = pd.merge(left, right, on=match_cols, how=how)

    paired = fuzzy.in_a.notnull() & fuzzy.in_b.notnull()
    fuzzy["fuzz_match_score"] = (fuzzy[score_cols].min(axis=1) / 100.0).where(paired)

    #rows that exist only in B (how="right"/"outer") have no A key values; use B's
    for key, match_col in zip(key_cols, match_cols):
        fuzzy[key] = fuzzy[key].where(fuzzy.in_a.notnull(), fuzzy[match_col])

    ##Drop the helper columns, append to Exact Matches and Return
    fuzzy = fuzzy.reindex(columns=matched.columns)
    return pd.concat([matched, fuzzy], axis=0, ignore_index=True)

In [None]:
def partial_fuzzy_merge(a, b, exact_on, fuzz_on, how="left", cutoffs=None):
    """Placeholder; not implemented yet, so every call returns None.

    NOTE(review): judging by the parameter names this is presumably meant to
    join exactly on `exact_on` and fuzzily on `fuzz_on` - confirm before
    implementing.

    `cutoffs` defaults to None only because a parameter without a default
    cannot follow `how="left"` in a Python signature.
    """