#### Prototype FuzzyPandas Functions using U.S. News and IPEDS Data

In [1]:
import time
import functools

import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

**First, let's read in the school ranking data scraped from usnews.com & do a little pre-processing**

In [2]:
def read_us_news(pickle, existing=None):
    """Read one pickled ranking table and append it to an accumulating frame.

    Parameters
    ----------
    pickle : str
        Path to a pickled DataFrame, read with ``pd.read_pickle``.
    existing : pandas.DataFrame, optional
        Frame the new rows are appended to. When omitted, the notebook-level
        ``us_news`` frame is used, so ``us_news = read_us_news(path)`` keeps
        accumulating exactly as before.

    Returns
    -------
    pandas.DataFrame
        ``existing`` followed by the newly read rows, re-indexed 0..n-1.
        Neither input frame is modified.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only read trusted files.
    df = pd.read_pickle(pickle)
    if existing is None:
        existing = us_news  # fall back to the notebook-level accumulator
    return pd.concat([existing, df], axis=0, ignore_index=True)

# One pickle per U.S. News ranking list; they are stacked into a single frame below.
ranking_files = [
    "example_data/us_news/usnews-ranking-national-universities.pickle",
    "example_data/us_news/usnews-ranking-national-liberal-arts-colleges.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-midwest.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-north.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-south.pickle",
    "example_data/us_news/usnews-ranking-regional-colleges-west.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-midwest.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-north.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-south.pickle",
    "example_data/us_news/usnews-ranking-regional-universities-west.pickle",
]

us_news = pd.DataFrame()  # start empty; each read_us_news() call appends to this frame
for ranking_file in ranking_files:
    us_news = read_us_news(ranking_file)

# "location" is split on ", ": the first piece becomes the city, the second the state.
location_parts = us_news.location.apply(lambda loc: loc.split(", "))
us_news["city"] = location_parts.apply(lambda parts: parts[0].upper().strip())
us_news["state"] = location_parts.apply(lambda parts: parts[1].upper().strip())
us_news["school"] = us_news.school.apply(lambda name: name.upper().strip())  # school name to upper case

# The raw location column is redundant once city and state exist.
us_news = us_news.drop("location", axis=1)
us_news.head(5)

Unnamed: 0,category,school,score,city,state
0,National Universities,PRINCETON UNIVERSITY,100 out of 100.,PRINCETON,NJ
1,National Universities,HARVARD UNIVERSITY,99 out of 100.,CAMBRIDGE,MA
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY
4,National Universities,STANFORD UNIVERSITY,95 out of 100.,STANFORD,CA


**Read in IPEDS Data & Preprocess. IPEDS Data can be downloaded for free here:**
https://nces.ed.gov/ipeds/datacenter/Default.aspx

In [3]:
# Keep only the id, name, city and state columns of the IPEDS directory file,
# renamed to match the U.S. News column names.
ipeds_column_names = {"UNITID": "unitid", "INSTNM": "school", "CITY": "city", "STABBR": "state"}

ipeds = pd.read_csv("example_data/ipeds/HD2014.csv")[["UNITID", "INSTNM", "CITY", "STABBR"]]
ipeds = ipeds.rename(columns=ipeds_column_names)

# Upper-case and strip every text column so both sources are normalised the same way.
for text_column in ["school", "city", "state"]:
    ipeds[text_column] = ipeds[text_column].apply(lambda value: value.upper().strip())
ipeds.head(5)

Unnamed: 0,unitid,school,city,state
0,100636,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,AL
1,100654,ALABAMA A & M UNIVERSITY,NORMAL,AL
2,100663,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,AL
3,100690,AMRIDGE UNIVERSITY,MONTGOMERY,AL
4,100706,UNIVERSITY OF ALABAMA IN HUNTSVILLE,HUNTSVILLE,AL


**Subset Data to Only NY and CT Schools for Testing**

In [4]:
# Restrict both sources to the same two states for testing.
test_states = ["CT", "NY"]
test_us_news = us_news.loc[us_news.state.isin(test_states)].copy()
test_ipeds = ipeds.loc[ipeds.state.isin(test_states)].copy()

# Shape of each full frame next to the shape of its subset.
for full_frame, test_frame in [(us_news, test_us_news), (ipeds, test_ipeds)]:
    print("%s %s" % (full_frame.shape, test_frame.shape))

(1506, 5) (142, 5)
(7687, 4) (574, 4)


#### Basic Fuzzy Matching Functions

In [None]:
def decode_text(text):
    """Return ``text`` as a plain-ASCII native string when it is unicode text.

    Unicode text is encoded to ASCII with every non-ASCII character replaced
    by ``"?"``. Any other value (byte strings, numbers, ``None``, ...) is
    returned unchanged.

    Works on both Python 2 (where the ``unicode`` type exists) and Python 3
    (where ``str`` is the unicode text type); the result is always the
    interpreter's native ``str`` type.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: every str is unicode text

    # isinstance (rather than an exact type comparison) also covers subclasses.
    if not isinstance(text, text_type):
        return text

    encoded = text.encode("ascii", "replace")
    # Python 2: encode() already returns the native str.
    # Python 3: encode() returns bytes, so decode back to the native str.
    return encoded if isinstance(encoded, str) else encoded.decode("ascii")

def get_best_match(value, choices, scorer, cutoff):
    """Placeholder that is not implemented yet: ignores its arguments and returns None.

    TODO: implement. The name and signature suggest it should pick the entry
    of ``choices`` that scores best against ``value`` -- confirm before use.
    """
    return None

def match_keys(dfA, dfB, key, scorer=fuzz.ratio, cutoffs=0.6):
    """Build a fuzzy-match crosswalk for one key column.

    Every distinct value of ``dfA[key]`` is compared with every distinct value
    of ``dfB[key]`` and paired with the highest-scoring one.

    Parameters
    ----------
    dfA, dfB : pandas.DataFrame
        Frames that both contain the column ``key``. Neither is modified.
    key : str
        Name of the column to match on.
    scorer : callable
        ``scorer(a, b) -> number``; higher means more similar.
    cutoffs : number
        A candidate is accepted only if its score is strictly greater than
        this value. NOTE(review): fuzzywuzzy scorers return 0-100, so the
        0.6 default rejects almost nothing -- pass e.g. 60 for a real
        threshold.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``dfA[key]`` value with the columns ``key``,
        ``key + "_match"`` (``None`` when nothing beats the cutoff) and
        ``key + "_match_score"`` (0 when there is no match).
    """
    valuesA = dfA.drop_duplicates(subset=key)[key].tolist()
    valuesB = dfB.drop_duplicates(subset=key)[key].tolist()

    # Decode the candidates once instead of once per comparison.
    candidates = [(valueB, decode_text(valueB)) for valueB in valuesB]

    match_col = key + "_match"
    score_col = key + "_match_score"

    records = []
    for valueA in valuesA:
        textA = decode_text(valueA)
        # Reset per value so an unmatched value never inherits the previous match.
        best_match, best_score = None, 0
        for valueB, textB in candidates:
            score = scorer(textA, textB)
            if score > cutoffs and score > best_score:
                best_match, best_score = valueB, score
        records.append({key: valueA, match_col: best_match, score_col: best_score})

    # Explicit columns keep the layout stable, including for empty input.
    return pd.DataFrame(records, columns=[key, match_col, score_col])

## Open question: match the keys hierarchically (as below), or just concatenate the keys and match once?
def hmatch_keys(dfA, dfB, keys, scorer=fuzz.ratio, cutoffs=None):
    """Build a hierarchical fuzzy-match crosswalk over several key columns.

    Keys are matched in the order given. For each distinct key combination in
    ``dfA`` the first key is matched against all distinct combinations in
    ``dfB``; each later key is matched only against the ``dfB`` combinations
    whose earlier keys equal the matches already chosen. Once a level finds no
    match, all later levels are left unmatched as well.

    Parameters
    ----------
    dfA, dfB : pandas.DataFrame
        Frames that both contain every column in ``keys``. Neither is modified.
    keys : list of str
        Column names, most general first.
    scorer : callable
        ``scorer(a, b) -> number``; higher means more similar.
    cutoffs : number or list of numbers, optional
        Per-key thresholds (a single number is applied to every key). A
        candidate is accepted only if its score is strictly greater than the
        cutoff. Defaults to 0.6 for every key. NOTE(review): fuzzywuzzy
        scorers return 0-100, so 0.6 rejects almost nothing -- pass e.g. 60
        for a real threshold.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``dfA`` key combination with the columns
        ``keys`` followed by ``<key>_match`` (``None`` when unmatched) and
        ``<key>_match_score`` (0 when unmatched) for every key.

    Raises
    ------
    ValueError
        If ``cutoffs`` is a sequence whose length differs from ``keys``.
    """
    start = time.time()

    keys = list(keys)
    if cutoffs is None:
        cutoffs = [0.6] * len(keys)
    elif not isinstance(cutoffs, (list, tuple)):
        cutoffs = [cutoffs] * len(keys)
    if len(cutoffs) != len(keys):
        raise ValueError("cutoffs must contain exactly one value per key")

    # "records" gives one dict per row and, unlike .T.to_dict(), does not
    # collapse rows that share an index label.
    rowsA = dfA.drop_duplicates(subset=keys)[keys].to_dict("records")
    rowsB = dfB.drop_duplicates(subset=keys)[keys].to_dict("records")

    for rowA in rowsA:
        candidates = rowsB  # narrowed after every key level
        for key, cutoff in zip(keys, cutoffs):
            textA = decode_text(rowA[key])
            # Reset per level so a miss never inherits an earlier match.
            match, max_score = None, 0

            for rowB in candidates:
                score = scorer(textA, decode_text(rowB[key]))
                if score > cutoff and score > max_score:
                    match, max_score = rowB[key], score

            rowA[key + "_match"] = match
            rowA[key + "_match_score"] = max_score

            # Later keys may only match rows that agree with this level's match.
            if match is None:
                candidates = []
            else:
                candidates = [rowB for rowB in candidates if rowB[key] == match]

    end = time.time()
    print("Duration:  %s Seconds" % (end - start))

    columns = list(keys)
    for name in keys:
        columns.extend([name + "_match", name + "_match_score"])
    # Explicit columns keep the layout stable, including for empty input.
    return pd.DataFrame(rowsA, columns=columns)

In [None]:
def _rows_without_exact_partner(frame, other, keys):
    """Return the rows of ``frame`` whose ``keys`` values do not occur in ``other``."""
    flag = "_has_exact_partner"
    partner_keys = other[keys].drop_duplicates().copy()
    partner_keys[flag] = 1
    flagged = pd.merge(frame, partner_keys, on=keys, how="left")
    return flagged[flagged[flag].isnull()].drop(flag, axis=1).copy()


def _combine_keys(frame, keys):
    """Join each row's key values into one space-separated string."""
    columns = [frame[name].tolist() for name in keys]
    return [" ".join(str(decode_text(value)) for value in values) for values in zip(*columns)]


def fuzzy_merge(dfA, dfB, key, how="left", scorer=fuzz.ratio, cutoff=None, force_single=False):
    """Merge two frames on ``key``: exactly where possible, fuzzily elsewhere.

    Rows whose key values agree exactly are joined with a regular merge. The
    remaining rows of ``dfA`` are paired with the remaining rows of ``dfB``
    through a fuzzy-match crosswalk, then appended to the exact matches.

    Parameters
    ----------
    dfA, dfB : pandas.DataFrame
        Frames to merge. Neither is modified.
    key : str or list of str
        Column name(s) present in both frames.
    how : {"left", "right", "outer", "inner"}
        Which leftover rows (those with neither an exact nor a fuzzy partner)
        are kept, with the same meaning as in ``pd.merge``.
    scorer : callable
        ``scorer(a, b) -> number``; higher means more similar.
    cutoff : number or list of numbers, optional
        Minimum score (exclusive) for a fuzzy match. ``None`` uses the
        defaults of ``match_keys`` / ``hmatch_keys``. A list (one value per
        key) is only meaningful for hierarchical multi-key matching.
    force_single : bool
        With several keys, ``False`` matches them hierarchically via
        ``hmatch_keys``; ``True`` concatenates them into one string and
        matches that once via ``match_keys``.

    Returns
    -------
    pandas.DataFrame
        Exact matches followed by the fuzzy-merged rows. When fuzzy matching
        was attempted, ``<key>_match`` holds the ``dfB`` key value that was
        paired and ``*_match_score`` its score; both are missing (NaN) on
        exactly matched rows. With ``force_single`` the single score column
        is named ``combined_key_match_score``. As in ``pd.merge``, non-key
        columns present in both frames get ``_x`` / ``_y`` suffixes.

    Notes
    -----
    Relies on ``match_keys`` / ``hmatch_keys`` returning the columns
    ``<key>``, ``<key>_match`` and ``<key>_match_score``.
    """
    keys = list(key) if isinstance(key, (list, tuple)) else [key]

    nomatchA = _rows_without_exact_partner(dfA, dfB, keys)
    nomatchB = _rows_without_exact_partner(dfB, dfA, keys)

    if len(nomatchA) == 0 or len(nomatchB) == 0:
        # One side has nothing left to pair up, so a regular merge is the whole answer.
        return pd.merge(dfA, dfB, on=keys, how=how)

    matched = pd.merge(dfA, dfB, on=keys, how="inner")  # exact matches

    if len(keys) == 1:  # Fuzzy matching - single key
        match_on = keys
        if cutoff is None:
            keyfile = match_keys(nomatchA, nomatchB, keys[0], scorer)
        else:
            keyfile = match_keys(nomatchA, nomatchB, keys[0], scorer, cutoff)

    elif force_single:  # Fuzzy matching - multi key, non-hierarchical
        match_on = ["combined_key"]
        nomatchA["combined_key"] = _combine_keys(nomatchA, keys)
        nomatchB["combined_key"] = _combine_keys(nomatchB, keys)
        if cutoff is None:
            keyfile = match_keys(nomatchA, nomatchB, "combined_key", scorer)
        else:
            keyfile = match_keys(nomatchA, nomatchB, "combined_key", scorer, cutoff)

    else:  # Fuzzy matching - multi key, hierarchical
        match_on = keys
        if cutoff is None or isinstance(cutoff, (list, tuple)):
            cutoffs = cutoff
        else:
            cutoffs = [cutoff] * len(keys)  # same threshold at every level
        keyfile = hmatch_keys(nomatchA, nomatchB, keys, scorer, cutoffs)

    # DF A -> crosswalk on A's key values, then crosswalk -> DF B on the matched values.
    # B's key columns are renamed to "<key>_match" so they line up with the
    # crosswalk and do not collide with A's key columns.
    match_cols = [name + "_match" for name in match_on]
    renamed = dict((name, name + "_match") for name in set(keys + match_on))
    fuzzyMerged1 = pd.merge(nomatchA, keyfile, on=match_on, how="left")
    fuzzyMerged2 = pd.merge(fuzzyMerged1, nomatchB.rename(columns=renamed), on=match_cols, how=how)

    # Rows that exist only in DF B have no A-side key; fall back to B's own
    # key so they look like the B-only rows of a regular merge.
    for name in keys:
        has_key = fuzzyMerged2[name].notnull()
        fuzzyMerged2[name] = fuzzyMerged2[name].where(has_key, fuzzyMerged2[name + "_match"])

    # Drop the helper columns of the concatenated-key path.
    helpers = [col for col in ("combined_key", "combined_key_match") if col in fuzzyMerged2.columns]
    fuzzyMerged2 = fuzzyMerged2.drop(helpers, axis=1)

    # Append fuzzy matches to exact matches.
    return pd.concat([matched, fuzzyMerged2], axis=0, ignore_index=True)