In [4]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import Levenshtein
import jellyfish
import pandas as pd
import operator
from multiprocessing import Pool
from collections import Counter
import re, string
import numpy as np
pd.set_option('display.max_rows', 500)

# LinkSight Location Matching Algo 

Objective: Write a string matching algo that can process 2000 Philippine location in under 2 minutes and return the correct result 95% of the time. Using n-grams method to speed up performance. N-grams are contiguous sequences of n items from a given sample of text or speech. Breaking words and phrases into n-grams is a technique for narrowing the search space for doing fuzzy matching, which is computationally expensive.

## Import Philippine Standard Geographic Code reference file

In [5]:
psgc = pd.read_csv('psgc-locations.csv.gz',compression="gzip")
psgc.candidate_terms = psgc.loc_tuple.str.encode('utf-8').str.split(",").apply(tuple)
psgc.head()

Unnamed: 0,loc_tuple,code,bgy,municity,prov,candidate_terms
0,"ilocos norte,prov,012800000",12800000,,,ILOCOS NORTE,"(ilocos norte, prov, 012800000)"
1,"adams,ilocos norte,municity,012801000",12801000,,ADAMS,ILOCOS NORTE,"(adams, ilocos norte, municity, 012801000)"
2,"adams pob,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,"(adams pob, adams, ilocos norte, bgy, 012801001)"
3,"adams,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,"(adams, adams, ilocos norte, bgy, 012801001)"
4,"bacarra,ilocos norte,municity,012802000",12802000,,BACARRA,ILOCOS NORTE,"(bacarra, ilocos norte, municity, 012802000)"


## Create N-Gram Table

Turn the reference file into a dictionary of n-grams and their associated loc-phrases.

The purpose of this is to narrow down the number of candidate terms for fuzzy matching. It takes too long to do fuzzy matching on all 55k+ locations in the reference dataset. Instead, we'll break the lowest level item in each location phrase into 2-part n-grams. We'll then create a dictionary in which the keys are the unique n-grams and the values are all the location phrases that contain at least one instance of the said n-gram.

In [43]:
#Helper function that creates NGrams. does not include spaces

def makeNgram(string,n):
    string = re.sub("[^a-zA-Z0-9]+","",string.lower())
    ngrams = []
    for i in range(0,len(string)-(n-1)):
        ngram = string[i:i+n]
        ngrams.append(ngram)
    return list(set(ngrams)) #return only the unique n-grams. the same string can have repeats

In [44]:
def generate_ngram_table(loc_tuples,n):
    
    # create the dict
    ngram_table = {}
    
    #for each unique location phrase
    
    for loc in loc_tuples:
        
        #take each unique part in that tuple and extract the n-grams
        
        item = loc[0]
            
        item = re.sub("[^a-zA-Z0-9]+",u"",item.lower())
            
        #for each of these parts, extract the n-grams
            
        for i in range(0,len(item)-(n-1)):
            
            ngram = item[i:i+n].lower()
                
            #if the n-gram is not yet in the table, add it as a new key for which value is empty list
                
            if ngram not in ngram_table.keys():                                       
                ngram_table[ngram] = {loc}
                
            else:
                ngram_table[ngram].add(loc)
                
    return ngram_table

In [45]:
loc_phrase_ngram_table = generate_ngram_table(list(psgc.candidate_terms.dropna(how="all").unique()),2)

In [46]:
len(loc_phrase_ngram_table)

953

# Matching algorithm

There are four functions that work together to produce the matching algo:

1. **searchThruReference** - Takes a searchString then narrows down and scores possible matches based on common n-grams.
    - searchTuple - a tuple of strings you're trying to match. last item is always the administrative level.
    - ngram_table - the reference table
    - nresults - if no exact match found, max N results you want returned        
2. **searchThruShortlist** - Helper to `searchThruReference`. Applies scoring to top N most likely candidates.
3. **scoreMatches** - Helper to `searchThruShortlist`. Scores candidate pairs using various fuzzy ratios.
4. **getMatches** - Applies above algo to a list of search strings and returns a dataframe of sources and top N matches.


In [47]:
def searchThruReference(searchTuple,ngram_table,nresults):
    
    possibleMatches = []
    
    # turn the search string into ngrams based on the length of ngrams in the reference table
    
    n = len(ngram_table.keys()[0])
    
    ss_ngrams = []
        
    # get the unique n-grams in the first item in the search tuple
    
    ss_ngrams = set(makeNgram(searchTuple[0],n))
    
    for ngram in list(ss_ngrams):        
        
        # look each n-gram up in the hash list and add the values as possibleMatches
        
        if ngram in ngram_table:
            
            possibleMatches += ngram_table[ngram]
    
    #eliminate the candidates that have very few n-grams in common with the search terms
    
    threshold = len(ss_ngrams)/3    
    
    mostPossible = [k for k, v in Counter(possibleMatches).items() if v >= threshold]

    # calculate similarity scores of search tuples with each candidate among possible matches
    
    results = pd.DataFrame(searchThruShortlist(searchTuple,mostPossible)).rename(columns={0:'source',1:'match',2:'psgc',3:'score'})
    
    # deduplicate results based on psgc code
    
    topresults = results.sort_values(by="score",ascending=False).reset_index(drop=True).drop_duplicates("psgc",keep="first")[:nresults]
    
    return topresults 
        

In [48]:
def searchThruShortlist(searchTuple,shortlist):
    
    #find exact matches first
    
    exact_match = [candidate for candidate in shortlist if searchTuple == candidate[:-1]]
    
    if exact_match <> []:
        
        psgc_code = exact_match[0][-1]
        
        #exact matches result in perfect score and a single row returned
        
        return pd.DataFrame([(searchTuple, exact_match[0], psgc_code,100)])
                

    else:

        #pair searchString with each possible match

        candidate_pairs = []
        for candidateTuple in shortlist:
            candidate_pairs.append((searchTuple,candidateTuple))

        #use multiprocessing to run fuzzy matching
        pool2 = Pool(2) 
        results = pool2.map(scoreMatches, candidate_pairs)
        pool2.close()
        pool2.join()

        return results

In [50]:
def scoreMatches(tuplePairs,firstItemRatioWgt=0.6,otherItemsRatioWgt=.4,admLevelMatchWgt=1.2):
    
    #split both the searchString and candidateString into their name and interlevel components. 

    searchTuple, candidateTuple = tuplePairs
    searchTerms, searchAdm = searchTuple[:-1], searchTuple[-1]
    candidateTuple = tuplePairs[1]
    candidateTerms, candidateAdm, candidateCode = candidateTuple[:-1], candidateTuple[-2], candidateTuple[-1]

    #if a searchString and the candidate have the same administrative level, this improve the resulting score by a multiplier
    admLevelMatchScore = (admLevelMatchWgt if searchAdm == candidateAdm else 1)
    
    #check on set ratio between secondary search terms
    otherItemsRatio = fuzz.ratio(unicode(" ".join(searchTerms[0:])),unicode(" ".join(candidateTerms[0:])))
    
    #check on jw distance ratio between the very first items in searchString and candidateStrings
    firstItemRatio = jellyfish.jaro_winkler(unicode(searchTerms[0]),unicode(candidateTerms[0])) * 100
                
    #Create a weighted score for the match with weights for each input.
    score = (
        ((firstItemRatio*firstItemRatioWgt) + (otherItemsRatio*otherItemsRatioWgt)) / admLevelMatchWgt
        *  admLevelMatchScore
        )
    
    results = (searchTuple,candidateTuple,candidateCode,score)
    return results

Test on a single string:

In [51]:
searchThruReference((u"polilio",u"quezon",u"municity"),loc_phrase_ngram_table,10) #shortlist first, then check for exact

Unnamed: 0,source,match,psgc,score
0,"(polilio, quezon, municity)","(polillo, quezon, municity, 045636000)",45636000,84.571429
1,"(polilio, quezon, municity)","(iloilo, iloilo, municity, 063022000)",63022000,66.952381
2,"(polilio, quezon, municity)","(pola, oriental mindoro, municity, 175210000)",175210000,66.5
3,"(polilio, quezon, municity)","(polilio, cabiao, nueva ecija, bgy, 034904023)",34904023,65.0
4,"(polilio, quezon, municity)","(polilio, cabanatuan, nueva ecija, bgy, 034903...",34903063,64.0
5,"(polilio, quezon, municity)","(liliw, laguna, municity, 043410000)",43410000,63.428571
6,"(polilio, quezon, municity)","(poblacion, polillo, quezon, bgy, 045636015)",45636015,62.597884
7,"(polilio, quezon, municity)","(pilion, polillo, quezon, bgy, 045636013)",45636013,62.547619
8,"(polilio, quezon, municity)","(malilipot, albay, municity, 050509000)",50509000,62.196825
9,"(polilio, quezon, municity)","(poblacion, lopez, quezon, bgy, 045622095)",45622095,61.931217


In [52]:
searchThruReference((u"polillo",u"quezon",u"municity"),loc_phrase_ngram_table,10) #shortlist first, then check for exact

Unnamed: 0,source,match,psgc,score
0,"(polillo, quezon, municity)","(polillo, quezon, municity, 045636000)",45636000,100


In [53]:
def getMatches(searchTupleList,ngram_table,nresults):
    all_matches = []
    for searchTuple in searchTupleList:
        searchTuple = tuple([item.lower() for item in searchTuple])
        searchTupleMatches = pd.DataFrame(searchThruReference(searchTuple,ngram_table,nresults))
        all_matches.append(searchTupleMatches)
#    return all_matches    
    return pd.concat(all_matches,ignore_index=True).rename(columns={0:'source',1:'match',2:'psgc',3:'firstItemRatio',4:'otherItemsRatio',5:'admLevelMatch',6:'total'})

In [54]:
sample_list = [(u"Ilocos Sur",u"prov"),
               (u"Fort Bonifacio",u"Taguig",u"bgy"),
               (u"Baguio City",u"Benguet",u"city"),
               (u"Dagatan","Lipa", "Batangas", u"prov"),
               (u"Zamboanga City",u"city"),
               (u"Zamboanga Sibugay",u"prov"),
               ("Aga", "Delfin Albano",u"Isabela","bgy"),
               ("Ahin","Ifugao","bgy"),
               ("Santa Catalina","Lubao","Pampanga","bgy"),
               ("Dampalit", "Malabon","bgy"),
               ("San Pablo","Lagun","city"),
               ("Bgy 105","Caloocan","bgy"),
               ("brgy pasong tamo","quezon","bgy"),
               ("Tagaytay","Cavite","bgy")]

In [42]:
getMatches(sample_list,
           ngram_table=loc_phrase_ngram_table,
           nresults=5)

Unnamed: 0,source,match,psgc,score
0,"(ilocos sur, prov)","(ilocos sur, prov, 012900000)",12900000,100.0
1,"(fort bonifacio, taguig, bgy)","(fort bonifacio, taguig, ncr fourth district, ...",137607020,85.6
2,"(fort bonifacio, taguig, bgy)","(andres bonifacio, baguio, benguet, bgy, 14110...",141102117,72.046104
3,"(fort bonifacio, taguig, bgy)","(andres bonifacio, javier, leyte, bgy, 083724002)",83724002,70.046104
4,"(fort bonifacio, taguig, bgy)","(andres bonifacio, diffun, quirino, bgy, 02570...",25703001,69.246104
5,"(fort bonifacio, taguig, bgy)","(andres bonifacio, san joaquin, iloilo, bgy, 0...",63040002,68.846104
6,"(baguio city, benguet, city)","(baguio, benguet, municity, 141102000)",141102000,67.787879
7,"(baguio city, benguet, city)","(baguio, tayabas, quezon, bgy, 045647013)",45647013,65.121212
8,"(baguio city, benguet, city)","(bagu, bakun, benguet, bgy, 141103002)",141103002,64.636364
9,"(baguio city, benguet, city)","(baguinloc, anao, tarlac, bgy, 036901001)",36901001,60.808081


# Notes

## Advantages of this method:
- Improved accuracy.
- Can handle more edge cases of missing location components or misidentified interlevels

## To do list:
- Still gotta improve speed
- Streamline final steps of exporting file