In [1]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import Levenshtein
import jellyfish
import pandas as pd
import operator
from multiprocessing import Pool
from collections import Counter
import re, string
import numpy as np

# LinkSight Location Matching Algo 

Objective: Write a string-matching algorithm that can process 2,000 Philippine locations in under 2 minutes and return the correct result 95% of the time. We use the n-gram method to speed up performance. N-grams are contiguous sequences of n items from a given sample of text or speech. Breaking words and phrases into n-grams is a technique for narrowing the search space when doing fuzzy matching.

## Import Philippine Standard Geographic Code reference file

In [38]:
# Load the gzipped PSGC reference table; `code` is read as an object (string)
# column rather than being parsed as a number.
psgc = pd.read_csv(
    "clean-psgc.csv.gz",
    dtype={'code': 'object'},
    compression="gzip",
    encoding="utf-8",
)

In [39]:
# Store the place names UTF-8 encoded.
psgc["location"] = psgc["location"].str.encode("utf-8")

In [40]:
# Summarize column dtypes and non-null counts of the loaded table.
psgc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52214 entries, 0 to 52213
Data columns (total 4 columns):
code          52214 non-null object
location      52214 non-null object
interlevel    52210 non-null object
original      52214 non-null bool
dtypes: bool(1), object(3)
memory usage: 1.2+ MB


In [41]:
# Preview the first rows of the reference table.
psgc.head()

Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I (ILOCOS REGION),Reg,True
1,10000000,ILOCOS REGION,Reg,False
2,10000000,REGION 1,Reg,False
3,10000000,REGION I,Reg,False
4,12800000,ILOCOS NORTE,Prov,True


In [42]:
# Drop rows whose entire name is one of the placeholder strings, then renumber the index.
psgc = psgc.loc[~psgc.location.isin(["CAPITAL", "NOT A PROVINCE"])].reset_index(drop=True)

In [43]:
# Lower-case the administrative-level labels and store them UTF-8 encoded.
psgc["interlevel"] = psgc["interlevel"].str.lower().str.encode("utf-8")

# Light cleanup only -- these values are kept as the "canonical" names:
# strip the capital / not-a-province markers, parentheses and "CITY (OF)".
# NOTE(review): the bare "CITY" alternative is unanchored, so it would also
# remove "CITY" from inside a longer word -- confirm this is intended.
psgc["location"] = (
    psgc["location"]
    .str.replace(r"NOT A PROVINCE|CAPITAL|\(|\)|CITY OF|CITY", "")
    .str.strip()
)

# Disabled: a second, more aggressively cleaned column.
#psgc['location_clean'] = psgc.location
#psgc.location_clean = psgc.location_clean.str.replace(r"BARANGAY","BGY")
#psgc.location_clean = psgc.location_clean.str.replace(r"POBLACION","POB")
#psgc.location_clean = psgc.location_clean.str.replace("Ñ","N")

In [47]:
# Names that begin with a compass direction (NORTH/SOUTH/EAST/WEST, optionally
# followed by "ERN") and a space. The "ERN" suffix is all-or-nothing, so
# "EASTER " is not treated as a direction. Non-capturing groups are used because
# pandas warns when a pattern passed to str.contains has capturing groups.
nsew = re.compile(r"^(?:NORTH|SOUTH|EAST|WEST)(?:ERN)? ")
# Select the matching rows once. .copy() yields an independent frame, so later
# assignments to nsew_locs are not writes into a slice of psgc.
nsew_locs = psgc[psgc.location.str.contains(nsew)].copy()
# One-letter direction abbreviation: first character of the first word.
nsew_abbrev = nsew_locs.location.str.split().str.get(0).str.slice(0,1)
nsew_abbrev.head()

  from ipykernel import kernelapp as app
  app.launch_new_instance()


7289    E
7290    E
7291    E
7292    E
7306    W
Name: location, dtype: object

In [48]:
# Preview the rows whose name starts with a compass direction.
nsew_locs.head()

Unnamed: 0,code,location,interlevel,original
7289,30812001,EAST CALAGUIMAN,bgy,False
7290,30812001,EAST CALAGUIMAN POB.,bgy,True
7291,30812002,EAST DAANG BAGO POB.,bgy,True
7292,30812002,EAST DAANG BAGO,bgy,False
7306,30812014,WEST CALAGUIMAN POB.,bgy,True


In [49]:
# Work on an explicit copy so the assignments below never target a slice of psgc.
nsew_locs = nsew_locs.copy()
# Alias = one-letter direction + the psgc name with its compass prefix removed.
# The prefix is stripped with the source of the compiled `nsew` pattern, so row
# selection and stripping always use the same expression. Names are read from
# psgc by index, which keeps this cell safe to re-run.
nsew_locs.loc[:, 'location'] = nsew_abbrev.str.cat(
    psgc.loc[nsew_locs.index, "location"].str.replace(nsew.pattern, "").str.strip(),
    sep=" ",
)
# Flag the generated aliases as not being original names.
nsew_locs.loc[:, "original"] = False
nsew_locs.head()

  if __name__ == '__main__':


Unnamed: 0,code,location,interlevel,original
7289,30812001,E CALAGUIMAN,bgy,False
7290,30812001,E CALAGUIMAN POB.,bgy,False
7291,30812002,E DAANG BAGO POB.,bgy,False
7292,30812002,E DAANG BAGO,bgy,False
7306,30812014,W CALAGUIMAN POB.,bgy,False


In [50]:
# Append the abbreviated-direction aliases to the reference table and renumber the index.
psgc = pd.concat([psgc, nsew_locs],ignore_index=True)
psgc.head()

Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I ILOCOS REGION,reg,True
1,10000000,ILOCOS REGION,reg,False
2,10000000,REGION 1,reg,False
3,10000000,REGION I,reg,False
4,12800000,ILOCOS NORTE,prov,True


In [51]:
# Rows without an interlevel (Isabela, Cotabato) are labelled as cities.
psgc.loc[psgc["interlevel"].isnull(), "interlevel"] = u"city"

What we need is a reference file that contains the higher-level administrative territories in separate columns. This is so we can create a single "master string" that we will use for matching. We'll try this instead of matching each component individually.

First, create a dictionary of the rankings of various administrative levels.

In [52]:
# Depth of each administrative-level label (1 = shallowest, 4 = deepest).
adm_rank = {
    u'reg': 1,
    u'prov': 2,
    u'dist': 2,
    u'city': 3,
    u'mun': 3,
    u'municity': 3,
    u'submun': 3,
    u'bgy': 4,
}
# Labels ordered by depth.
adm_rank_list = sorted(adm_rank, key=adm_rank.get)
# Numeric depth of every row, looked up from its interlevel label.
psgc['adm_rank'] = psgc['interlevel'].map(adm_rank)

In [53]:
# Fold "mun" and "city" into the single label "municity", then count rows per label.
psgc["interlevel"] = psgc["interlevel"].replace({"mun": "municity", "city": "municity"})
psgc["interlevel"].value_counts()

bgy         50154
municity     1985
prov           91
reg            64
dist           20
submun         14
Name: interlevel, dtype: int64

We'll apply this later as a separate column.

Create a function that will add to our dataframe columns with the PSGC codes
of each location's higher level administrative territories. We'll then use this to fill the name columns 
with their corresponding place names. Recall the structure of a 9-digit PSG code and how the components correspond to different interlevels:

![PSGC structure](psgc.gif)



In [54]:
def fill_higher_level_codes(df):
    """Add ``adm1_code`` .. ``adm4_code`` columns derived from each row's code.

    For every administrative depth (1 to 4) the row's ``code`` is cut at the
    position where that depth ends and right-padded with ``"0"`` to 9
    characters. A depth's column is filled only for rows whose ``adm_rank`` is
    at least that depth; all other rows keep ``None`` in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string column ``code`` and a numeric column ``adm_rank``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the new columns are added in place.
    """
    # Depth -> stop position of that depth inside the code.
    code_stop_by_level = {1: 2, 2: 4, 3: 6, 4: 9}

    # Each depth is independent of the others, so one pass per depth is enough.
    # Iterating in sorted order fixes the order in which the columns are added.
    for level in sorted(code_stop_by_level):
        code_col = u"adm" + str(level) + u"_code"
        applies = df.adm_rank >= level

        df[code_col] = None
        df.loc[applies, code_col] = (
            df.loc[applies, "code"]
            .str.slice(start=0, stop=code_stop_by_level[level])
            .str.pad(9, side="right", fillchar="0")
            .str.encode("utf-8")
        )

    return df

In [55]:
# Add the adm1_code..adm4_code columns. The helper returns the frame it was
# given, so psgc_unpivot and psgc refer to the same object here.
psgc_unpivot = fill_higher_level_codes(psgc)
psgc_unpivot.head(10)

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,
1,10000000,ILOCOS REGION,reg,False,1,10000000,,,
2,10000000,REGION 1,reg,False,1,10000000,,,
3,10000000,REGION I,reg,False,1,10000000,,,
4,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000.0,,
5,12801000,ADAMS,municity,True,3,10000000,12800000.0,12801000.0,
6,12801001,ADAMS POB.,bgy,True,4,10000000,12800000.0,12801000.0,12801001.0
7,12801001,ADAMS,bgy,False,4,10000000,12800000.0,12801000.0,12801001.0
8,12802000,BACARRA,municity,True,3,10000000,12800000.0,12802000.0,
9,12802001,BANI,bgy,True,4,10000000,12800000.0,12802000.0,12802001.0


In [56]:
# Code -> name lookup table (one row per name listed for a code), keyed by
# `join_code` so it can be merged against each admN_code column.
all_loc_names = psgc_unpivot.loc[:, ['code', 'location']].rename(columns={'code': 'join_code'})
all_loc_names.head()

Unnamed: 0,join_code,location
0,10000000,REGION I ILOCOS REGION
1,10000000,ILOCOS REGION
2,10000000,REGION 1
3,10000000,REGION I
4,12800000,ILOCOS NORTE


Add Region names.

In [57]:
# Attach the name(s) registered for each row's level-1 code as `adm1_name`.
# A code with several names produces one output row per name.
psgc_unpivot = (
    psgc_unpivot
    .merge(
        all_loc_names.rename(columns={'location': 'adm1_name'}),
        how="left",
        left_on="adm1_code",
        right_on="join_code",
    )
    .drop('join_code', axis=1)
)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION


Add Prov names.

In [58]:
# Attach the name(s) registered for each row's level-2 code as `adm2_name`.
# Rows without a level-2 code get no name from this left join.
psgc_unpivot = (
    psgc_unpivot
    .merge(
        all_loc_names.rename(columns={'location': 'adm2_name'}),
        how="left",
        left_on="adm2_code",
        right_on="join_code",
    )
    .drop('join_code', axis=1)
)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,


Add MuniCity names.

In [59]:
# Attach the name(s) registered for each row's level-3 code as `adm3_name`.
# Rows without a level-3 code get no name from this left join.
psgc_unpivot = (
    psgc_unpivot
    .merge(
        all_loc_names.rename(columns={'location': 'adm3_name'}),
        how="left",
        left_on="adm3_code",
        right_on="join_code",
    )
    .drop('join_code', axis=1)
)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,


Add Barangay names.

In [60]:
# Attach the name(s) registered for each row's level-4 code as `adm4_name`.
# Rows without a level-4 code get no name from this left join.
psgc_unpivot = (
    psgc_unpivot
    .merge(
        all_loc_names.rename(columns={'location': 'adm4_name'}),
        how="left",
        left_on="adm4_code",
        right_on="join_code",
    )
    .drop('join_code', axis=1)
)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name,adm4_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,,


In [61]:
# Check the row count and non-null counts after the four name merges.
psgc_unpivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493694 entries, 0 to 493693
Data columns (total 13 columns):
code          493694 non-null object
location      493694 non-null object
interlevel    493694 non-null object
original      493694 non-null bool
adm_rank      493694 non-null int64
adm1_code     493694 non-null object
adm2_code     493438 non-null object
adm3_code     492606 non-null object
adm4_code     478943 non-null object
adm1_name     493694 non-null object
adm2_name     493438 non-null object
adm3_name     492606 non-null object
adm4_name     478943 non-null object
dtypes: bool(1), int64(1), object(11)
memory usage: 49.4+ MB


In [62]:
# Remove rows that are identical across every column, keeping the first occurrence.
psgc_unpivot = psgc_unpivot.drop_duplicates(keep="first")
psgc_unpivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399643 entries, 0 to 493689
Data columns (total 13 columns):
code          399643 non-null object
location      399643 non-null object
interlevel    399643 non-null object
original      399643 non-null bool
adm_rank      399643 non-null int64
adm1_code     399643 non-null object
adm2_code     399387 non-null object
adm3_code     398570 non-null object
adm4_code     386871 non-null object
adm1_name     399643 non-null object
adm2_name     399387 non-null object
adm3_name     398570 non-null object
adm4_name     386871 non-null object
dtypes: bool(1), int64(1), object(11)
memory usage: 40.0+ MB


Create a "location tuple" that collects the names of all of a location's components into a single tuple. We'll use this for fuzzy matching later.

In [63]:
def normalize_text(item):
    """Return a lower-cased, punctuation-free form of ``item`` for matching.

    The rules are applied in this fixed order:

    1. ``ñ`` / ``Ñ`` become ``n``. This must precede step 2, which would
       otherwise delete the character instead of transliterating it.
    2. Every character other than ASCII letters, digits, ``_`` and whitespace
       is removed.
    3. ``barangay`` and ``brgy`` are shortened to ``bgy``.
    4. ``poblacion`` is shortened to ``pob``.

    Parameters
    ----------
    item : str
        Text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    # A sequence rather than a dict, so the order above is guaranteed.
    # "Ñ" is listed as well for input whose lower() does not fold it.
    # NOTE(review): text normalized elsewhere for comparison against these
    # values must apply the same rules -- confirm against callers.
    replacements = (
        ("ñ|Ñ", "n"),
        (r"[^a-zA-Z0-9_\s]", ""),
        ("barangay|brgy", "bgy"),
        ("poblacion", "pob"),
    )

    item = item.lower()
    for pattern, replacement in replacements:
        item = re.sub(pattern, replacement, item)
    return item

In [64]:
def create_loc_tuple_with_code(row):
    """Build the normalized matching tuple for one row.

    The tuple holds, in this order, the normalized ``adm4_name``,
    ``adm3_name``, ``adm2_name``, ``interlevel`` and ``code`` of ``row``.
    Components that are missing (``None`` or NaN) are left out.

    Parameters
    ----------
    row : object
        Anything exposing the five attributes above, e.g. a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    tuple of str
        The normalized, non-missing components.
    """
    def normalize_text(item):
        # Ordered rules: transliterate ñ before the punctuation strip (which
        # would otherwise delete it), then abbreviate barangay / poblacion.
        replacements = (
            ("ñ|Ñ", "n"),
            (r"[^a-zA-Z0-9_\s]", ""),
            ("barangay|brgy", "bgy"),
            ("poblacion", "pob"),
        )

        item = item.lower()
        for pattern, replacement in replacements:
            item = re.sub(pattern, replacement, item)
        return item

    components = [row.adm4_name, row.adm3_name, row.adm2_name, row.interlevel, row.code]
    # pd.isnull covers None and every NaN value, not just the np.nan singleton.
    return tuple(normalize_text(value) for value in components if not pd.isnull(value))

Disregard making location tuples for regions for now.

In [66]:
# Keep every row that is not a region and has a location name.
# `!=` replaces the `<>` operator, which Python 3 no longer accepts.
exclude_regions = psgc_unpivot[(psgc_unpivot.interlevel != "reg") & psgc_unpivot.location.notnull()]
exclude_regions.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name,adm4_name
16,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000,,,REGION I ILOCOS REGION,ILOCOS NORTE,,
17,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000,,,ILOCOS REGION,ILOCOS NORTE,,
18,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000,,,REGION 1,ILOCOS NORTE,,
19,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000,,,REGION I,ILOCOS NORTE,,
20,12801000,ADAMS,municity,True,3,10000000,12800000,12801000.0,,REGION I ILOCOS REGION,ILOCOS NORTE,ADAMS,


In [67]:
# Build the matching tuple row by row for the non-region rows and write it back
# by index; region rows are left without a loc_tuple.
psgc_unpivot.loc[exclude_regions.index, 'loc_tuple'] = exclude_regions.apply(
    create_loc_tuple_with_code,
    axis=1,
)
psgc_unpivot.head(10)

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name,adm4_name,loc_tuple
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,,,
5,10000000,ILOCOS REGION,reg,False,1,10000000,,,,ILOCOS REGION,,,,
6,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION 1,,,,
7,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I,,,,
8,10000000,REGION 1,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,,,
9,10000000,REGION 1,reg,False,1,10000000,,,,ILOCOS REGION,,,,


In [68]:
# Preview the first ten tuples that were actually built (rows without one are skipped).
psgc_unpivot.loc_tuple.dropna()[:10]

16                     (ilocos norte, prov, 012800000)
17                     (ilocos norte, prov, 012800000)
18                     (ilocos norte, prov, 012800000)
19                     (ilocos norte, prov, 012800000)
20          (adams, ilocos norte, municity, 012801000)
21          (adams, ilocos norte, municity, 012801000)
22          (adams, ilocos norte, municity, 012801000)
23          (adams, ilocos norte, municity, 012801000)
24    (adams pob, adams, ilocos norte, bgy, 012801001)
25        (adams, adams, ilocos norte, bgy, 012801001)
Name: loc_tuple, dtype: object

In [69]:
# One (bgy, municity, prov) name triple per code: groupby(...).first() takes
# the first non-null value of each column within the code's rows.
canonical_names = psgc_unpivot.groupby('code')[['adm4_name',"adm3_name","adm2_name"]].first()
# Drop codes that have none of the three names.
# Deliberately no drop_duplicates(): the code is the index here, so
# de-duplicating on the name columns alone would discard distinct codes that
# share the same names and leave them without names in later joins on code.
canonical_names = canonical_names.dropna(how="all").rename(columns={'adm4_name':'bgy','adm3_name':'municity','adm2_name':'prov'})
# Show a preview rather than the whole frame.
canonical_names.head(10)

Unnamed: 0_level_0,bgy,municity,prov
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
012800000,,,ILOCOS NORTE
012801000,,ADAMS,ILOCOS NORTE
012801001,ADAMS POB.,ADAMS,ILOCOS NORTE
012802000,,BACARRA,ILOCOS NORTE
012802001,BANI,BACARRA,ILOCOS NORTE
012802002,BUYON,BACARRA,ILOCOS NORTE
012802003,CABARUAN,BACARRA,ILOCOS NORTE
012802004,CABULALAAN,BACARRA,ILOCOS NORTE
012802005,CABUSLIGAN,BACARRA,ILOCOS NORTE
012802006,CADARATAN,BACARRA,ILOCOS NORTE


In [70]:
# Check how many codes have each of the three canonical names.
canonical_names.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43759 entries, 012800000 to 175917006
Data columns (total 3 columns):
bgy         42028 non-null object
municity    43675 non-null object
prov        43759 non-null object
dtypes: object(3)
memory usage: 1.3+ MB


In [71]:
# Keep one row per distinct (loc_tuple, code) pair, ignoring rows where either is missing.
psgc_locations = (
    psgc_unpivot[['loc_tuple', 'code']]
    .dropna()
    .drop_duplicates(keep="first")
)

psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73447 entries, 16 to 490266
Data columns (total 2 columns):
loc_tuple    73447 non-null object
code         73447 non-null object
dtypes: object(2)
memory usage: 1.7+ MB


In [72]:
# Attach the canonical bgy / municity / prov names, looked up by code
# (canonical_names is indexed by code).
psgc_locations = psgc_locations.merge(
    canonical_names,
    left_on="code",
    right_index=True,
    how="left",
)

psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73447 entries, 16 to 490266
Data columns (total 5 columns):
loc_tuple    73447 non-null object
code         73447 non-null object
bgy          71065 non-null object
municity     73294 non-null object
prov         73400 non-null object
dtypes: object(5)
memory usage: 3.4+ MB


In [73]:
def to_string(row):
    """Return the items of ``row.loc_tuple`` joined into one comma-separated string."""
    separator = ","
    parts = row.loc_tuple
    return separator.join(parts)

In [74]:
# Replace each tuple with its comma-joined string form.
psgc_locations['loc_tuple'] = psgc_locations.apply(to_string,axis=1)

In [77]:
# Everything before the last comma, i.e. the joined string without its final component.
psgc_locations['candidate_terms'] = (
    psgc_locations['loc_tuple']
    .str.rsplit(',', n=1)
    .str.get(0)
)

In [78]:
# Use the joined location string as the row index.
psgc_locations = psgc_locations.set_index('loc_tuple')
psgc_locations.head()

Unnamed: 0_level_0,code,bgy,municity,prov,candidate_terms
loc_tuple,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"ilocos norte,prov,012800000",12800000,,,ILOCOS NORTE,"ilocos norte,prov"
"adams,ilocos norte,municity,012801000",12801000,,ADAMS,ILOCOS NORTE,"adams,ilocos norte,municity"
"adams pob,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,"adams pob,adams,ilocos norte,bgy"
"adams,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,"adams,adams,ilocos norte,bgy"
"bacarra,ilocos norte,municity,012802000",12802000,,BACARRA,ILOCOS NORTE,"bacarra,ilocos norte,municity"


In [80]:
# Final check of row and non-null counts before the table is written out.
psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73447 entries, ilocos norte,prov,012800000 to san isidro,imelda,romblon,bgy,175917006
Data columns (total 5 columns):
code               73447 non-null object
bgy                71065 non-null object
municity           73294 non-null object
prov               73400 non-null object
candidate_terms    73447 non-null object
dtypes: object(5)
memory usage: 3.4+ MB


In [81]:
# Write the finished reference table as a gzip-compressed CSV.
psgc_locations.to_csv(
    'psgc-locations.csv.gz',
    compression="gzip",
)