In [1]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import Levenshtein
import jellyfish
import pandas as pd
import operator
from multiprocessing import Pool
from collections import Counter
import re, string
import numpy as np

# Prepare PSGC Reference File

Objective: Write a string matching algo that can process 2000 Philippine location in under 2 minutes and return the correct result 95% of the time. Using n-grams method to speed up performance. N-grams are contiguous sequences of n items from a given sample of text or speech. Breaking words and phrases into n-grams is a technique for narrowing the search space when doing fuzzy matching.

## Import Philippine Standard Geographic Code reference file

In [2]:
psgc = pd.read_csv("clean-psgc.csv.gz",dtype={'code':'object'},compression="gzip",encoding="utf-8")

In [3]:
psgc.location = psgc.location.str.encode("utf-8")

In [4]:
psgc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52214 entries, 0 to 52213
Data columns (total 4 columns):
code          52214 non-null object
location      52214 non-null object
interlevel    52210 non-null object
original      52214 non-null bool
dtypes: bool(1), object(3)
memory usage: 1.2+ MB


In [5]:
psgc.head()

Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I (ILOCOS REGION),Reg,True
1,10000000,ILOCOS REGION,Reg,False
2,10000000,REGION 1,Reg,False
3,10000000,REGION I,Reg,False
4,12800000,ILOCOS NORTE,Prov,True


In [6]:
psgc = psgc[psgc.location.isin(["CAPITAL","NOT A PROVINCE"])==False].reset_index(drop=True)

In [7]:
#interlevel lower
psgc.interlevel = psgc.interlevel.str.lower().str.encode("utf-8")

#Clean location column a bit. but not too much because we'll use this as the "canonical" name
psgc['location'] = psgc.location.str.replace(r"NOT A PROVINCE|CAPITAL|\(|\)|CITY OF|CITY","").str.strip()
psgc = psgc.drop_duplicates(subset=["code","location","interlevel"],keep="first")

In [8]:
# rename districts as simply "Metro Manila", "Metropolitan Manila", "National Capital Region" or "NCR"

ncr = psgc[(psgc.code == "130000000")]
#ncr.loc[:,"location"] = ncr.location.str.split(",").str.get(0).str.strip()
ncr.loc[:,"location"] = ncr.location.str.replace("NATIONAL  REGION","NATIONAL CAPITAL REGION")
ncr.loc[:,"location"] = ncr.location.str.replace("NATIONAL  REGION","NATIONAL CAPITAL REGION")
ncr = ncr.append(pd.Series({"code":"130000000","location":"METRO MANILA","interlevel":"reg","original":False}),ignore_index=True)
ncr = ncr.append(pd.Series({"code":"130000000","location":"METROPOLITAN MANILA","interlevel":"reg","original":False}),ignore_index=True)
ncr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,code,location,interlevel,original
0,130000000,NATIONAL CAPITAL REGION NCR,reg,True
1,130000000,NCR,reg,False
2,130000000,NATIONAL CAPITAL REGION,reg,False
3,130000000,METRO MANILA,reg,False
4,130000000,METROPOLITAN MANILA,reg,False


In [9]:
# replace districts and NCR from reference file for now with cleaned up NCR rows

psgc = psgc[psgc.interlevel <> 'dist'].reset_index(drop=True)
psgc = psgc[psgc.code <> '130000000'].reset_index(drop=True)
print len(psgc)
psgc = psgc.append(ncr,ignore_index=True)
print len(psgc)
psgc.head()

51936
51941


Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I ILOCOS REGION,reg,True
1,10000000,ILOCOS REGION,reg,False
2,10000000,REGION 1,reg,False
3,10000000,REGION I,reg,False
4,12800000,ILOCOS NORTE,prov,True


In [10]:
psgc[psgc.code == "130000000"]

Unnamed: 0,code,location,interlevel,original
51936,130000000,NATIONAL CAPITAL REGION NCR,reg,True
51937,130000000,NCR,reg,False
51938,130000000,NATIONAL CAPITAL REGION,reg,False
51939,130000000,METRO MANILA,reg,False
51940,130000000,METROPOLITAN MANILA,reg,False


In [11]:
# add as aliases abbreviations of north, south, east, west. for example, northern samar will have an alias n. samar.

nsew = re.compile(r"^NORTH(ERN)? |^SOUTH(ERN)? |^EAST(ERN?)? |^WEST(ERN)? ")
nsew_abbrev = psgc[psgc.location.str.contains(nsew)].location.str.split().str.get(0).str.slice(0,1)
nsew_abbrev.head()



7271    E
7272    E
7273    E
7274    E
7288    W
Name: location, dtype: object

In [12]:
nsew_locs = psgc[psgc.location.str.contains(nsew)]
nsew_locs.head()

  if __name__ == '__main__':


Unnamed: 0,code,location,interlevel,original
7271,30812001,EAST CALAGUIMAN,bgy,False
7272,30812001,EAST CALAGUIMAN POB.,bgy,True
7273,30812002,EAST DAANG BAGO POB.,bgy,True
7274,30812002,EAST DAANG BAGO,bgy,False
7288,30812014,WEST CALAGUIMAN POB.,bgy,True


In [13]:
nsew_locs.loc[:,'location'] = nsew_abbrev.str.cat(psgc[psgc.location.str.contains(nsew)].location.str.replace("^NORTH(ERN)? |^SOUTH(ERN)? |^EAST(ERN?)? |^WEST(ERN)? ","").str.strip(),sep=" ")
nsew_locs.loc[:,"original"] = False
nsew_locs.head()

  if __name__ == '__main__':


Unnamed: 0,code,location,interlevel,original
7271,30812001,E CALAGUIMAN,bgy,False
7272,30812001,E CALAGUIMAN POB.,bgy,False
7273,30812002,E DAANG BAGO POB.,bgy,False
7274,30812002,E DAANG BAGO,bgy,False
7288,30812014,W CALAGUIMAN POB.,bgy,False


In [14]:
psgc = pd.concat([psgc, nsew_locs],ignore_index=True)
psgc.head()

Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I ILOCOS REGION,reg,True
1,10000000,ILOCOS REGION,reg,False
2,10000000,REGION 1,reg,False
3,10000000,REGION I,reg,False
4,12800000,ILOCOS NORTE,prov,True


In [15]:
#fill interlevels for isabela, cotabato

psgc.loc[psgc.interlevel.isnull(),"interlevel"] = u"city"

What we need is reference file that contains the higher-level administrative territories in separate columns. This is so we can create a single "master string" that we will use for matching. We'll try this instead of matching each component individually.

First, create a dictionary of the rankings of various administrative levels.

In [16]:
adm_rank= {u'reg':1,u'prov':2,u'dist':2,u'city':3,u'mun':3,u'municity':3,u'submun':3,u'bgy':4}
adm_rank_list = sorted(adm_rank, key=lambda k: adm_rank[k])
adm_rank_list
psgc['adm_rank'] = psgc.interlevel.map(adm_rank)

In [17]:
psgc.interlevel = psgc.interlevel.replace({"mun":"municity","city":"municity"})
psgc.interlevel.value_counts()

bgy         50154
municity     1811
prov           91
reg            66
submun         14
Name: interlevel, dtype: int64

We'll apply this later as a separate column.

Create a function that will add to our dataframe columns with the PSGC codes
of each location's higher level administrative territories. We'll then use this to fill the name columns 
with their corresponding place names. Recall the structure of a 9-digit PSG code and how the components correspond to different interlevels:

![PSGC structure](psgc.gif)



In [18]:
def fill_higher_level_codes(df):
        
    # Below is a dictionary of administrative hierarchy levels ranks and the stop string positions inside the PSG code.
    
    adm_rank = {1:2,
                2:4,
                3:6,
                4:9} 
    
    # Loop through each administrative level. 
    # Create additional columns for each administrative level with the suffixes _code and _name.
    # Fill each column with the names and codes of the higher level administrative territories in which
    # a place is located.
    
    for adm_level in adm_rank.keys():
        
        #create code cols        
        adm_code_col = u"adm"+str(adm_level)+u"_code"
        
        df[adm_code_col] = None
        
        #find the administrative levels that are higher than the current one
        
        higher_adm_levels = [l for l in adm_rank.keys() if l <= adm_level]
                
        for higher_level in higher_adm_levels:
            
            #higher adm level colum names
            
            higher_level_code_col = u"adm"+str(higher_level)+u"_code"
            
            #stop position of PSG code for this adm level
            
            stop_position = adm_rank[higher_level] 
            
            #derive higher level admin codes for each row
            
            codes = df.loc[df.adm_rank >= higher_level,"code"].str.slice(start=0,stop=stop_position).str.pad(9,side="right",fillchar="0").str.encode("utf-8")           
            df.loc[df.adm_rank >= higher_level,higher_level_code_col] = codes
            
            #derive higher level admin names for each row
            
            higher_level_name_col = u"adm"+str(higher_level)+u"_name"

    return df

In [19]:
psgc_unpivot = fill_higher_level_codes(psgc).dropna(how="all")
psgc_unpivot.head(10)

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,
1,10000000,ILOCOS REGION,reg,False,1,10000000,,,
2,10000000,REGION 1,reg,False,1,10000000,,,
3,10000000,REGION I,reg,False,1,10000000,,,
4,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000.0,,
5,12801000,ADAMS,municity,True,3,10000000,12800000.0,12801000.0,
6,12801001,ADAMS POB.,bgy,True,4,10000000,12800000.0,12801000.0,12801001.0
7,12801001,ADAMS,bgy,False,4,10000000,12800000.0,12801000.0,12801001.0
8,12802000,BACARRA,municity,True,3,10000000,12800000.0,12802000.0,
9,12802001,BANI,bgy,True,4,10000000,12800000.0,12802000.0,12802001.0


In [20]:
all_loc_names = psgc_unpivot[['code','location']].rename(columns={'code':'join_code'})
all_loc_names.head()

Unnamed: 0,join_code,location
0,10000000,REGION I ILOCOS REGION
1,10000000,ILOCOS REGION
2,10000000,REGION 1
3,10000000,REGION I
4,12800000,ILOCOS NORTE


Add Region names.

In [21]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location':'adm1_name'}),how="left",left_on="adm1_code",right_on="join_code").drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION


Add Prov names.

In [22]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location':'adm2_name'}),how="left",left_on="adm2_code",right_on="join_code").drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,


Add MuniCity names.

In [23]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location':'adm3_name'}),how="left",left_on="adm3_code",right_on="join_code").drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,


Add Barangay names.

In [24]:
psgc_unpivot = psgc_unpivot.merge(all_loc_names.rename(columns={'location':'adm4_name'}),how="left",left_on="adm4_code",right_on="join_code").drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name,adm4_name
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,,


In [25]:
psgc_unpivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374140 entries, 0 to 374139
Data columns (total 13 columns):
code          374140 non-null object
location      374140 non-null object
interlevel    374140 non-null object
original      374140 non-null bool
adm_rank      374140 non-null int64
adm1_code     374140 non-null object
adm2_code     373868 non-null object
adm3_code     373366 non-null object
adm4_code     363170 non-null object
adm1_name     374140 non-null object
adm2_name     367738 non-null object
adm3_name     373366 non-null object
adm4_name     363170 non-null object
dtypes: bool(1), int64(1), object(11)
memory usage: 37.5+ MB


In [26]:
#special handling for isabela! it's supposed to be in the province of basilan

psgc_unpivot.loc[psgc_unpivot.code.isin(["099700000","099701000"]),"adm2_name"] = "BASILAN"

#Also, isabela is the only place with two PSGC codes -- one for province level and one for city level! lets just use one.

psgc_unpivot = psgc_unpivot[psgc_unpivot.code <> "099700000"]

Create a "location tuple" that concatenates all the location components names into a single tuple. We'll use this for fuzzy matching later.

In [27]:
def normalize_text(item):
    replacements = {"barangay|brgy":"bgy",
                   "[^a-zA-Z0-9_\s]":"",
                   "poblacion":"pob",
                   "ñ":"n"}
    
    for k, v in replacements.items():
        item = re.sub(k,v,item.lower())
    return item

In [28]:
psgc_unpivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374136 entries, 0 to 374139
Data columns (total 13 columns):
code          374136 non-null object
location      374136 non-null object
interlevel    374136 non-null object
original      374136 non-null bool
adm_rank      374136 non-null int64
adm1_code     374136 non-null object
adm2_code     373864 non-null object
adm3_code     373362 non-null object
adm4_code     363170 non-null object
adm1_name     374136 non-null object
adm2_name     367734 non-null object
adm3_name     373362 non-null object
adm4_name     363170 non-null object
dtypes: bool(1), int64(1), object(11)
memory usage: 37.5+ MB


In [29]:
def create_loc_tuple_with_code(row):
    if row.interlevel == "reg": #For now, disregard making location tuples for all regions 
        return None 
    
    if row.code[:2] == "13": #For places inside NCR include the region (Metro Manila) in the loc tuple
        return tuple([normalize_text(v) for v in [row.adm4_name,row.adm3_name,row.adm2_name,row.adm1_name,row.interlevel,row.code] if (v is not None) and (v is not np.nan)])
    
    #else, exclude region from final tuple   
    return tuple([normalize_text(v) for v in [row.adm4_name,row.adm3_name,row.adm2_name,row.interlevel,row.code] if (v is not None) and (v is not np.nan)])

In [30]:
psgc_unpivot.loc[:,'loc_tuple'] = psgc_unpivot.apply(create_loc_tuple_with_code,axis=1)
psgc_unpivot.head(10)

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm2_name,adm3_name,adm4_name,loc_tuple
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,,,
5,10000000,ILOCOS REGION,reg,False,1,10000000,,,,ILOCOS REGION,,,,
6,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION 1,,,,
7,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I,,,,
8,10000000,REGION 1,reg,False,1,10000000,,,,REGION I ILOCOS REGION,,,,
9,10000000,REGION 1,reg,False,1,10000000,,,,ILOCOS REGION,,,,


In [31]:
psgc_unpivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374136 entries, 0 to 374139
Data columns (total 14 columns):
code          374136 non-null object
location      374136 non-null object
interlevel    374136 non-null object
original      374136 non-null bool
adm_rank      374136 non-null int64
adm1_code     374136 non-null object
adm2_code     373864 non-null object
adm3_code     373362 non-null object
adm4_code     363170 non-null object
adm1_name     374136 non-null object
adm2_name     367734 non-null object
adm3_name     373362 non-null object
adm4_name     363170 non-null object
loc_tuple     373864 non-null object
dtypes: bool(1), int64(1), object(12)
memory usage: 40.3+ MB


## Create canonical names

In [32]:
canonical_names = psgc_unpivot.drop_duplicates(subset=["code"],keep="first")[['code','adm4_name',"adm3_name","adm2_name","adm1_name"]]
canonical_names = canonical_names.rename(columns={'adm4_name':'bgy','adm3_name':'municity','adm2_name':'prov','adm1_name':'reg'}).set_index('code')
len(canonical_names)

43791

In [33]:
canonical_names

Unnamed: 0_level_0,bgy,municity,prov,reg
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
010000000,,,,REGION I ILOCOS REGION
012800000,,,ILOCOS NORTE,REGION I ILOCOS REGION
012801000,,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
012801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
012802000,,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION
012802001,BANI,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION
012802002,BUYON,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION
012802003,CABARUAN,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION
012802004,CABULALAAN,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION
012802005,CABUSLIGAN,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION


In [34]:
psgc_locations = psgc_unpivot[['loc_tuple','code']].dropna().drop_duplicates(keep="first")

psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72483 entries, 16 to 372015
Data columns (total 2 columns):
loc_tuple    72483 non-null object
code         72483 non-null object
dtypes: object(2)
memory usage: 1.7+ MB


In [35]:
psgc_locations = psgc_locations.merge(canonical_names,left_on="code",right_index=True,how="left")

psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72483 entries, 16 to 372015
Data columns (total 6 columns):
loc_tuple    72483 non-null object
code         72483 non-null object
bgy          70212 non-null object
municity     72392 non-null object
prov         67633 non-null object
reg          72483 non-null object
dtypes: object(6)
memory usage: 3.9+ MB


In [36]:
def to_string(row):
    return ",".join(row.loc_tuple)

In [37]:
psgc_locations['loc_tuple'] = psgc_locations.apply(to_string,axis=1)

In [38]:
psgc_locations['candidate_terms'] = psgc_locations['loc_tuple'].str.rsplit(',',n=1).str.get(0)

In [39]:
psgc_locations = psgc_locations.set_index('loc_tuple')
psgc_locations.head()

Unnamed: 0_level_0,code,bgy,municity,prov,reg,candidate_terms
loc_tuple,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"ilocos norte,prov,012800000",12800000,,,ILOCOS NORTE,REGION I ILOCOS REGION,"ilocos norte,prov"
"adams,ilocos norte,municity,012801000",12801000,,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams,ilocos norte,municity"
"adams pob,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams pob,adams,ilocos norte,bgy"
"adams,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams,adams,ilocos norte,bgy"
"bacarra,ilocos norte,municity,012802000",12802000,,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION,"bacarra,ilocos norte,municity"


In [40]:
#temporarily fill the province one with metro manila names for now. huhu hehe

psgc_locations.loc[psgc_locations.code.str.startswith("13"),"prov"] = psgc_locations[psgc_locations.code.str.startswith("13")].prov.fillna(psgc_locations[psgc_locations.code.str.startswith("13")].reg)

In [41]:
#psgc_locations.to_csv('psgc-locations.csv.gz',compression="gzip")