In [158]:
import pandas as pd
import operator
import re, string
import numpy as np

# Prepare Philippine Standard Geographic Code Reference File

In [159]:
# Load the cleaned PSGC reference table. `code` is read as a string (object dtype)
# rather than as a number, which keeps the leading zeros of the codes intact.
psgc = pd.read_csv("clean-psgc.csv.gz",dtype={'code':'object'},compression="gzip",encoding="utf-8")

In [160]:
# Convert every location name to a UTF-8 encoded byte string.
# NOTE(review): presumably a Python 2 convenience so names are plain `str`;
# under Python 3 this would produce `bytes` values -- confirm before porting.
psgc.location = psgc.location.str.encode("utf-8")

In [161]:
psgc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52214 entries, 0 to 52213
Data columns (total 4 columns):
code          52214 non-null object
location      52214 non-null object
interlevel    52210 non-null object
original      52214 non-null bool
dtypes: bool(1), object(3)
memory usage: 1.2+ MB


In [162]:
psgc.head()

Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I (ILOCOS REGION),Reg,True
1,10000000,ILOCOS REGION,Reg,False
2,10000000,REGION 1,Reg,False
3,10000000,REGION I,Reg,False
4,12800000,ILOCOS NORTE,Prov,True


In [163]:
# Remove rows whose entire name is just "CAPITAL" or "NOT A PROVINCE". These look like
# leftovers from an earlier PSGC cleanup that apparently turned parenthesised text into rows of its own.

placeholder_names = ["CAPITAL","NOT A PROVINCE"]
is_placeholder = psgc.location.isin(placeholder_names)
psgc = psgc[~is_placeholder].reset_index(drop=True)

In [164]:
# Lower-case the interlevel labels and encode them as UTF-8 byte strings.
psgc.interlevel = psgc.interlevel.str.lower().str.encode("utf-8")
psgc.interlevel = psgc.interlevel.replace({"mun":"municity","city":"municity"}) # treat municipalities and cities as one level ("municity") for now

# Lightly clean the location column (remove the "NOT A PROVINCE" / "CAPITAL" markers and any
# parentheses), but not more than that, because this value is used as the "canonical" name.
# NOTE(review): the pattern is written as a regular expression; confirm that str.replace
# interprets it as one in the pandas version this runs on.
psgc['location'] = psgc.location.str.replace(r"NOT A PROVINCE|CAPITAL|\(|\)","").str.strip()
# Rows that became identical after the cleanup are collapsed to their first occurrence.
psgc = psgc.drop_duplicates(subset=["code","location","interlevel"],keep="first")

In [165]:
psgc.interlevel.value_counts()

bgy         49971
municity     1942
prov           85
reg            61
dist           20
submun         14
Name: interlevel, dtype: int64

In [166]:
# Build the cleaned-up set of rows for the NCR region (code 130000000):
# "National Capital Region", "NCR", "Metro Manila" and "Metropolitan Manila".

# Work on an explicit copy so the edits below never write through to a slice of `psgc`.
ncr = psgc[psgc.code == "130000000"].copy()

# Removing "CAPITAL" from the location names earlier left a double space behind
# ("NATIONAL  REGION"); restore the proper name. One pass is enough.
ncr["location"] = ncr.location.str.replace("NATIONAL  REGION","NATIONAL CAPITAL REGION")

# Extra alias rows for the region, added in one concat instead of row-by-row appends.
# `columns=ncr.columns` keeps the column order identical to the existing frame.
ncr_aliases = pd.DataFrame(
    [{"code":"130000000","location":alias,"interlevel":"reg","original":False}
     for alias in ("METRO MANILA","METROPOLITAN MANILA")],
    columns=ncr.columns,
)
ncr = pd.concat([ncr, ncr_aliases],ignore_index=True)
ncr.head()

Unnamed: 0,code,location,interlevel,original
0,130000000,NATIONAL CAPITAL REGION NCR,reg,True
1,130000000,NCR,reg,False
2,130000000,NATIONAL CAPITAL REGION,reg,False
3,130000000,METRO MANILA,reg,False
4,130000000,METROPOLITAN MANILA,reg,False


In [167]:
# Remove the district rows and the original NCR region rows from the reference table,
# then add back the cleaned-up NCR rows.

# `!=` replaces the Python-2-only `<>` operator; rows with a missing interlevel are kept,
# exactly as before, because NaN != 'dist' evaluates to True.
keep_mask = (psgc.interlevel != 'dist') & (psgc.code != '130000000')
psgc = psgc[keep_mask].reset_index(drop=True)
print(len(psgc)) # row count before the cleaned NCR rows are added
psgc = pd.concat([psgc, ncr],ignore_index=True) #append cleaned up ncr region rows
print(len(psgc)) # row count after
psgc[psgc.code == "130000000"]

52073
52078


Unnamed: 0,code,location,interlevel,original
52073,130000000,NATIONAL CAPITAL REGION NCR,reg,True
52074,130000000,NCR,reg,False
52075,130000000,NATIONAL CAPITAL REGION,reg,False
52076,130000000,METRO MANILA,reg,False
52077,130000000,METROPOLITAN MANILA,reg,False


In [168]:
# Add abbreviations of north, south, east and west as aliases.
# For example, NORTHERN SAMAR gets the alias N SAMAR.

# Matches a leading compass word, with or without the "ERN" suffix, followed by a space.
# All four directions are treated the same way; the previous EAST branch was written
# `EAST(ERN?)?`, which also matched "EASTER ". Non-capturing groups are used because the
# groups are only there for grouping (str.contains warns about capturing groups).
nsew = re.compile(r"^(?:NORTH|SOUTH|EAST|WEST)(?:ERN)? ")

# First letter of the first word (N / S / E / W) for every matching location.
nsew_abbrev = psgc[psgc.location.str.contains(nsew)].location.str.split().str.get(0).str.slice(0,1)
nsew_abbrev.head()



7285    E
7286    E
7287    E
7288    E
7302    W
Name: location, dtype: object

In [169]:
# Rows whose name starts with a compass direction. An explicit copy is taken because this
# frame is modified afterwards and those edits must not be attempted on a slice of `psgc`.
nsew_locs = psgc[psgc.location.str.contains(nsew)].copy()
nsew_locs.head()

  if __name__ == '__main__':


Unnamed: 0,code,location,interlevel,original
7285,30812001,EAST CALAGUIMAN,bgy,False
7286,30812001,EAST CALAGUIMAN POB.,bgy,True
7287,30812002,EAST DAANG BAGO POB.,bgy,True
7288,30812002,EAST DAANG BAGO,bgy,False
7302,30812014,WEST CALAGUIMAN POB.,bgy,True


In [170]:
# Turn the compass-direction rows into alias rows: the leading direction word is replaced by
# its initial (e.g. "EAST CALAGUIMAN" -> "E CALAGUIMAN") and the rows are flagged as non-original.

# Read the full names from `psgc` (not from `nsew_locs`) so re-running this cell gives the same result.
nsew_full_names = psgc.loc[psgc.location.str.contains(nsew), "location"]

# Reuse the pattern text of the compiled `nsew` regex instead of repeating the literal,
# so the row selection and the stripping can never drift apart.
nsew_remainder = nsew_full_names.str.replace(nsew.pattern,"").str.strip()

# Assign on an explicit copy rather than through `.loc` on a slice.
nsew_locs = nsew_locs.copy()
nsew_locs["location"] = nsew_abbrev.str.cat(nsew_remainder,sep=" ")
nsew_locs["original"] = False
nsew_locs.head()

  if __name__ == '__main__':


Unnamed: 0,code,location,interlevel,original
7285,30812001,E CALAGUIMAN,bgy,False
7286,30812001,E CALAGUIMAN POB.,bgy,False
7287,30812002,E DAANG BAGO POB.,bgy,False
7288,30812002,E DAANG BAGO,bgy,False
7302,30812014,W CALAGUIMAN POB.,bgy,False


In [171]:
# Append the abbreviated-direction alias rows to the reference table, renumbering the index.
psgc = pd.concat([psgc, nsew_locs],ignore_index=True)
psgc.head()

Unnamed: 0,code,location,interlevel,original
0,10000000,REGION I ILOCOS REGION,reg,True
1,10000000,ILOCOS REGION,reg,False
2,10000000,REGION 1,reg,False
3,10000000,REGION I,reg,False
4,12800000,ILOCOS NORTE,prov,True


In [172]:
# Rows with a missing interlevel are labelled "municity".
# (Per the original note, these are the Isabela and Cotabato entries.)

psgc.loc[psgc.interlevel.isnull(),"interlevel"] = u"municity"

What we need is a reference file that contains the higher-level administrative territories in separate columns. This is so we can create a single "master string" that we will use for matching. We'll try this instead of matching each component individually.

First, create a dictionary of the rankings of various administrative levels.

In [173]:
# Rank of each administrative level label: 1 is the highest level (region), 4 the lowest (barangay).
adm_rank= {u'reg':1,u'prov':2,u'dist':2,u'city':3,u'mun':3,u'municity':3,u'submun':3,u'bgy':4}
# Level labels ordered by rank. Not referenced again in the visible part of the notebook.
adm_rank_list = sorted(adm_rank, key=lambda k: adm_rank[k])
adm_rank_list
# Numeric rank of every row, looked up from its interlevel label.
psgc['adm_rank'] = psgc.interlevel.map(adm_rank)

In [174]:
psgc.interlevel.value_counts()

bgy         50155
municity     1948
prov           91
reg            66
submun         14
Name: interlevel, dtype: int64

We'll apply this later as a separate column.

Create a function that will add to our dataframe columns with the PSGC codes
of each location's higher level administrative territories. We'll then use this to fill the name columns 
with their corresponding place names.


In [175]:
def fill_higher_level_codes(df):
    """Add ``adm1_code`` .. ``adm4_code`` columns with each row's ancestor PSGC codes.

    For every administrative level (1 = region ... 4 = barangay) the leading
    characters of ``code`` that identify that level are kept and right-padded
    with zeros to nine characters. A row only receives a value for levels at or
    above its own place in the hierarchy (``adm_rank >= level``); the remaining
    level columns hold ``None`` for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a string ``code`` column and a numeric ``adm_rank`` column.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four code columns added.
    """

    # Administrative level -> number of leading characters of the PSG code that identify it.
    # (Named differently from the module-level `adm_rank` dict, which maps labels to ranks.)
    code_prefix_length = {1:2,
                          2:4,
                          3:6,
                          4:9}

    # One pass per level is enough: each column depends only on `code` and `adm_rank`.
    # Sorting the levels makes the order of the new columns deterministic.
    for adm_level in sorted(code_prefix_length):

        adm_code_col = u"adm"+str(adm_level)+u"_code"
        at_or_below_level = df.adm_rank >= adm_level

        df[adm_code_col] = None

        # NOTE(review): the final encode keeps the original behaviour (byte strings);
        # under Python 3 these values are `bytes` -- confirm before porting.
        codes = (df.loc[at_or_below_level,"code"]
                 .str.slice(start=0,stop=code_prefix_length[adm_level])
                 .str.pad(9,side="right",fillchar="0")
                 .str.encode("utf-8"))
        df.loc[at_or_below_level,adm_code_col] = codes

    return df

In [176]:
# Add the adm1..adm4 code columns (note: this also adds them to `psgc` itself, because the
# function writes into the frame it is given), then drop rows that are empty in every column.
psgc_unpivot = fill_higher_level_codes(psgc).dropna(how="all")
psgc_unpivot.head(10)

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,
1,10000000,ILOCOS REGION,reg,False,1,10000000,,,
2,10000000,REGION 1,reg,False,1,10000000,,,
3,10000000,REGION I,reg,False,1,10000000,,,
4,12800000,ILOCOS NORTE,prov,True,2,10000000,12800000.0,,
5,12801000,ADAMS,municity,True,3,10000000,12800000.0,12801000.0,
6,12801001,ADAMS POB.,bgy,True,4,10000000,12800000.0,12801000.0,12801001.0
7,12801001,ADAMS,bgy,False,4,10000000,12800000.0,12801000.0,12801001.0
8,12802000,BACARRA,municity,True,3,10000000,12800000.0,12802000.0,
9,12802001,BANI,bgy,True,4,10000000,12800000.0,12802000.0,12802001.0


In [177]:
# Lookup table with every name (original or alias) recorded for each PSGC code.
# `code` is renamed to `join_code` so it can serve as the right-hand merge key
# against the adm*_code columns and be dropped again after each merge.
all_loc_names = psgc_unpivot[['code','location','original']].rename(columns={'code':'join_code'})
all_loc_names.head()

Unnamed: 0,join_code,location,original
0,10000000,REGION I ILOCOS REGION,True
1,10000000,ILOCOS REGION,False
2,10000000,REGION 1,False
3,10000000,REGION I,False
4,12800000,ILOCOS NORTE,True


Add Region names.

In [178]:
# Attach the region (adm1) names by matching adm1_code against the name lookup table.
# A code that has several names yields one output row per name.
adm1_lookup = all_loc_names.rename(columns={'location':'adm1_name','original':'adm1_is_orig'})
psgc_unpivot = psgc_unpivot.merge(adm1_lookup,how="left",left_on="adm1_code",right_on="join_code")
psgc_unpivot = psgc_unpivot.drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,True
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,False
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,False
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,False
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,True


Add Prov names.

In [179]:
# Attach the province (adm2) names by matching adm2_code against the name lookup table.
# Rows without an adm2_code simply get missing values in the new columns (left join).
adm2_lookup = all_loc_names.rename(columns={'location':'adm2_name','original':'adm2_is_orig'})
psgc_unpivot = psgc_unpivot.merge(adm2_lookup,how="left",left_on="adm2_code",right_on="join_code")
psgc_unpivot = psgc_unpivot.drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,True,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,False,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,False,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,False,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,True,,


Add MuniCity names.

In [180]:
# Attach the city/municipality (adm3) names by matching adm3_code against the name lookup table.
# Rows without an adm3_code simply get missing values in the new columns (left join).
adm3_lookup = all_loc_names.rename(columns={'location':'adm3_name','original':'adm3_is_orig'})
psgc_unpivot = psgc_unpivot.merge(adm3_lookup,how="left",left_on="adm3_code",right_on="join_code")
psgc_unpivot = psgc_unpivot.drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig,adm3_name,adm3_is_orig
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,True,,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,False,,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,False,,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,False,,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,True,,,,


Add Barangay names.

In [181]:
# Attach the barangay (adm4) names by matching adm4_code against the name lookup table.
# Rows without an adm4_code simply get missing values in the new columns (left join).
adm4_lookup = all_loc_names.rename(columns={'location':'adm4_name','original':'adm4_is_orig'})
psgc_unpivot = psgc_unpivot.merge(adm4_lookup,how="left",left_on="adm4_code",right_on="join_code")
psgc_unpivot = psgc_unpivot.drop('join_code',axis=1)
psgc_unpivot.head()

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig,adm3_name,adm3_is_orig,adm4_name,adm4_is_orig
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,True,,,,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,False,,,,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,False,,,,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,False,,,,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,True,,,,,,


In [182]:
#Special handling for Isabela City: it is supposed to be in the province of Basilan.
# NOTE(review): only the two codes listed here get the override; rows for places below
# Isabela City (other codes starting with 0997) keep whatever adm2_name the merge gave
# them -- confirm that this is intended.

psgc_unpivot.loc[psgc_unpivot.code.isin(["099700000","099701000"]),"adm2_name"] = "BASILAN"

#Also, Isabela is the only place with two PSGC codes -- one at province level and one at city level.
#Keep only the city-level code. (`!=` replaces the Python-2-only `<>` operator.)

psgc_unpivot = psgc_unpivot[psgc_unpivot.code != "099700000"]

Create a "location tuple" that concatenates all the location component names into a single tuple. We'll use this for fuzzy matching later.

In [183]:
#Append every Metro Manila row (code starting with "13") a second time, with a blank region name.
#This lets "Fort Bonifacio, Taguig" count as an exact match even without "Metro Manila" in it.

# Explicit copy, so the assignment below is never attempted on a slice of psgc_unpivot.
metro_manila = psgc_unpivot[psgc_unpivot.code.str.startswith("13")].copy()
# Deliberately kept as a `.loc[:, ...]` assignment of np.nan: the later tuple-building step
# recognises missing values by comparing against np.nan itself.
metro_manila.loc[:,"adm1_name"] = np.nan
print(len(psgc_unpivot)) # row count before
# ignore_index=True already renumbers the rows, so no extra reset_index is needed.
psgc_unpivot = pd.concat([psgc_unpivot, metro_manila],ignore_index=True)
print(len(psgc_unpivot)) # row count after

451465
472125


In [184]:
def normalize_text(item):
    """Normalise one component of a location string for matching.

    The text is lower-cased and stripped. Unless it is exactly one of the level
    labels ``bgy``, ``municity``, ``prov`` or ``reg``, these substitutions are
    then applied in order:

    1. ``city of`` / ``city`` are removed,
    2. ``barangay`` / ``brgy`` become ``bgy``,
    3. ``ñ`` becomes ``n``,
    4. every character that is not an ASCII letter, digit, underscore or
       whitespace is removed,
    5. ``poblacion`` becomes ``pob``.

    NOTE(review): steps 1, 2 and 5 match anywhere in the text, including inside
    longer words; this is unchanged from the previous behaviour.

    Parameters
    ----------
    item : str
        The text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # An ordered list instead of a dict: the order matters. In particular "ñ" has to be
    # folded to "n" BEFORE the non-alphanumeric strip, otherwise the strip deletes it
    # ("parañaque" would become "paraaque" instead of "paranaque").
    replacements = [(r"city of|city", ""),
                    (r"barangay|brgy", "bgy"),
                    # The upper-case form is listed too, in case lower() did not fold it
                    # (e.g. byte strings under Python 2).
                    (r"ñ|Ñ", "n"),
                    (r"[^a-zA-Z0-9_\s]", ""),
                    (r"poblacion", "pob")]

    item = item.lower()

    if item not in ['bgy','municity','prov','reg']:

        for pattern, substitute in replacements:
            item = re.sub(pattern, substitute, item.strip())

    return item.strip()

In [185]:
def create_loc_tuple_with_code(row):
    """Build the normalised location tuple for one row of the unpivoted PSGC table.

    The tuple lists the available place names from the lowest level upwards,
    followed by the row's interlevel label and its PSGC code. Each item is
    passed through ``normalize_text``; missing items are skipped.

    Which name columns are used depends on the row:

    * interlevel ``reg``: no tuple is built and ``None`` is returned;
    * code starting with ``133`` and not a ``municity`` row: adm4, adm3, adm2 and adm1 names;
    * code starting with ``133`` (``municity`` row) or with ``137``: adm4, adm3 and adm1 names;
    * every other row: adm4, adm3 and adm2 names (region left out).

    Parameters
    ----------
    row : pandas.Series
        A row with ``interlevel``, ``code`` and ``adm1_name`` .. ``adm4_name``.

    Returns
    -------
    tuple of str, or None
    """

    #For now, disregard making location tuples for all regions
    if row.interlevel == "reg":
        return None

    code_prefix = row.code[:3]

    #Special handling for NCR:
    if code_prefix == "133" and row.interlevel != "municity":
        #Manila has submunicipalities so include all admin levels
        names = [row.adm4_name,row.adm3_name,row.adm2_name,row.adm1_name]
    elif code_prefix == "133" or code_prefix == "137":
        #Manila City itself (the only "133" rows that reach this branch are municity rows)
        #or anywhere else in NCR: exclude adm2_name
        names = [row.adm4_name,row.adm3_name,row.adm1_name]
    else:
        #everywhere else, exclude region from final tuple
        names = [row.adm4_name,row.adm3_name,row.adm2_name]

    items = names + [row.interlevel,row.code]

    # pd.isnull treats None and any NaN as missing. The earlier identity test against
    # np.nan only caught that one object, so another NaN float would have reached
    # normalize_text and failed there.
    return tuple([normalize_text(v) for v in items if not pd.isnull(v)])

In [186]:
# Build the location tuple for every row (row-wise apply); region rows get None.
psgc_unpivot.loc[:,'loc_tuple'] = psgc_unpivot.apply(create_loc_tuple_with_code,axis=1)
psgc_unpivot.head(10)

Unnamed: 0,code,location,interlevel,original,adm_rank,adm1_code,adm2_code,adm3_code,adm4_code,adm1_name,adm1_is_orig,adm2_name,adm2_is_orig,adm3_name,adm3_is_orig,adm4_name,adm4_is_orig,loc_tuple
0,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I ILOCOS REGION,True,,,,,,,
1,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,ILOCOS REGION,False,,,,,,,
2,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION 1,False,,,,,,,
3,10000000,REGION I ILOCOS REGION,reg,True,1,10000000,,,,REGION I,False,,,,,,,
4,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I ILOCOS REGION,True,,,,,,,
5,10000000,ILOCOS REGION,reg,False,1,10000000,,,,ILOCOS REGION,False,,,,,,,
6,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION 1,False,,,,,,,
7,10000000,ILOCOS REGION,reg,False,1,10000000,,,,REGION I,False,,,,,,,
8,10000000,REGION 1,reg,False,1,10000000,,,,REGION I ILOCOS REGION,True,,,,,,,
9,10000000,REGION 1,reg,False,1,10000000,,,,ILOCOS REGION,False,,,,,,,


In [187]:
# Remove rows that are identical in every column, keeping the first occurrence of each.
psgc_unpivot = psgc_unpivot.drop_duplicates(keep="first")
len(psgc_unpivot)

459729

## Create canonical names

In [188]:
# Flag the rows on which every ancestor name is an original PSGC name.
# A level without a name (missing flag) counts as original.

origin_flag_cols = ['adm4_is_orig','adm3_is_orig','adm2_is_orig','adm1_is_orig']
all_original = psgc_unpivot[origin_flag_cols[0]].fillna(True)
for flag_col in origin_flag_cols[1:]:
    all_original = all_original & psgc_unpivot[flag_col].fillna(True)
psgc_unpivot['is_canonical'] = all_original

# One row of canonical names per PSGC code.

name_cols = ['code','adm4_name','adm3_name','adm2_name','adm1_name']
canonical_names = psgc_unpivot.loc[psgc_unpivot.is_canonical == True, name_cols].drop_duplicates(keep="first")#.set_index('code')

# Metro Manila rows whose region is blank must not be used as canonical names, so drop them.

is_metro_manila = canonical_names.code.str.startswith("13")
region_missing = canonical_names.adm1_name.isnull()
null_ncr_region = canonical_names[is_metro_manila & region_missing].index

level_labels = {"adm4_name":"bgy","adm3_name":"municity","adm2_name":"prov","adm1_name":"reg"}
canonical_names = (canonical_names
                   .drop(null_ncr_region,axis=0)
                   .set_index('code')
                   .rename(columns=level_labels))

In [189]:
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print only added a stray "None" line to the output.
canonical_names.info()

canonical_names.head()

<class 'pandas.core.frame.DataFrame'>
Index: 43354 entries, 010000000 to 130000000
Data columns (total 4 columns):
bgy         41624 non-null object
municity    43256 non-null object
prov        42508 non-null object
reg         43354 non-null object
dtypes: object(4)
memory usage: 1.7+ MB
None


Unnamed: 0_level_0,bgy,municity,prov,reg
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,,,,REGION I ILOCOS REGION
12800000,,,ILOCOS NORTE,REGION I ILOCOS REGION
12801000,,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
12802000,,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION


In [190]:
# Unique (loc_tuple, code) pairs. Rows without a loc_tuple are removed by dropna().
psgc_locations = psgc_unpivot[['loc_tuple','code']].dropna().drop_duplicates(keep="first")

psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74366 entries, 16 to 471661
Data columns (total 2 columns):
loc_tuple    74366 non-null object
code         74366 non-null object
dtypes: object(2)
memory usage: 1.7+ MB


In [191]:
# Attach the canonical bgy / municity / prov / reg names to every location tuple by
# looking up its code in canonical_names (which is indexed by code).
psgc_locations = psgc_locations.merge(canonical_names,left_on="code",right_index=True,how="left")

psgc_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74366 entries, 16 to 471661
Data columns (total 6 columns):
loc_tuple    74366 non-null object
code         74366 non-null object
bgy          71538 non-null object
municity     73734 non-null object
prov         68005 non-null object
reg          73825 non-null object
dtypes: object(6)
memory usage: 4.0+ MB


In [192]:
def to_string(row):
    """Return the items of ``row.loc_tuple`` joined into one comma-separated string."""
    parts = row.loc_tuple
    return ",".join(parts)

In [193]:
# Replace each location tuple with its comma-joined string form.
psgc_locations['loc_tuple'] = psgc_locations.apply(to_string,axis=1)

In [194]:
psgc_locations.head()

Unnamed: 0,loc_tuple,code,bgy,municity,prov,reg
16,"ilocos norte,prov,012800000",12800000,,,ILOCOS NORTE,REGION I ILOCOS REGION
20,"adams,ilocos norte,municity,012801000",12801000,,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
24,"adams pob,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
25,"adams,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION
40,"bacarra,ilocos norte,municity,012802000",12802000,,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION


In [195]:
# candidate_terms is the location string without its last comma-separated field (the PSGC code).
psgc_locations['candidate_terms'] = psgc_locations['loc_tuple'].str.rsplit(',',n=1).str.get(0)

In [196]:
# Use the full location string as the index of the lookup table.
psgc_locations = psgc_locations.set_index('loc_tuple')
psgc_locations.head()

Unnamed: 0_level_0,code,bgy,municity,prov,reg,candidate_terms
loc_tuple,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"ilocos norte,prov,012800000",12800000,,,ILOCOS NORTE,REGION I ILOCOS REGION,"ilocos norte,prov"
"adams,ilocos norte,municity,012801000",12801000,,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams,ilocos norte,municity"
"adams pob,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams pob,adams,ilocos norte,bgy"
"adams,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams,adams,ilocos norte,bgy"
"bacarra,ilocos norte,municity,012802000",12802000,,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION,"bacarra,ilocos norte,municity"


In [197]:
#for metro manila rows, temporarily fill with NCR for now

#psgc_locations.loc[psgc_locations.code.str.startswith("13"),"prov"] = "NATIONAL CAPITAL REGION NCR"
psgc_locations.head()

Unnamed: 0_level_0,code,bgy,municity,prov,reg,candidate_terms
loc_tuple,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"ilocos norte,prov,012800000",12800000,,,ILOCOS NORTE,REGION I ILOCOS REGION,"ilocos norte,prov"
"adams,ilocos norte,municity,012801000",12801000,,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams,ilocos norte,municity"
"adams pob,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams pob,adams,ilocos norte,bgy"
"adams,adams,ilocos norte,bgy,012801001",12801001,ADAMS POB.,ADAMS,ILOCOS NORTE,REGION I ILOCOS REGION,"adams,adams,ilocos norte,bgy"
"bacarra,ilocos norte,municity,012802000",12802000,,BACARRA,ILOCOS NORTE,REGION I ILOCOS REGION,"bacarra,ilocos norte,municity"


In [198]:
# Drop the canonical region name column (`reg`).

psgc_locations = psgc_locations.drop('reg',axis=1)

In [200]:
# Write the finished lookup table to a gzip-compressed CSV file.
psgc_locations.to_csv('psgc-locations.csv.gz',compression="gzip")