# Cleaning the PSGC Dataset

In [1]:
import pandas as pd
import re

In [2]:
psgc = pd.read_csv('psgc.csv.gz', dtype={'Code': str})

In [3]:
psgc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43796 entries, 0 to 43795
Data columns (total 8 columns):
Code                                 43796 non-null object
Name                                 43796 non-null object
Inter-Level                          43794 non-null object
City Class                           145 non-null object
Income
Classification                1715 non-null object
Urban / Rural (based on 2010 CPH)    42046 non-null object
POPULATION
(2015 POPCEN)             43794 non-null object
Unnamed: 7                           7 non-null object
dtypes: object(8)
memory usage: 2.7+ MB


Drop unused columns:

In [4]:
psgc = psgc.drop(columns=['City Class', 'Income\nClassification', 'Urban / Rural (based on 2010 CPH)', 'POPULATION\n(2015 POPCEN)', 'Unnamed: 7'])

In [5]:
psgc['code'] = psgc.pop('Code')
psgc['location'] = psgc.pop('Name').str.strip()
psgc['interlevel'] = psgc.pop('Inter-Level').str.strip()

In [6]:
psgc['interlevel'].value_counts()

Bgy       42044
Mun        1489
City        145
Prov         81
Reg          17
SubMun       14
Dist          4
Name: interlevel, dtype: int64

In [7]:
psgc

Unnamed: 0,code,location,interlevel
0,010000000,REGION I (ILOCOS REGION),Reg
1,012800000,ILOCOS NORTE,Prov
2,012801000,ADAMS,Mun
3,012801001,Adams (Pob.),Bgy
4,012802000,BACARRA,Mun
5,012802001,Bani,Bgy
6,012802002,Buyon,Bgy
7,012802003,Cabaruan,Bgy
8,012802004,Cabulalaan,Bgy
9,012802005,Cabusligan,Bgy


## Capitalize the location field

In [8]:
psgc['location'] = psgc['location'].str.upper()

## Create a duplicate of the original PSGC dataframe

In [9]:
og_psgc = psgc.copy()

## Helpers

In [10]:
pat_expand_location = re.compile('(.+)\(+([^\(\)]+)\)*')  # ¯\_(ツ)_/¯ 
def expand_location(row):
    '''
    Extracts location names inside parens and expands it
    to the new column `alt_location`
    '''
    m = pat_expand_location.search(row['location'])
    if m:
        location, alt_location = m.groups()
        return {
            **row,
            'location': location.strip(),
            'alt_location': alt_location.strip(),
        }
    return row

## Clean regions

In [11]:
regions = psgc[psgc['interlevel'] == 'Reg'].copy()

In [12]:
regions

Unnamed: 0,code,location,interlevel
0,10000000,REGION I (ILOCOS REGION),Reg
3397,20000000,REGION II (CAGAYAN VALLEY),Reg
5807,30000000,REGION III (CENTRAL LUZON),Reg
9047,40000000,REGION IV-A (CALABARZON),Reg
13213,170000000,MIMAROPA REGION,Reg
14752,50000000,REGION V (BICOL REGION),Reg
18344,60000000,REGION VI (WESTERN VISAYAS),Reg
22535,70000000,REGION VII (CENTRAL VISAYAS),Reg
25675,80000000,REGION VIII (EASTERN VISAYAS),Reg
30215,90000000,REGION IX (ZAMBOANGA PENINSULA),Reg


Alternate names inside parens so we expand those out to a new column named `alt_location`.

In [13]:
regions = regions.apply(expand_location, axis=1, result_type='expand').fillna('')

In [14]:
regions

Unnamed: 0,alt_location,code,interlevel,location
0,ILOCOS REGION,10000000,Reg,REGION I
3397,CAGAYAN VALLEY,20000000,Reg,REGION II
5807,CENTRAL LUZON,30000000,Reg,REGION III
9047,CALABARZON,40000000,Reg,REGION IV-A
13213,,170000000,Reg,MIMAROPA REGION
14752,BICOL REGION,50000000,Reg,REGION V
18344,WESTERN VISAYAS,60000000,Reg,REGION VI
22535,CENTRAL VISAYAS,70000000,Reg,REGION VII
25675,EASTERN VISAYAS,80000000,Reg,REGION VIII
30215,ZAMBOANGA PENINSULA,90000000,Reg,REGION IX


## Clean provinces

In [15]:
provinces = psgc[psgc['interlevel'] == 'Prov'].copy()

In [16]:
provinces

Unnamed: 0,code,location,interlevel
1,012800000,ILOCOS NORTE,Prov
584,012900000,ILOCOS SUR,Prov
1387,013300000,LA UNION,Prov
1984,015500000,PANGASINAN,Prov
3398,020900000,BATANES,Prov
3434,021500000,CAGAYAN,Prov
4284,023100000,ISABELA,Prov
5377,025000000,NUEVA VIZCAYA,Prov
5668,025700000,QUIRINO,Prov
5808,030800000,BATAAN,Prov


Seems normal... But let's check for parens just in case:

In [17]:
provinces[provinces['location'].str.contains('[\(\)]')]

Unnamed: 0,code,location,interlevel
28576,86000000,SAMAR (WESTERN SAMAR),Prov
35535,124700000,COTABATO (NORTH COTABATO),Prov


Sneaky alternate names!

In [18]:
provinces = provinces.apply(expand_location, axis=1, result_type='expand').fillna('')

In [19]:
provinces

Unnamed: 0,alt_location,code,interlevel,location
1,,012800000,Prov,ILOCOS NORTE
584,,012900000,Prov,ILOCOS SUR
1387,,013300000,Prov,LA UNION
1984,,015500000,Prov,PANGASINAN
3398,,020900000,Prov,BATANES
3434,,021500000,Prov,CAGAYAN
4284,,023100000,Prov,ISABELA
5377,,025000000,Prov,NUEVA VIZCAYA
5668,,025700000,Prov,QUIRINO
5808,,030800000,Prov,BATAAN


## Clean districts

In [20]:
districts = psgc[psgc['interlevel'] == 'Dist'].copy()

In [21]:
districts

Unnamed: 0,code,location,interlevel
36786,133900000,"NCR, CITY OF MANILA, FIRST DISTRICT (NOT A PRO...",Dist
37699,137400000,"NCR, SECOND DISTRICT (NOT A PROVINCE)",Dist
37941,137500000,"NCR, THIRD DISTRICT (NOT A PROVINCE)",Dist
38206,137600000,"NCR, FOURTH DISTRICT (NOT A PROVINCE)",Dist


No one writes `NTH DISTRICT (Not a Province)` in their addresses...

In [22]:
districts['location'] = (districts['location']
                         .str.replace(',.+DISTRICT \(Not a Province\)', '')
                         .str.strip())

In [23]:
districts

Unnamed: 0,code,location,interlevel
36786,133900000,"NCR, CITY OF MANILA, FIRST DISTRICT (NOT A PRO...",Dist
37699,137400000,"NCR, SECOND DISTRICT (NOT A PROVINCE)",Dist
37941,137500000,"NCR, THIRD DISTRICT (NOT A PROVINCE)",Dist
38206,137600000,"NCR, FOURTH DISTRICT (NOT A PROVINCE)",Dist


## Clean municipalities

In [24]:
municipalities = psgc[psgc['interlevel'] == 'Mun'].copy()

In [25]:
municipalities

Unnamed: 0,code,location,interlevel
2,012801000,ADAMS,Mun
4,012802000,BACARRA,Mun
48,012803000,BADOC,Mun
80,012804000,BANGUI,Mun
139,012806000,BURGOS,Mun
151,012807000,CARASI,Mun
155,012808000,CURRIMAO,Mun
179,012809000,DINGRAS,Mun
211,012810000,DUMALNEG,Mun
216,012811000,BANNA (ESPIRITU),Mun


A few alternate names but what are those `(Capital)` ones?

In [26]:
municipalities[municipalities['location'].str.contains('[\(\)]')]

Unnamed: 0,code,location,interlevel
216,012811000,BANNA (ESPIRITU),Mun
806,012910000,GREGORIO DEL PILAR (CONCEPCION),Mun
905,012915000,QUIRINO (ANGKAKI),Mun
915,012916000,SALCEDO (BAUGEN),Mun
973,012920000,SAN JUAN (LAPOG),Mun
2514,015522000,LINGAYEN (CAPITAL),Mun
3399,020901000,BASCO (CAPITAL),Mun
4130,021526000,SANTO NIÑO (FAIRE),Mun
4870,023118000,DELFIN ALBANO (MAGSAYSAY),Mun
5454,025005000,BAYOMBONG (CAPITAL),Mun


Apparently, they are capitals of their provinces. Safe to strip!

In [27]:
municipalities['location'] = municipalities['location'].str.replace('\(Capital\)', '').str.strip()

In [28]:
municipalities = municipalities.apply(expand_location, axis=1, result_type='expand').fillna('')

In [29]:
municipalities

Unnamed: 0,alt_location,code,interlevel,location
2,,012801000,Mun,ADAMS
4,,012802000,Mun,BACARRA
48,,012803000,Mun,BADOC
80,,012804000,Mun,BANGUI
139,,012806000,Mun,BURGOS
151,,012807000,Mun,CARASI
155,,012808000,Mun,CURRIMAO
179,,012809000,Mun,DINGRAS
211,,012810000,Mun,DUMALNEG
216,ESPIRITU,012811000,Mun,BANNA


## Clean cities

In [30]:
cities = psgc[psgc['interlevel'] == 'City'].copy()

In [31]:
cities

Unnamed: 0,code,location,interlevel
95,012805000,CITY OF BATAC,City
237,012812000,CITY OF LAOAG (CAPITAL),City
706,012906000,CITY OF CANDON,City
1347,012934000,CITY OF VIGAN (CAPITAL),City
1792,013314000,CITY OF SAN FERNANDO (CAPITAL),City
2020,015503000,CITY OF ALAMINOS,City
2438,015518000,CITY OF DAGUPAN,City
2887,015532000,CITY OF SAN CARLOS,City
3317,015546000,CITY OF URDANETA,City
4234,021529000,TUGUEGARAO CITY (CAPITAL),City


Here we go with the `(Capital)` thing again.

In [32]:
cities['location'] =  cities['location'].str.replace('\(Capital\)', '').str.strip()

Checking if there are still stuff with parens:

In [33]:
cities[cities['location'].str.contains('[\(\)]')].head()

Unnamed: 0,code,location,interlevel
237,12812000,CITY OF LAOAG (CAPITAL),City
1347,12934000,CITY OF VIGAN (CAPITAL),City
1792,13314000,CITY OF SAN FERNANDO (CAPITAL),City
4234,21529000,TUGUEGARAO CITY (CAPITAL),City
4704,23114000,CITY OF ILAGAN (CAPITAL),City


A few alterate names!

In [34]:
cities = cities.apply(expand_location, axis=1, result_type='expand').fillna('')

Now what about those `CITY` pre/suffixes?

In [35]:
cities[cities['location'].str.contains('CITY')]

Unnamed: 0,alt_location,code,interlevel,location
95,,012805000,City,CITY OF BATAC
237,CAPITAL,012812000,City,CITY OF LAOAG
706,,012906000,City,CITY OF CANDON
1347,CAPITAL,012934000,City,CITY OF VIGAN
1792,CAPITAL,013314000,City,CITY OF SAN FERNANDO
2020,,015503000,City,CITY OF ALAMINOS
2438,,015518000,City,CITY OF DAGUPAN
2887,,015532000,City,CITY OF SAN CARLOS
3317,,015546000,City,CITY OF URDANETA
4234,CAPITAL,021529000,City,TUGUEGARAO CITY


In [36]:
cities['location'] = (cities['location']
 .str.replace('^.*CITY OF', '')
 .str.strip()
 .str.replace('CITY$', '')
 .str.strip())

In [37]:
cities

Unnamed: 0,alt_location,code,interlevel,location
95,,012805000,City,BATAC
237,CAPITAL,012812000,City,LAOAG
706,,012906000,City,CANDON
1347,CAPITAL,012934000,City,VIGAN
1792,CAPITAL,013314000,City,SAN FERNANDO
2020,,015503000,City,ALAMINOS
2438,,015518000,City,DAGUPAN
2887,,015532000,City,SAN CARLOS
3317,,015546000,City,URDANETA
4234,CAPITAL,021529000,City,TUGUEGARAO


## Clean sub-municipalities

In [38]:
sub_municipalities = psgc[psgc['interlevel'] == 'SubMun'].copy()

In [39]:
sub_municipalities

Unnamed: 0,code,location,interlevel
36788,133901000,TONDO I/II,SubMun
37048,133902000,BINONDO,SubMun
37059,133903000,QUIAPO,SubMun
37076,133904000,SAN NICOLAS,SubMun
37092,133905000,SANTA CRUZ,SubMun
37175,133906000,SAMPALOC,SubMun
37419,133907000,SAN MIGUEL,SubMun
37432,133908000,ERMITA,SubMun
37446,133909000,INTRAMUROS,SubMun
37452,133910000,MALATE,SubMun


Nothing special!

## Clean barangays

In [40]:
barangays = psgc[psgc['interlevel'] == 'Bgy'].copy()

In [41]:
barangays

Unnamed: 0,code,location,interlevel
3,012801001,ADAMS (POB.),Bgy
5,012802001,BANI,Bgy
6,012802002,BUYON,Bgy
7,012802003,CABARUAN,Bgy
8,012802004,CABULALAAN,Bgy
9,012802005,CABUSLIGAN,Bgy
10,012802006,CADARATAN,Bgy
11,012802007,CALIOET-LIBONG,Bgy
12,012802008,CASILIAN,Bgy
13,012802009,COROCOR,Bgy


We see alternate names again but notice the `(Pob.)` suffixes. A quick Google search shows that it's short for `Poblacion` which is used to denote the commercial and industrial center of a city.

In [42]:
barangays['location'] = (barangays['location']
                         .str.replace('\(?POB\.\)?', '')
                         .str.strip())

Let's check for more weird characters:

In [43]:
barangays[barangays['location'].str.contains('[^A-ZÑ0-9\-.\/\(\) ]')]

Unnamed: 0,code,location,interlevel
238,012812001,"BGY. NO. 42, APAYA",Bgy
239,012812002,"BGY. NO. 36, ARANIW",Bgy
240,012812003,"BGY. NO. 56-A, BACSIL NORTH",Bgy
241,012812004,"BGY. NO. 56-B, BACSIL SOUTH",Bgy
242,012812005,"BGY. NO. 41, BALACAD",Bgy
243,012812006,"BGY. NO. 40, BALATONG",Bgy
244,012812007,"BGY. NO. 55-A, BARIT-PANDAN",Bgy
245,012812008,"BGY. NO. 47, BENGCAG",Bgy
246,012812009,"BGY. NO. 50, BUTTONG",Bgy
247,012812010,"BGY. NO. 60-A, CAAOACAN",Bgy


Aside for alternate names, there are those starting with `BGY. NO. X,`:

In [44]:
barangays[barangays['location'].str.contains('^B[GR]Y. NO.')]

Unnamed: 0,code,location,interlevel
238,012812001,"BGY. NO. 42, APAYA",Bgy
239,012812002,"BGY. NO. 36, ARANIW",Bgy
240,012812003,"BGY. NO. 56-A, BACSIL NORTH",Bgy
241,012812004,"BGY. NO. 56-B, BACSIL SOUTH",Bgy
242,012812005,"BGY. NO. 41, BALACAD",Bgy
243,012812006,"BGY. NO. 40, BALATONG",Bgy
244,012812007,"BGY. NO. 55-A, BARIT-PANDAN",Bgy
245,012812008,"BGY. NO. 47, BENGCAG",Bgy
246,012812009,"BGY. NO. 50, BUTTONG",Bgy
247,012812010,"BGY. NO. 60-A, CAAOACAN",Bgy


Let's set the `BGY. NO. X` bit as `alt_location`:

In [45]:
pat_expand_bgy_location = re.compile('(B[GR]Y. NO. \d+\-?\w?),? (.+)')
def expand_bgy_location(row):
    m = pat_expand_bgy_location.search(row['location'])
    if m:
        alt_location, location = m.groups()
        return {
            **row,
            'location': location.strip(),
            'alt_location': alt_location.strip(),
        }
    return expand_location(row)
barangays = barangays.apply(expand_bgy_location, axis=1, result_type='expand').fillna('')

Check if we got all (there should be 80):

In [46]:
barangays[barangays['alt_location'].str.contains('B[GR]Y. NO.')]

Unnamed: 0,alt_location,code,interlevel,location
238,BGY. NO. 42,012812001,Bgy,APAYA
239,BGY. NO. 36,012812002,Bgy,ARANIW
240,BGY. NO. 56-A,012812003,Bgy,BACSIL NORTH
241,BGY. NO. 56-B,012812004,Bgy,BACSIL SOUTH
242,BGY. NO. 41,012812005,Bgy,BALACAD
243,BGY. NO. 40,012812006,Bgy,BALATONG
244,BGY. NO. 55-A,012812007,Bgy,BARIT-PANDAN
245,BGY. NO. 47,012812008,Bgy,BENGCAG
246,BGY. NO. 50,012812009,Bgy,BUTTONG
247,BGY. NO. 60-A,012812010,Bgy,CAAOACAN


Another check for weird stuff:

In [47]:
barangays[barangays['location'].str.contains('[^A-ZÑ0-9 \-.,\/]')]

Unnamed: 0,alt_location,code,interlevel,location
3649,VALLEY COVE*,21506049,Bgy,C. VERZOSA (
4723,VILLA MARCOS*,23114025,Bgy,CABESERIA 6 & 24
5934,,30806003,Bgy,KITANG 2 & LUZ
6177,,31407014,Bgy,IBA O'ESTE
6859,,34906018,Bgy,DISTRICT I I)
6860,,34906019,Bgy,DISTRICT II II)
6861,,34906021,Bgy,DISTRICT IV IV)
6862,,34906022,Bgy,DISTRICT V V)
6863,,34906023,Bgy,DISTRICT VI VI)
6864,,34906024,Bgy,DISTRICT VII VII)


Trim weird stuff:

In [48]:
barangays['location'] = barangays['location'].str.strip('[\*\(\)]')
barangays['alt_location'] = barangays['alt_location'].str.strip('[\*\(\)]')

Last check!

In [49]:
barangays[barangays['location'].str.contains('[^A-ZÑ0-9 \-.,\/]')]

Unnamed: 0,alt_location,code,interlevel,location
4723,VILLA MARCOS,23114025,Bgy,CABESERIA 6 & 24
5934,,30806003,Bgy,KITANG 2 & LUZ
6177,,31407014,Bgy,IBA O'ESTE
7030,,34911066,Bgy,TAMPAC II & III
8204,,36904013,Bgy,O'DONNELL
10758,,42115008,Bgy,PALANGUE 2 & 3
14991,,50506005,Bgy,BGY. 1 - EM'S BARRIO
15000,,50506016,Bgy,BGY. 2 - EM'S BARRIO SOUTH
15011,,50506027,Bgy,BGY. 3 - EM'S BARRIO EAST
31864,RUIZ,98305038,Bgy,VETERAN'S VILLAGE


## ARMM: Cotabato and Isabela City

In [50]:
armm = psgc[psgc['interlevel'].isnull()].copy()
armm

Unnamed: 0,code,location,interlevel
32149,99700000,CITY OF ISABELA (NOT A PROVINCE),
36746,129800000,COTABATO CITY (NOT A PROVINCE),


In [51]:
armm['location'] = armm['location'].str.replace('\(Not a Province\)', '')
armm

Unnamed: 0,code,location,interlevel
32149,99700000,CITY OF ISABELA (NOT A PROVINCE),
36746,129800000,COTABATO CITY (NOT A PROVINCE),


In [52]:
armm['location'] = (armm['location']
 .str.replace('^.*CITY OF', '')
 .str.strip()
 .str.replace('CITY$', '')
 .str.strip())
armm

Unnamed: 0,code,location,interlevel
32149,99700000,ISABELA (NOT A PROVINCE),
36746,129800000,COTABATO CITY (NOT A PROVINCE),


## All together now

In [53]:
merged = pd.concat([
    regions,
    provinces,
    districts,
    municipalities,
    cities,
    sub_municipalities,
    barangays,
    armm
], sort=True).sort_index().fillna('')

Are counts still correct?

In [54]:
psgc['interlevel'].value_counts()

Bgy       42044
Mun        1489
City        145
Prov         81
Reg          17
SubMun       14
Dist          4
Name: interlevel, dtype: int64

In [55]:
merged['interlevel'].value_counts()

Bgy       42044
Mun        1489
City        145
Prov         81
Reg          17
SubMun       14
Dist          4
              2
Name: interlevel, dtype: int64

In [56]:
display(len(merged), len(psgc))

43796

43796

Extract alternate locations into a new dataframe:

In [57]:
alt = merged[merged['alt_location'] != ''].copy()
alt['location'] = alt.pop('alt_location')

In [58]:
alt

Unnamed: 0,code,interlevel,location
0,010000000,Reg,ILOCOS REGION
49,012803001,Bgy,ALAY 15-B
70,012803027,Bgy,PAGUETPET
112,012805018,Bgy,CAOAYAN
114,012805020,Bgy,CUBOL
116,012805022,Bgy,ILOILO
117,012805023,Bgy,LABUCAO
124,012805030,Bgy,NALASIN
144,012806007,Bgy,MALITUEK
183,012809004,Bgy,BANGAY


Concat alternate locations and do final cleanup on location name:

In [59]:
clean_psgc = (pd.concat([merged.drop(columns=['alt_location']), alt], ignore_index=True)
              .sort_values('code')
              .reset_index(drop=True))

Normalize `Ñ` and remove remaining `*`s:

In [60]:
clean_psgc['location'] = (clean_psgc['location']
                          .str.replace('Ñ', 'N')
                          .str.replace('\*', ''))

Normalize numbers:

In [61]:
spanish = clean_psgc[clean_psgc['location'].str.contains(' (UNO|DOS|TRES|KUATRO|SINGKO)$')].copy()

  """Entry point for launching an IPython kernel.


In [62]:
for i, s in enumerate([
    'UNO',
    'DOS',
    'TRES',
    'KUATRO',
    'SINGKO',
]):
    spanish['location'] = spanish['location'].str.replace(' {}$'.format(s), ' {}'.format(i + 1))
spanish

Unnamed: 0,code,interlevel,location
9687,41006023,Bgy,MANGHINAO 1
9695,41006032,Bgy,SAN ANDRES 1
11656,43404017,Bgy,BARANGAY 1
11657,43404018,Bgy,BARANGAY 2
11658,43404019,Bgy,BARANGAY 3
29675,86415013,Bgy,ESPERANZA 2
30024,97203016,Bgy,BARANGAY 1
30025,97203017,Bgy,BARANGAY 2
30694,97302017,Bgy,CAMPO 1
30768,97305024,Bgy,SUGBAY 1


In [63]:
roman = clean_psgc[clean_psgc['location'].str.contains('\s(X{0,3})(IX|IV|V?I{0,3})$')].copy()

  """Entry point for launching an IPython kernel.


In [64]:
for i, s in enumerate('I,II,III,IV,V,VI,VII,VIII,IX,X,XI,XII,XIII,XIV,XV,XVI,XVII,XVIII,XIX,XX,XXI,XXII'.split(',')):
    roman['location'] = roman['location'].str.replace(' {}$'.format(s), ' {}'.format(i + 1))
roman

Unnamed: 0,code,interlevel,location
0,010000000,Reg,REGION 1
27,012802024,Bgy,SAN AGUSTIN 1
28,012802025,Bgy,SAN AGUSTIN 2
29,012802027,Bgy,SAN ANDRES 1
30,012802028,Bgy,SAN ANDRES 2
31,012802030,Bgy,SAN GABRIEL 1
32,012802031,Bgy,SAN GABRIEL 2
33,012802033,Bgy,SAN PEDRO 1
34,012802034,Bgy,SAN PEDRO 2
35,012802036,Bgy,SAN ROQUE 1


Provide alternate names for locations with President names

In [65]:
president = clean_psgc[clean_psgc.location.str.contains('PRES\.', flags=re.IGNORECASE)].copy()
president['location'] = president['location'].str.replace('^PRES\.', 'PRESIDENT')

# Add alternative names to the Manila districts

In [66]:
alt_ncr_as_manila = clean_psgc[clean_psgc.interlevel == 'Dist'].copy()
alt_ncr_as_manila['location'] = alt_ncr_as_manila['location'].str.replace('NCR', 'MANILA')

alt_ncr_abbvr_expanded = clean_psgc[clean_psgc.interlevel == 'Dist'].copy()
alt_ncr_abbvr_expanded['location'] = alt_ncr_abbvr_expanded['location'].str.replace('NCR', 'NATIONAL CAPITAL REGION')

alt_districts = pd.concat([alt_ncr_as_manila, alt_ncr_abbvr_expanded], ignore_index=True)
alt_districts

Unnamed: 0,code,interlevel,location
0,133900000,Dist,"MANILA, CITY OF MANILA, FIRST DISTRICT (NOT A ..."
1,137400000,Dist,"MANILA, SECOND DISTRICT (NOT A PROVINCE)"
2,137500000,Dist,"MANILA, THIRD DISTRICT (NOT A PROVINCE)"
3,137600000,Dist,"MANILA, FOURTH DISTRICT (NOT A PROVINCE)"
4,133900000,Dist,"NATIONAL CAPITAL REGION, CITY OF MANILA, FIRST..."
5,137400000,Dist,"NATIONAL CAPITAL REGION, SECOND DISTRICT (NOT ..."
6,137500000,Dist,"NATIONAL CAPITAL REGION, THIRD DISTRICT (NOT A..."
7,137600000,Dist,"NATIONAL CAPITAL REGION, FOURTH DISTRICT (NOT ..."


# Concat the alternates to the main dataframe

In [67]:
clean_psgc = (pd.concat([clean_psgc, spanish, roman, president, alt_districts], ignore_index=True)
              .sort_values('code')
              .reset_index(drop=True))

Last check for weird stuff!

In [68]:
clean_psgc[clean_psgc['location'].str.contains('[^A-Z0-9 \-.,\']')]

Unnamed: 0,code,interlevel,location
4036,21511032,Bgy,CAMALAGGOAN/D LEANO
4046,21511037,Bgy,SAYAD/BIMEKEL
5078,23114025,Bgy,CABESERIA 6 & 24
5637,23133007,Bgy,CADDANGAN/LIMBAUAN
6381,30806003,Bgy,KITANG 2 & LUZ
7417,34907000,Mun,BITULOK & SABANI
7562,34911066,Bgy,TAMPAC II & III
7563,34911066,Bgy,TAMPAC II & 3
8871,36905054,Bgy,CALUIS/COBRA
11845,42115008,Bgy,PALANGUE 2 & 3


We can probably still split with `&` and `/` but this is good enough for now.

## Combine the cleaned up PSGC and remove the duplicates

In [69]:
og_psgc['original'] = True
clean_psgc['original'] = False

clean_psgc = pd.concat([og_psgc, clean_psgc], sort=False)
clean_psgc.drop_duplicates(subset=['code', 'location', 'interlevel'], inplace=True)
clean_psgc.sort_values('code', inplace=True)

Check that we have both the original name and the alternate ones

In [70]:
clean_psgc[clean_psgc.code.str.contains('086000000')]

Unnamed: 0,code,location,interlevel,original
28576,86000000,SAMAR (WESTERN SAMAR),Prov,True
29327,86000000,SAMAR,Prov,False
29326,86000000,WESTERN SAMAR,Prov,False


In [71]:
clean_psgc.to_csv('clean-psgc.csv.gz', index=False, compression='gzip')

And we're done!