# Clean up location data

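Standardize the location fields of the patient metadata: fix typos, expand abbreviations, and harmonize alternative spellings.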

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

In [2]:
# Absolute form of the `data` folder one level above the working directory
data_dir = (Path('..') / 'data').resolve()
data_dir

PosixPath('/Volumes/GoogleDrive/My Drive/covid_data')

In [3]:
# Collect every patient metadata table (*.tsv) under <data_dir>/patient_meta,
# in sorted filename order, into one DataFrame with a fresh 0..n-1 index.
patient_meta_files = sorted((data_dir / 'patient_meta').glob('*.tsv'))
print('Collecting {} patient metadata files...'.format(len(patient_meta_files)), end='', flush=True)

# Read all files first and concatenate once: calling pd.concat inside the
# loop re-copies every previously loaded row on each iteration.
# skiprows=2 drops the first two lines of each file
# (presumably non-tabular preamble lines -- TODO confirm against the raw files).
patient_meta_frames = [
    pd.read_csv(meta_file, sep='\t', skiprows=2)
    for meta_file in patient_meta_files
]

# pd.concat raises on an empty list, so fall back to an empty frame
# when no metadata files were found
patient_meta_df = (
    pd.concat(patient_meta_frames, ignore_index=True)
    if patient_meta_frames
    else pd.DataFrame()
)

patient_meta_df

Collecting 12 patient metadata files...

Unnamed: 0,Virus name,Accession ID,Collection date,Location,Host,Additional location information,Gender,Patient age,Patient status,Passage,Specimen,Additional host information,Lineage,Clade
0,hCoV-19/Wuhan/IVDC-HB-01/2019,EPI_ISL_402119,2019-12-30,Asia / China / Hubei / Wuhan,Human,,Female,49,,"Virus Isolate, Passage 1",Alveolar lavage fluid,,,
1,hCoV-19/Wuhan/IVDC-HB-04/2020,EPI_ISL_402120,2020-01-01,Asia / China / Hubei / Wuhan,Human,,Male,61,,Original,Alveolar lavage fluid,,,
2,hCoV-19/Wuhan/IVDC-HB-05/2019,EPI_ISL_402121,2019-12-30,Asia / China / Hubei / Wuhan,Human,,Male,32,,Original,Alveolar lavage fluid,,,
3,hCoV-19/Wuhan/IPBCAMS-WH-01/2019,EPI_ISL_402123,2019-12-24,Asia / China / Hubei / Wuhan,Human,,Male,65,,Original,Bronchoalveolar lavage fluid,,,
4,hCoV-19/Wuhan/WIV04/2019,EPI_ISL_402124,2019-12-30,Asia / China / Hubei / Wuhan,Human,,Female,49,Live,Original,Bronchoalveolar lavage fluid,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33427,hCoV-19/England/NORW-E94D4/2020,EPI_ISL_453599,2020-04-22,Europe / United Kingdom / England,Human,,unknown,unknown,unknown,Original,unknown,,B.2.2,V
33428,hCoV-19/Netherlands/Leiden-0002/2020,EPI_ISL_454416,2020-03-09,Europe / Netherlands / Leiden,Human,,unknown,unknown,unknown,Vero E6 P2,Throat swab,,B.1,G
33429,hCoV-19/Meizhou/MZ01/2020,EPI_ISL_454417,2020-02-13,Asia / China / Meizhou,Human,,unknown,unknown,unknown,unknown,Oronasopharynx,,A,S
33430,hCoV-19/Meizhou/MZ02/2020,EPI_ISL_454418,2020-02-15,Asia / China / Meizhou,Human,,unknown,unknown,unknown,unknown,Oronasopharynx,,A,S


In [40]:
# Location data is stored in one column, "region / country / division / location".
# Split it into (up to) four columns and attach the identifying metadata.
location_df = (
    patient_meta_df['Location'].str.split('/', expand=True)
    .iloc[:, :4] # Only take 4 columns
    # Trim surrounding whitespace with the vectorized .str accessor.
    # Unlike a per-cell `x.strip()`, it passes missing values (None/NaN)
    # through untouched instead of raising on a NaN Location.
    .apply(lambda column: column.str.strip())
    # Rename columns
    .rename(columns={0: 'region', 1: 'country', 2: 'division', 3: 'location'})
    # Placeholder for missing values, so that it will still
    # be caught by groupby() later on
    .fillna(-1)
    # Re-add metadata columns (aligned on the shared row index);
    # sample_date is converted to datetime on the way in
    .assign(
        name=patient_meta_df['Virus name'],
        gisaid_id=patient_meta_df['Accession ID'],
        sample_date=pd.to_datetime(patient_meta_df['Collection date'], yearfirst=True),
    )
)

location_df

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
0,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IVDC-HB-01/2019,EPI_ISL_402119,2019-12-30
1,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IVDC-HB-04/2020,EPI_ISL_402120,2020-01-01
2,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IVDC-HB-05/2019,EPI_ISL_402121,2019-12-30
3,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IPBCAMS-WH-01/2019,EPI_ISL_402123,2019-12-24
4,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/WIV04/2019,EPI_ISL_402124,2019-12-30
...,...,...,...,...,...,...,...
33427,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-E94D4/2020,EPI_ISL_453599,2020-04-22
33428,Europe,Netherlands,Leiden,-1,hCoV-19/Netherlands/Leiden-0002/2020,EPI_ISL_454416,2020-03-09
33429,Asia,China,Meizhou,-1,hCoV-19/Meizhou/MZ01/2020,EPI_ISL_454417,2020-02-13
33430,Asia,China,Meizhou,-1,hCoV-19/Meizhou/MZ02/2020,EPI_ISL_454418,2020-02-15


# South Africa locations

In [41]:
# Expand the division abbreviations used for South Africa to full names
south_africa_divisions = {
    'EC': 'Eastern Cape',
    'KZN': 'KwaZulu-Natal',
    'GP': 'Gauteng',
    'LP': 'Limpopo',
    'MP': 'Mpumalanga',
}

in_south_africa = location_df['country'].eq('South Africa')
for abbreviation, full_name in south_africa_divisions.items():
    needs_fix = in_south_africa & location_df['division'].eq(abbreviation)
    location_df.loc[needs_fix, 'division'] = full_name

# Display the South African rows to check the result
location_df[in_south_africa]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
534,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/R03006/2020,EPI_ISL_417186,2020-03-07
7909,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/KRISP-02/2020,EPI_ISL_421572,2020-03-23
7910,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/KRISP-06/2020,EPI_ISL_421573,2020-03-31
7911,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/KRISP-07/2020,EPI_ISL_421574,2020-04-01
7912,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/KRISP-011/2020,EPI_ISL_421575,2020-04-01
7913,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/KRISP-012/2020,EPI_ISL_421576,2020-04-01
11279,Africa,South Africa,Gauteng,-1,hCoV-19/South Africa/R02827/2020,EPI_ISL_430297,2020-03-06
15690,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/R02606/2020,EPI_ISL_435058,2020-03-11
15691,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/R05475/2020,EPI_ISL_435059,2020-03-20
16329,Africa,South Africa,KwaZulu-Natal,-1,hCoV-19/South Africa/KRISP-04/2020,EPI_ISL_436684,2020-03-31


# Korea --> South Korea

In [42]:
# Rows whose country is given as just "Korea" are relabelled "South Korea"
labelled_korea = location_df['country'].eq('Korea')
location_df.loc[labelled_korea, 'country'] = 'South Korea'

# Display all South Korean rows, including the ones just relabelled
location_df[location_df['country'].eq('South Korea')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
110,Asia,South Korea,-1,-1,hCoV-19/South Korea/KUMC03/2020,EPI_ISL_413513,2020-02-27
111,Asia,South Korea,-1,-1,hCoV-19/South Korea/KUMC04/2020,EPI_ISL_413514,2020-02-27
112,Asia,South Korea,-1,-1,hCoV-19/South Korea/KUMC05/2020,EPI_ISL_413515,2020-02-27
113,Asia,South Korea,-1,-1,hCoV-19/South Korea/KUMC06/2020,EPI_ISL_413516,2020-02-27
2113,Asia,South Korea,-1,-1,hCoV-19/South Korea/KUMC01/2020,EPI_ISL_413017,2020-02-06
2114,Asia,South Korea,-1,-1,hCoV-19/South Korea/KUMC02/2020,EPI_ISL_413018,2020-02-06
3767,Asia,South Korea,-1,-1,hCoV-19/Korea/BA-ACH_2604/2020,EPI_ISL_420799,2020-02-27
3768,Asia,South Korea,-1,-1,hCoV-19/Korea/BA-ACH_2718/2020,EPI_ISL_420800,2020-02-29
3769,Asia,South Korea,-1,-1,hCoV-19/Korea/BA-ACH_2719/2020,EPI_ISL_420801,2020-02-29
7613,Asia,South Korea,-1,-1,hCoV-19/Korea/KCDC2001/2020,EPI_ISL_425117,2020-02-23


# Taiwan

In [43]:
# Fix the capitalization of "New Taipei City"
# (matched on the division value alone, whatever the country)
lowercase_city = location_df['division'].eq('New Taipei city')
location_df.loc[lowercase_city, 'division'] = 'New Taipei City'

# Display the Taiwanese rows to check the result
location_df[location_df['country'].eq('Taiwan')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
31,Asia,Taiwan,Kaohsiung,-1,hCoV-19/Taiwan/2/2020,EPI_ISL_406031,2020-01-23
86,Asia,Taiwan,Kaohsiung,-1,hCoV-19/Taiwan/2/2020,EPI_ISL_406031,2020-01-23
161,Asia,Taiwan,Taipei,-1,hCoV-19/Taiwan/NTU03/2020,EPI_ISL_413592,2020-03-02
303,Asia,Taiwan,Taoyuan,-1,hCoV-19/Taiwan/CGMH-CGU-03/2020,EPI_ISL_415741,2020-02-26
304,Asia,Taiwan,Taoyuan,-1,hCoV-19/Taiwan/CGMH-CGU-04/2020,EPI_ISL_415742,2020-02-27
...,...,...,...,...,...,...,...
29545,Asia,Taiwan,New Taipei City,-1,hCoV-19/Taiwan/TSGH-29/2020,EPI_ISL_447255,2020-03-17
29546,Asia,Taiwan,Taipei,-1,hCoV-19/Taiwan/TSGH-30/2020,EPI_ISL_447256,2020-03-20
29547,Asia,Taiwan,New Taipei City,-1,hCoV-19/Taiwan/TSGH-31/2020,EPI_ISL_447257,2020-03-25
32689,Asia,Taiwan,Taoyuan,-1,hCoV-19/Taiwan/CGMH-CGU-26/2020,EPI_ISL_452178,2020-04-19


# France

In [44]:
# Canonical division name -> every alternative spelling seen in the data.
# Rows are matched on the division value alone, whatever the country.
france_division_spellings = {
    'Bourgogne-Franche-Comté': [
        'Bourgogne Franche comte',
        'Bourgogne-France-Comté',
        'Bourgogne-Franche Comte',
        'Bourgogne',
    ],
    'Auvergne-Rhône-Alpes': ['ARA'],
    'Centre-Val de Loire': ['Centre - Val de Loire'],
    'Grand Est': ['Grand-Est', 'Grand-est'],
    'Hauts-de-France': ['Hauts De France', 'Hauts de France'],
    'Île-de-France': ['IDF', 'Ile De France', 'Ile de France', 'Ile-de-France'],
}

for canonical_name, spellings in france_division_spellings.items():
    misspelt = location_df['division'].isin(spellings)
    location_df.loc[misspelt, 'division'] = canonical_name

# Display the French rows to check the result
location_df[location_df['country'].eq('France')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
45,Europe,France,Île-de-France,Paris,hCoV-19/France/IDF0372/2020,EPI_ISL_406596,2020-01-23
46,Europe,France,Île-de-France,Paris,hCoV-19/France/IDF0373/2020,EPI_ISL_406597,2020-01-23
100,Europe,France,Île-de-France,Paris,hCoV-19/France/IDF0372/2020,EPI_ISL_406596,2020-01-23
101,Europe,France,Île-de-France,Paris,hCoV-19/France/IDF0373/2020,EPI_ISL_406597,2020-01-23
951,Europe,France,Auvergne-Rhône-Alpes,Privas,hCoV-19/France/ARA094100/2020,EPI_ISL_418412,2020-03-15
...,...,...,...,...,...,...,...
23096,Europe,France,-1,-1,hCoV-19/France/10003SN/2020,EPI_ISL_447719,2020-03-01
23097,Europe,France,-1,-1,hCoV-19/France/10025GM/2020,EPI_ISL_447720,2020-03-01
23098,Europe,France,-1,-1,hCoV-19/France/10002PM/2020,EPI_ISL_447721,2020-03-01
23099,Europe,France,-1,-1,hCoV-19/France/10026SL/2020,EPI_ISL_447722,2020-03-01


# Georgia

In [45]:
# Rows for the country Georgia that are filed under the Europe region
# are moved to the Asia region
georgia_rows = location_df['country'].eq('Georgia')
filed_under_europe = georgia_rows & location_df['region'].eq('Europe')
location_df.loc[filed_under_europe, 'region'] = 'Asia'

# Display the Georgian rows to check the result
location_df[georgia_rows]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
1481,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-390/2020,EPI_ISL_416477,2020-03-08
1482,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-673/2020,EPI_ISL_416478,2020-03-14
1483,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-273/2020,EPI_ISL_416479,2020-03-05
1484,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-537/2020,EPI_ISL_416480,2020-03-11
1485,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-712/2020,EPI_ISL_416481,2020-03-16
1486,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb/2020,EPI_ISL_416482,2020-03-13
2342,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-54/2020,EPI_ISL_415641,2020-02-27
2343,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-477/2020,EPI_ISL_415642,2020-03-10
2344,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-468/2020,EPI_ISL_415643,2020-03-10
2345,Asia,Georgia,Tbilisi,-1,hCoV-19/Georgia/Tb-82/2020,EPI_ISL_415644,2020-02-28


# Germany

Move Munich division into Bavaria division

In [46]:
# German rows that give Munich as the division: Munich becomes the
# location and Bavaria becomes the division
german_rows = location_df['country'].eq('Germany')
munich_as_division = german_rows & location_df['division'].eq('Munich')
location_df.loc[munich_as_division, ['division', 'location']] = ['Bavaria', 'Munich']

# Display the German rows to check the result
location_df[german_rows]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
54,Europe,Germany,Bavaria,Munich,hCoV-19/Germany/BavPat1/2020,EPI_ISL_406862,2020-01-28
109,Europe,Germany,Bavaria,Munich,hCoV-19/Germany/BavPat1/2020,EPI_ISL_406862,2020-01-28
267,Europe,Germany,North Rhine Westphalia,Heinsberg District,hCoV-19/Germany/NRW-02-1/2020,EPI_ISL_414497,2020-02-25
268,Europe,Germany,North Rhine Westphalia,Heinsberg District,hCoV-19/Germany/NRW-03/2020,EPI_ISL_414498,2020-02-26
269,Europe,Germany,North Rhine Westphalia,Heinsberg District,hCoV-19/Germany/NRW-04/2020,EPI_ISL_414499,2020-02-26
...,...,...,...,...,...,...,...
32730,Europe,Germany,Frankfurt,-1,hCoV-19/Germany/FrankfurtFFM3/2020,EPI_ISL_452219,2020-03-02
32731,Europe,Germany,Frankfurt,-1,hCoV-19/Germany/FrankfurtFFM4/2020,EPI_ISL_452220,2020-03-02
32732,Europe,Germany,Frankfurt,-1,hCoV-19/Germany/FrankfurtFFM5/2020,EPI_ISL_452221,2020-03-02
32733,Europe,Germany,Frankfurt,-1,hCoV-19/Germany/FrankfurtFFM6/2020,EPI_ISL_452222,2020-03-02


# Poland

In [47]:
# Correct the misspelt division "Dolnoslakie"
# (matched on the division value alone, whatever the country)
misspelt_division = location_df['division'].eq('Dolnoslakie')
location_df.loc[misspelt_division, 'division'] = 'Dolnoslaskie'

# Display the Polish rows to check the result
location_df[location_df['country'].eq('Poland')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
1492,Europe,Poland,Zielonogorskie,-1,hCoV-19/Poland/PL_P1/2020,EPI_ISL_416488,2020-03-03
9862,Europe,Poland,Pomerania,-1,hCoV-19/Poland/Pom1/2020,EPI_ISL_428209,2020-04-06
9866,Europe,Poland,Pomorskie,-1,hCoV-19/Poland/1104795/2020,EPI_ISL_428232,2020-03-18
9867,Europe,Poland,Pomorskie,-1,hCoV-19/Poland/1105644/2020,EPI_ISL_428233,2020-03-18
9868,Europe,Poland,Pomorskie,-1,hCoV-19/Poland/1105973/2020,EPI_ISL_428234,2020-03-19
...,...,...,...,...,...,...,...
31646,Europe,Poland,Malopolska,-1,hCoV-19/Poland/PL_P22/2020,EPI_ISL_451983,2020-04-02
31647,Europe,Poland,Malopolska,Kraków,hCoV-19/Poland/PL_P23/2020,EPI_ISL_451984,2020-03-29
31648,Europe,Poland,Malopolska,Kraków,hCoV-19/Poland/PL_P24/2020,EPI_ISL_451985,2020-03-29
31649,Europe,Poland,Malopolska,Kraków,hCoV-19/Poland/PL_P25/2020,EPI_ISL_451986,2020-04-07


# Russia

In [48]:
# Both fixes below are restricted to rows whose country is Russia, so a
# division called "Moscow" in any other country is left untouched.
in_russia = location_df['country'].eq('Russia')

# "Moscow" is filed under the "Moscow Region" division
location_df.loc[
    in_russia & location_df['division'].eq('Moscow'),
    'division'
] = 'Moscow Region'

# Collapse the spelling variants of Saint Petersburg into one name
saint_petersburg_spellings = ['Saint-Petersburg', 'St. Petersburg', 'St.Petersburg']
location_df.loc[
    in_russia & location_df['division'].isin(saint_petersburg_spellings),
    'division'
] = 'Saint Petersburg'

# Display the Russian rows to check the result
location_df[in_russia]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
301,Europe,Russia,Saint Petersburg,-1,hCoV-19/Russia/StPetersburg-3524/2020,EPI_ISL_415710,2020-03-15
3904,Europe,Russia,Moscow Region,-1,hCoV-19/Russia/Moscow_PMVL-1/2020,EPI_ISL_421275,2020-03-18
4349,Europe,Russia,Saint Petersburg,-1,hCoV-19/Russia/StPetersburg-RII3992/2020,EPI_ISL_420080,2020-03-18
4350,Europe,Russia,Saint Petersburg,-1,hCoV-19/Russia/StPetersburg-RII3997/2020,EPI_ISL_420081,2020-03-18
9569,Europe,Russia,Saint Petersburg,-1,hCoV-19/Russia/StPetersburg-RII4144S/2020,EPI_ISL_427307,2020-03-20
...,...,...,...,...,...,...,...
31629,Europe,Russia,Moscow Region,-1,hCoV-19/Russia/SCPM-O-03/2020,EPI_ISL_451965,2020-03-20
31630,Europe,Russia,Moscow Region,-1,hCoV-19/Russia/SCPM-O-05/2020,EPI_ISL_451967,2020-03-21
31631,Europe,Russia,Moscow Region,-1,hCoV-19/Russia/SCPM-O-06/2020,EPI_ISL_451968,2020-03-18
31632,Europe,Russia,Moscow Region,-1,hCoV-19/Russia/SCPM-O-07/2020,EPI_ISL_451969,2020-03-22


# Spain

In [49]:
# Canonical division name -> spellings to replace. Rows are matched on the
# division value alone, whatever the country. Each fix is listed exactly
# once; repeating an assignment (e.g. for Catalunya) would be a no-op.
spain_division_spellings = {
    'Basque Country': ['BasqueCountry', 'Basque_Country'],
    'Castilla-La Mancha': ['Castilla La Mancha'],
    'Castilla y León': ['Castilla y Leon'],
    'Catalonia': ['Catalunya'],
    'Comunitat Valenciana': ['Comunitat_Valenciana'],
    'La Rioja': ['LaRioja'],
}

for canonical_name, spellings in spain_division_spellings.items():
    misspelt = location_df['division'].isin(spellings)
    location_df.loc[misspelt, 'division'] = canonical_name

# Location fixes inside Comunitat Valenciana. The mask is built only after
# the division names above were normalized, so rows originally spelled
# "Comunitat_Valenciana" are included.
valencia_location_fixes = {
    'Bonrepos_i_Mirambell': 'Bonrepos i Mirambell',
    'Canet_d\'En_Berenguer': 'Canet d\'En Berenguer',
    'El_Puig': 'El Puig',
    'Grau_de_Sagunt': 'Grau de Sagunt',
    'Palma_de_Gandia': 'Palma de Gandia',
    'Tavernes_de_la_Valldigna': 'Tavernes de la Valldigna',
    'Valencia_h': 'Valencia',
}

in_valencia = location_df['division'].eq('Comunitat Valenciana')
for raw_location, clean_location in valencia_location_fixes.items():
    needs_fix = in_valencia & location_df['location'].eq(raw_location)
    location_df.loc[needs_fix, 'location'] = clean_location

# Display the Spanish rows to check the result
location_df[location_df['country'].eq('Spain')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
265,Europe,Spain,Comunitat Valenciana,Valencia,hCoV-19/Spain/Valencia1/2020,EPI_ISL_414495,2020-03-08
266,Europe,Spain,Comunitat Valenciana,Valencia,hCoV-19/Spain/Valencia2/2020,EPI_ISL_414496,2020-03-04
553,Europe,Spain,Comunitat Valenciana,Valencia,hCoV-19/Spain/Valencia9/2020,EPI_ISL_417205,2020-03-02
554,Europe,Spain,Comunitat Valenciana,Valencia,hCoV-19/Spain/Valencia10/2020,EPI_ISL_417206,2020-03-06
768,Europe,Spain,Madrid,-1,hCoV-19/Spain/Madrid_H2_16/2020,EPI_ISL_417952,2020-03-09
...,...,...,...,...,...,...,...
32991,Europe,Spain,Basque Country,Vitoria,hCoV-19/Spain/COV000857/2020,EPI_ISL_452695,2020-03-12
32992,Europe,Spain,Basque Country,Vitoria,hCoV-19/Spain/COV000856/2020,EPI_ISL_452696,2020-03-11
32993,Europe,Spain,Basque Country,Vitoria,hCoV-19/Spain/COV000855/2020,EPI_ISL_452697,2020-03-11
32994,Europe,Spain,Basque Country,Vitoria,hCoV-19/Spain/COV000853/2020,EPI_ISL_452698,2020-03-11


# Sweden

In [50]:
# Restore the diacritic in "Västerbotten"
# (matched on the division value alone, whatever the country)
missing_diacritic = location_df['division'].eq('Vasterbotten')
location_df.loc[missing_diacritic, 'division'] = 'Västerbotten'

# Display the Swedish rows to check the result
location_df[location_df['country'].eq('Sweden')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
7584,Europe,Sweden,Västerbotten,-1,hCoV-19/Sweden/RV-FOI-1/2020,EPI_ISL_424703,2020-03-15
9854,Europe,Sweden,Västerbotten,-1,hCoV-19/Sweden/RV-FOI-6/2020,EPI_ISL_428201,2020-03-20
10774,Europe,Sweden,Västerbotten,-1,hCoV-19/Sweden/RV-FOI-4/2020,EPI_ISL_428148,2020-03-18
10888,Europe,Sweden,Halland,-1,hCoV-19/Sweden/20-08142/2020,EPI_ISL_429115,2020-03-11
10889,Europe,Sweden,Halland,-1,hCoV-19/Sweden/20-08143/2020,EPI_ISL_429116,2020-03-15
...,...,...,...,...,...,...,...
32744,Europe,Sweden,Vastra Gotaland,-1,hCoV-19/Sweden/20-07908/2020,EPI_ISL_452238,2020-04-27
32745,Europe,Sweden,Vastra Gotaland,-1,hCoV-19/Sweden/20-07909/2020,EPI_ISL_452239,2020-04-25
32746,Europe,Sweden,Stockholm,-1,hCoV-19/Sweden/20-07921/2020,EPI_ISL_452240,2020-04-24
32747,Europe,Sweden,Halland,-1,hCoV-19/Sweden/20-14014/2020,EPI_ISL_452241,2020-04-24


# Switzerland

In [51]:
# Division renames applied to Swiss rows only
switzerland_divisions = {
    'Basel': 'Basel-Stadt',
    'Genève': 'Geneva',
}

in_switzerland = location_df['country'].eq('Switzerland')
for old_name, new_name in switzerland_divisions.items():
    needs_fix = in_switzerland & location_df['division'].eq(old_name)
    location_df.loc[needs_fix, 'division'] = new_name

# Display the Swiss rows to check the result
location_df[in_switzerland]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
172,Europe,Switzerland,Tessin,-1,hCoV-19/Switzerland/TI9486/2020,EPI_ISL_413996,2020-02-24
173,Europe,Switzerland,Geneva,-1,hCoV-19/Switzerland/GE3895/2020,EPI_ISL_413997,2020-02-26
174,Europe,Switzerland,Argovie,-1,hCoV-19/Switzerland/AG0361/2020,EPI_ISL_413999,2020-02-27
188,Europe,Switzerland,Geneva,-1,hCoV-19/Switzerland/GE3121/2020,EPI_ISL_414019,2020-02-27
189,Europe,Switzerland,Geneva,-1,hCoV-19/Switzerland/GE5373/2020,EPI_ISL_414020,2020-02-27
...,...,...,...,...,...,...,...
31619,Europe,Switzerland,Zurich,-1,hCoV-19/Switzerland/101127/2020,EPI_ISL_451795,2020-03-30
31620,Europe,Switzerland,Bern,-1,hCoV-19/Switzerland/101129/2020,EPI_ISL_451796,2020-03-30
31621,Europe,Switzerland,Zurich,-1,hCoV-19/Switzerland/101130/2020,EPI_ISL_451797,2020-03-30
31622,Europe,Switzerland,Zurich,-1,hCoV-19/Switzerland/101131/2020,EPI_ISL_451798,2020-03-30


# United Kingdom

In [52]:
# Correct the misspelt location "Northamtonshire" on UK rows
in_uk = location_df['country'].eq('United Kingdom')
misspelt_location = in_uk & location_df['location'].eq('Northamtonshire')
location_df.loc[misspelt_location, 'location'] = 'Northamptonshire'

# Display the UK rows to check the result
location_df[in_uk]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
124,Europe,United Kingdom,Wales,-1,hCoV-19/Wales/PHW1/2020,EPI_ISL_413555,2020-02-27
125,Europe,United Kingdom,Wales,-1,hCoV-19/Wales/PHW2/2020,EPI_ISL_413556,2020-03-04
175,Europe,United Kingdom,England,-1,hCoV-19/England/200940527/2020,EPI_ISL_414005,2020-02-25
176,Europe,United Kingdom,England,-1,hCoV-19/England/200990724/2020,EPI_ISL_414006,2020-02-28
177,Europe,United Kingdom,England,-1,hCoV-19/England/200990725/2020,EPI_ISL_414007,2020-02-28
...,...,...,...,...,...,...,...
33423,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-E93AA/2020,EPI_ISL_453595,2020-04-25
33424,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-E93B9/2020,EPI_ISL_453596,2020-04-27
33425,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-E947A/2020,EPI_ISL_453597,2020-04-22
33426,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-E94B6/2020,EPI_ISL_453598,2020-04-22


# Canada

In [53]:
# Expand the division abbreviations used for Canada to full names
canada_divisions = {
    'MB': 'Manitoba',
    'NB': 'New Brunswick',
    'NL': 'Newfoundland and Labrador',
    'NS': 'Nova Scotia',
    'SK': 'Saskatchewan',
}

in_canada = location_df['country'].eq('Canada')
for abbreviation, full_name in canada_divisions.items():
    needs_fix = in_canada & location_df['division'].eq(abbreviation)
    location_df.loc[needs_fix, 'division'] = full_name

# Display the Canadian rows to check the result
location_df[in_canada]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
1011,North America,Canada,Saskatchewan,-1,hCoV-19/Canada/SK_4/2020,EPI_ISL_418810,2020-03-09
1012,North America,Canada,New Brunswick,-1,hCoV-19/Canada/NB_6/2020,EPI_ISL_418811,2020-03-12
1013,North America,Canada,Manitoba,-1,hCoV-19/Canada/MB_8/2020,EPI_ISL_418812,2020-03-12
1014,North America,Canada,Manitoba,-1,hCoV-19/Canada/MB_10/2020,EPI_ISL_418813,2020-03-13
1015,North America,Canada,Nova Scotia,-1,hCoV-19/Canada/NS_13/2020,EPI_ISL_418814,2020-03-13
...,...,...,...,...,...,...,...
31013,North America,Canada,Quebec,-1,hCoV-19/Canada/Qc-L00241245/2020,EPI_ISL_450315,2020-03-27
31014,North America,Canada,Quebec,-1,hCoV-19/Canada/Qc-L00241292/2020,EPI_ISL_450316,2020-03-27
31015,North America,Canada,Quebec,-1,hCoV-19/Canada/Qc-L00241344/2020,EPI_ISL_450317,2020-03-27
31016,North America,Canada,Quebec,-1,hCoV-19/Canada/Qc-L00241355/2020,EPI_ISL_450318,2020-03-27


# Mexico

In [54]:
# Spell out the "CDMX" abbreviation on Mexican rows
in_mexico = location_df['country'].eq('Mexico')
abbreviated = in_mexico & location_df['division'].eq('CDMX')
location_df.loc[abbreviated, 'division'] = 'Mexico City'

# Display the Mexican rows to check the result
location_df[in_mexico]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
1112,North America,Mexico,Mexico City,-1,hCoV-19/Mexico/CDMX-InDRE_01/2020,EPI_ISL_412972,2020-02-27
5495,North America,Mexico,Mexico City,-1,hCoV-19/Mexico/CDMX-INER_01/2020,EPI_ISL_424345,2020-03-12
5498,North America,Mexico,Mexico City,-1,hCoV-19/Mexico/CDMX-INER_02/2020,EPI_ISL_424348,2020-03-13
7535,North America,Mexico,Mexico City,-1,hCoV-19/Mexico/CDMX-INER_03/2020,EPI_ISL_424625,2020-03-13
7536,North America,Mexico,Mexico City,-1,hCoV-19/Mexico/CDMX-INER_04/2020,EPI_ISL_424626,2020-03-15
7537,North America,Mexico,Mexico City,-1,hCoV-19/Mexico/CDMX-INER_05/2020,EPI_ISL_424627,2020-03-16
7576,North America,Mexico,Chiapas,-1,hCoV-19/Mexico/Chiapas-InDRE_02/2020,EPI_ISL_424666,2020-02-29
7577,North America,Mexico,Estado de Mexico,-1,hCoV-19/Mexico/EdoMex-InDRE_03/2020,EPI_ISL_424667,2020-03-04
7580,North America,Mexico,Queretaro,-1,hCoV-19/Mexico/Queretaro-InDRE_04/2020,EPI_ISL_424670,2020-03-10
7582,North America,Mexico,Puebla,-1,hCoV-19/Mexico/Puebla-InDRE_05/2020,EPI_ISL_424672,2020-03-11


# Who misspelled north???

In [55]:
# Repair the region typo "Noth America"
misspelt_region = location_df['region'].eq('Noth America')
location_df.loc[misspelt_region, 'region'] = 'North America'

# USA

In [56]:
# None of the fixes in this cell filter on country; rows are matched on
# their division (and location) values only.

# "District of Columbia" is stored under the name "Washington DC"
named_district = location_df['division'].eq('District of Columbia')
location_df.loc[named_district, 'division'] = 'Washington DC'


# California: canonical location name -> spellings to replace
california_locations = {
    'Grand Princess Cruise Ship': ['Grand Princess', 'Grand Princess cruise ship'],
    'San Diego County': ['San Diego'],
    'San Francisco County': ['San Francisco'],
}
in_california = location_df['division'].eq('California')
for canonical_name, spellings in california_locations.items():
    needs_fix = in_california & location_df['location'].isin(spellings)
    location_df.loc[needs_fix, 'location'] = canonical_name


# Two-letter division codes:
#   - I'm assuming LA is Louisiana, and not Los Angeles
#   - I'm assuming NY is NY State, and not NYC
division_codes = {
    'LA': 'Louisiana',
    'NY': 'New York',
}
for code, state_name in division_codes.items():
    location_df.loc[location_df['division'].eq(code), 'division'] = state_name

# Move NYC into New York
nyc_as_division = location_df['division'].eq('New York City')
location_df.loc[nyc_as_division, ['division', 'location']] = ['New York', 'New York City']

# New York counties. The mask is built after the NY / New York City fixes
# above, so rows that were just moved into "New York" are included.
new_york_locations = {
    'Nassau County': ['Nassau', 'Nassau county'],
    'Rockland County': ['Rockland'],
    'Suffolk County': ['Suffolk', 'Suffolk county'],
}
in_new_york = location_df['division'].eq('New York')
for canonical_name, spellings in new_york_locations.items():
    needs_fix = in_new_york & location_df['location'].isin(spellings)
    location_df.loc[needs_fix, 'location'] = canonical_name

# Wisconsin
wisconsin_locations = {
    'Campbellsport': ['Campbellsp'],
    'Jackson County': ['Jackson'],
}
in_wisconsin = location_df['division'].eq('Wisconsin')
for canonical_name, spellings in wisconsin_locations.items():
    needs_fix = in_wisconsin & location_df['location'].isin(spellings)
    location_df.loc[needs_fix, 'location'] = canonical_name


# Display the USA rows to check the result
location_df[location_df['country'].eq('USA')]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
27,North America,USA,Illinois,Chicago,hCoV-19/USA/IL1/2020,EPI_ISL_404253,2020-01-21
28,North America,USA,Washington,Snohomish County,hCoV-19/USA/WA1/2020,EPI_ISL_404895,2020-01-19
32,North America,USA,California,Los Angeles,hCoV-19/USA/CA1/2020,EPI_ISL_406034,2020-01-23
33,North America,USA,California,Orange County,hCoV-19/USA/CA2/2020,EPI_ISL_406036,2020-01-22
34,North America,USA,Arizona,Phoenix,hCoV-19/USA/AZ1/2020,EPI_ISL_406223,2020-01-22
...,...,...,...,...,...,...,...
32792,North America,USA,Michigan,-1,hCoV-19/USA/MI-MDHHS-SC20421/2020,EPI_ISL_452295,2020-03-17
32793,North America,USA,Michigan,-1,hCoV-19/USA/MI-MDHHS-SC20422/2020,EPI_ISL_452296,2020-03-16
32794,North America,USA,Michigan,-1,hCoV-19/USA/MI-MDHHS-SC20423/2020,EPI_ISL_452297,2020-03-16
32795,North America,USA,Michigan,-1,hCoV-19/USA/MI-MDHHS-SC20424/2020,EPI_ISL_452298,2020-03-17


# Australia

In [57]:
# Division renames applied to Australian rows only
australia_divisions = {
    'NSW': 'New South Wales',
    'Northern territory': 'Northern Territory',
}

in_australia = location_df['country'].eq('Australia')
for old_name, new_name in australia_divisions.items():
    needs_fix = in_australia & location_df['division'].eq(old_name)
    location_df.loc[needs_fix, 'division'] = new_name

# Display the Australian rows to check the result
location_df[in_australia]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
53,Oceania,Australia,Victoria,Clayton,hCoV-19/Australia/VIC01/2020,EPI_ISL_406844,2020-01-25
108,Oceania,Australia,Victoria,Clayton,hCoV-19/Australia/VIC01/2020,EPI_ISL_406844,2020-01-25
163,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW08/2020,EPI_ISL_413594,2020-02-28
164,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW09/2020,EPI_ISL_413595,2020-02-28
165,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW10/2020,EPI_ISL_413596,2020-02-28
...,...,...,...,...,...,...,...
31479,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW379/2020,EPI_ISL_451639,2020-04-13
31480,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW380/2020,EPI_ISL_451640,2020-04-13
31481,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW381/2020,EPI_ISL_451641,2020-04-06
31482,Oceania,Australia,New South Wales,Sydney,hCoV-19/Australia/NSW382/2020,EPI_ISL_451642,2020-04-21


# Brazil

In [58]:
# Division renames applied to Brazilian rows only
brazil_divisions = {
    'Minas gerais': 'Minas Gerais',
    'São Paulo': 'Sao Paulo',
}

in_brazil = location_df['country'].eq('Brazil')
for old_name, new_name in brazil_divisions.items():
    needs_fix = in_brazil & location_df['division'].eq(old_name)
    location_df.loc[needs_fix, 'division'] = new_name

# Remove Sao Paulo location: it is replaced by the same -1 placeholder
# that marks missing values elsewhere in location_df
sao_paulo_location = in_brazil & location_df['location'].eq('Sao Paulo')
location_df.loc[sao_paulo_location, 'location'] = -1

# Display the Brazilian rows to check the result
location_df[in_brazil]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
184,South America,Brazil,Sao Paulo,-1,hCoV-19/Brazil/SPBR-03/2020,EPI_ISL_414014,2020-03-02
185,South America,Brazil,Sao Paulo,-1,hCoV-19/Brazil/SPBR-06/2020,EPI_ISL_414015,2020-02-29
186,South America,Brazil,Sao Paulo,-1,hCoV-19/Brazil/SPBR-05/2020,EPI_ISL_414016,2020-02-29
187,South America,Brazil,Sao Paulo,-1,hCoV-19/Brazil/SPBR-04/2020,EPI_ISL_414017,2020-03-04
202,South America,Brazil,Rio de Janeiro,Rio de Janeiro,hCoV-19/Brazil/RJ-314/2020,EPI_ISL_414045,2020-03-04
...,...,...,...,...,...,...,...
14100,South America,Brazil,Minas Gerais,-1,hCoV-19/Brazil/CV44/2020,EPI_ISL_429697,2020-03-18
14101,South America,Brazil,Minas Gerais,-1,hCoV-19/Brazil/CV45/2020,EPI_ISL_429698,2020-03-20
14102,South America,Brazil,Minas Gerais,-1,hCoV-19/Brazil/CV46/2020,EPI_ISL_429699,2020-03-20
31269,South America,Brazil,Amapa,-1,hCoV-19/Brazil/AP161167-IEC/2020,EPI_ISL_450873,2020-03-17
