# Find a way to embed sequence count information into the location tree

In [1]:
import numpy as np
import pandas as pd
import re

In [59]:
# Load the taxon -> location table and display it.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
taxon_locations_csv = '/Volumes/GoogleDrive/My Drive/covid_data/taxon_locations.csv'
df = pd.read_csv(taxon_locations_csv)
df

Unnamed: 0,name,gisaid_id,sample_date,location_id,region,country,division,location
0,hCoV-19/Algeria/G0640_2265/2020,EPI_ISL_418242,2020-03-08,0,Africa,Algeria,Blida,-1
1,hCoV-19/Algeria/G0638_2264/2020,EPI_ISL_418241,2020-03-02,1,Africa,Algeria,Boufarik,-1
2,hCoV-19/Algeria/G0860_2262/2020,EPI_ISL_420037,2020-03-02,1,Africa,Algeria,Boufarik,-1
3,hCoV-19/DRC/431/2020,EPI_ISL_420847,2020-03-26,2,Africa,Democratic Republic of the Congo,-1,-1
4,hCoV-19/DRC/3089/2020,EPI_ISL_437358,2020-04-19,3,Africa,Democratic Republic of the Congo,Kinshasa,-1
...,...,...,...,...,...,...,...,...
37392,hCoV-19/Uruguay/UY-NYUMC859/2020,EPI_ISL_457955,2020-03-24,1010,South America,Uruguay,Montevideo,-1
37393,hCoV-19/Uruguay/UY-NYUMC858/2020,EPI_ISL_457954,2020-03-23,1010,South America,Uruguay,Montevideo,-1
37394,hCoV-19/Uruguay/UY-NYUMC857/2020,EPI_ISL_457953,2020-03-23,1010,South America,Uruguay,Montevideo,-1
37395,hCoV-19/Uruguay/UY-NYUMC877/2020,EPI_ISL_457973,2020-04-22,1010,South America,Uruguay,Montevideo,-1


In [60]:
# The CSV uses the string '-1' as a placeholder for "not specified".
# Turn it into a real missing value in every location-level column so that
# groupby skips those rows at the corresponding level.
for level_col in ('region', 'country', 'division', 'location'):
    is_placeholder = df[level_col] == '-1'
    df.loc[is_placeholder, level_col] = None
df

Unnamed: 0,name,gisaid_id,sample_date,location_id,region,country,division,location
0,hCoV-19/Algeria/G0640_2265/2020,EPI_ISL_418242,2020-03-08,0,Africa,Algeria,Blida,
1,hCoV-19/Algeria/G0638_2264/2020,EPI_ISL_418241,2020-03-02,1,Africa,Algeria,Boufarik,
2,hCoV-19/Algeria/G0860_2262/2020,EPI_ISL_420037,2020-03-02,1,Africa,Algeria,Boufarik,
3,hCoV-19/DRC/431/2020,EPI_ISL_420847,2020-03-26,2,Africa,Democratic Republic of the Congo,,
4,hCoV-19/DRC/3089/2020,EPI_ISL_437358,2020-04-19,3,Africa,Democratic Republic of the Congo,Kinshasa,
...,...,...,...,...,...,...,...,...
37392,hCoV-19/Uruguay/UY-NYUMC859/2020,EPI_ISL_457955,2020-03-24,1010,South America,Uruguay,Montevideo,
37393,hCoV-19/Uruguay/UY-NYUMC858/2020,EPI_ISL_457954,2020-03-23,1010,South America,Uruguay,Montevideo,
37394,hCoV-19/Uruguay/UY-NYUMC857/2020,EPI_ISL_457953,2020-03-23,1010,South America,Uruguay,Montevideo,
37395,hCoV-19/Uruguay/UY-NYUMC877/2020,EPI_ISL_457973,2020-04-22,1010,South America,Uruguay,Montevideo,


In [61]:
# Sequences per region: non-null GISAID IDs in each group, shown as a dict
dict(
    df.groupby(by='region')['gisaid_id']
      .count()
)

{'Africa': 359,
 'Asia': 3038,
 'Europe': 23941,
 'North America': 7431,
 'Oceania': 2146,
 'South America': 482}

In [62]:
# Sequences per (region, country) pair, shown as a dict keyed by tuple
dict(
    df.groupby(by=['region', 'country'])['gisaid_id']
      .count()
)

{('Africa', 'Algeria'): 3,
 ('Africa', 'Democratic Republic of the Congo'): 133,
 ('Africa', 'Egypt'): 2,
 ('Africa', 'Gambia'): 3,
 ('Africa', 'Ghana'): 15,
 ('Africa', 'Kenya'): 112,
 ('Africa', 'Nigeria'): 17,
 ('Africa', 'Senegal'): 23,
 ('Africa', 'South Africa'): 31,
 ('Africa', 'Uganda'): 20,
 ('Asia', 'Bangladesh'): 24,
 ('Asia', 'Brunei'): 5,
 ('Asia', 'China'): 805,
 ('Asia', 'Georgia'): 15,
 ('Asia', 'Hong Kong'): 140,
 ('Asia', 'India'): 561,
 ('Asia', 'Indonesia'): 19,
 ('Asia', 'Iran'): 34,
 ('Asia', 'Israel'): 222,
 ('Asia', 'Japan'): 123,
 ('Asia', 'Jordan'): 28,
 ('Asia', 'Kazakhstan'): 53,
 ('Asia', 'Kuwait'): 8,
 ('Asia', 'Lebanon'): 11,
 ('Asia', 'Malaysia'): 22,
 ('Asia', 'Myanmar'): 1,
 ('Asia', 'Oman'): 35,
 ('Asia', 'Pakistan'): 4,
 ('Asia', 'Philippines'): 13,
 ('Asia', 'Qatar'): 16,
 ('Asia', 'Saudi Arabia'): 130,
 ('Asia', 'Singapore'): 148,
 ('Asia', 'South Korea'): 29,
 ('Asia', 'Sri Lanka'): 6,
 ('Asia', 'Taiwan'): 105,
 ('Asia', 'Thailand'): 386,
 ('Asia'

In [63]:
# Sequences per (region, country, division) triple, shown as a dict keyed by tuple
dict(
    df.groupby(by=['region', 'country', 'division'])['gisaid_id']
      .count()
)

{('Africa', 'Algeria', 'Blida'): 1,
 ('Africa', 'Algeria', 'Boufarik'): 2,
 ('Africa', 'Democratic Republic of the Congo', 'Kinshasa'): 130,
 ('Africa', 'Democratic Republic of the Congo', 'Sud Kivu'): 2,
 ('Africa', 'Gambia', 'West Coast Region'): 3,
 ('Africa', 'Ghana', 'Greater Accra'): 15,
 ('Africa', 'Nigeria', 'Kwara State'): 1,
 ('Africa', 'Nigeria', 'Lagos'): 1,
 ('Africa', 'Nigeria', 'Ogun State'): 1,
 ('Africa', 'Nigeria', 'Osun State'): 11,
 ('Africa', 'Nigeria', 'Oyo State'): 3,
 ('Africa', 'Senegal', 'Dakar'): 11,
 ('Africa', 'Senegal', 'Mbour'): 4,
 ('Africa', 'Senegal', 'St-Louis'): 1,
 ('Africa', 'Senegal', 'Thies'): 1,
 ('Africa', 'Senegal', 'Touba'): 6,
 ('Africa', 'South Africa', 'Eastern Cape'): 1,
 ('Africa', 'South Africa', 'Gauteng'): 3,
 ('Africa', 'South Africa', 'KwaZulu-Natal'): 24,
 ('Africa', 'South Africa', 'Limpopo'): 1,
 ('Africa', 'South Africa', 'Mpumalanga'): 1,
 ('Asia', 'Bangladesh', 'Chattogram'): 7,
 ('Asia', 'Bangladesh', 'Chuandanga'): 1,
 ('Asi

In [64]:
# Sequences per (region, country, division, location), shown as a dict keyed by tuple
dict(
    df.groupby(by=['region', 'country', 'division', 'location'])['gisaid_id']
      .count()
)

{('Asia', 'Bangladesh', 'Chuandanga', 'Sadar'): 1,
 ('Asia', 'Bangladesh', 'Jhenaidha', 'Shailokupa'): 1,
 ('Asia', 'China', 'Anhui', 'Fuyang'): 1,
 ('Asia', 'China', 'Anhui', 'Suzhou'): 1,
 ('Asia', 'China', 'Guangdong', 'Dongguan'): 4,
 ('Asia', 'China', 'Guangdong', 'Foshan'): 7,
 ('Asia', 'China', 'Guangdong', 'Guangzhou'): 61,
 ('Asia', 'China', 'Guangdong', 'Huizhou'): 10,
 ('Asia', 'China', 'Guangdong', 'Meizhou'): 2,
 ('Asia', 'China', 'Guangdong', 'Shantou'): 2,
 ('Asia', 'China', 'Guangdong', 'Shanwei'): 1,
 ('Asia', 'China', 'Guangdong', 'Shaoguan'): 1,
 ('Asia', 'China', 'Guangdong', 'Shenzhen'): 25,
 ('Asia', 'China', 'Guangdong', 'Yangjiang'): 1,
 ('Asia', 'China', 'Guangdong', 'Zhanjiang'): 2,
 ('Asia', 'China', 'Guangdong', 'Zhongshan'): 1,
 ('Asia', 'China', 'Guangdong', 'Zhuhai'): 11,
 ('Asia', 'China', 'Hubei', 'Tianmen'): 1,
 ('Asia', 'China', 'Hubei', 'Wuhan'): 219,
 ('Asia', 'China', 'Jiangsu', 'Changzhou'): 1,
 ('Asia', 'China', 'Jiangxi', 'Fuzhou'): 3,
 ('Asia',

In [65]:
# Load the table of distinct (region, country, division, location) rows and display it.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
location_map_csv = '/Volumes/GoogleDrive/My Drive/covid_data/location_map.csv'
unique_location_df = pd.read_csv(location_map_csv)
unique_location_df

Unnamed: 0,index,region,country,division,location
0,0,Africa,Algeria,Blida,-1
1,1,Africa,Algeria,Boufarik,-1
2,2,Africa,Democratic Republic of the Congo,-1,-1
3,3,Africa,Democratic Republic of the Congo,Kinshasa,-1
4,4,Africa,Democratic Republic of the Congo,Sud Kivu,-1
...,...,...,...,...,...
1006,1006,South America,Colombia,Valle del Cauca,Yumbo
1007,1007,South America,Ecuador,Pichincha,-1
1008,1008,South America,Ecuador,Quito,-1
1009,1009,South America,Peru,Lima,-1


# Find a way to condense numbers

In [66]:
# Compute the per-region counts in this cell: `region_counts` is otherwise
# first assigned further down the notebook, so on a fresh kernel run from top
# to bottom this cell would raise NameError.
region_counts = dict(df.groupby('region')['gisaid_id'].count())
region_counts

{'Africa': 359,
 'Asia': 3038,
 'Europe': 23941,
 'North America': 7431,
 'Oceania': 2146,
 'South America': 482}

In [67]:
# Thanks to user "rtaft" from https://stackoverflow.com/questions/579310/formatting-long-numbers-as-strings-in-python
def human_format(num):
    """Abbreviate a number to three significant figures with a magnitude suffix.

    The value is first rounded to three significant figures, then divided by
    1000 until it is below 1000 (or the largest suffix is reached), and
    finally rendered without trailing zeros.

    Args:
        num: A real number (anything accepted by the ``.3g`` format spec).

    Returns:
        str: e.g. ``'359'``, ``'23.9K'``, ``'1M'``. Values beyond the trillions
        keep the ``'T'`` suffix with a larger mantissa (``1e18 -> '1000000T'``).
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    # Round to 3 significant figures up front so that e.g. 999999 becomes
    # 1000000 and is reported as '1M' rather than '1000K'.
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop at the last available suffix; without this bound, values >= 1e15
    # would index past the end of `suffixes`.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    # '{:f}' always prints six decimals; strip the zero padding and then a
    # dangling decimal point.
    mantissa = '{:f}'.format(num).rstrip('0').rstrip('.')
    return '{}{}'.format(mantissa, suffixes[magnitude])

human_format(23941)

'23.9K'

# Inject counts into location tree

In [68]:
def _sequence_counts(group_keys):
    """Return {group key: number of non-null GISAID IDs} for the given grouping of `df`."""
    grouped_ids = df.groupby(group_keys)['gisaid_id']
    return dict(grouped_ids.count())


# One lookup table per tree level. The region table is keyed by the bare
# region name; the deeper tables are keyed by tuples of names.
region_counts = _sequence_counts('region')
country_counts = _sequence_counts(['region', 'country'])
division_counts = _sequence_counts(['region', 'country', 'division'])
location_counts = _sequence_counts(['region', 'country', 'division', 'location'])

In [69]:
# Hierarchy levels of the location tree, outermost first. Each name is also
# the column read from `unique_location_df` for that level.
TREE_LEVELS = ('region', 'country', 'division', 'location')


def _make_tree_node(path, count):
    """Build one node of the selection tree.

    Args:
        path: Tuple of names from the region down to this node, e.g.
            ``('Africa', 'Algeria', 'Blida')``. Its length fixes the level.
        count: Number of sequences attributed to this node.

    Returns:
        dict with keys, in order: 'label' and 'value' (the node's own name),
        one key per ancestor level, 'level', 'actions' (a single entry holding
        the raw count as 'title' and its abbreviated form as 'text'), and an
        empty 'children' list.
    """
    name = path[-1]
    node = {'label': name, 'value': name}
    # Record the ancestors (region, country, ...) on the node itself.
    # zip() stops at the shorter input, so only levels above this one are added.
    node.update(zip(TREE_LEVELS, path[:-1]))
    node['level'] = TREE_LEVELS[len(path) - 1]
    node['actions'] = [{
        'className': 'fa fa-info',
        # Plain int rather than a numpy integer, so the tree can be passed to
        # the standard json encoder, which rejects numpy scalar types.
        'title': int(count),
        'text': human_format(count)
    }]
    node['children'] = []
    return node


# Root node
select_tree = {
    'label': 'All',
    'value': 'All',
    'children': []
}

# Count lookups, in the same order as TREE_LEVELS
counts_by_depth = (region_counts, country_counts, division_counts, location_counts)

# Every node created so far, keyed by its full path from the region down.
# This replaces a linear scan of the parent's children on every row.
nodes_by_path = {}

for _row_idx, loc in unique_location_df.iterrows():
    parent = select_tree
    for depth, level in enumerate(TREE_LEVELS, start=1):
        # '-1' marks an unspecified level; there is nothing deeper to add
        if loc[level] == '-1':
            break

        path = tuple(loc[name] for name in TREE_LEVELS[:depth])
        node = nodes_by_path.get(path)
        if node is None:
            # region_counts is keyed by the bare region name, the deeper
            # tables by the full tuple of names.
            count_key = path[0] if depth == 1 else path
            node = _make_tree_node(path, counts_by_depth[depth - 1][count_key])
            nodes_by_path[path] = node
            parent['children'].append(node)
        parent = node

select_tree

{'label': 'All',
 'value': 'All',
 'children': [{'label': 'Africa',
   'value': 'Africa',
   'level': 'region',
   'actions': [{'className': 'fa fa-info', 'title': 359, 'text': '359'}],
   'children': [{'label': 'Algeria',
     'value': 'Algeria',
     'region': 'Africa',
     'level': 'country',
     'actions': [{'className': 'fa fa-info', 'title': 3, 'text': '3'}],
     'children': [{'label': 'Blida',
       'value': 'Blida',
       'region': 'Africa',
       'country': 'Algeria',
       'level': 'division',
       'actions': [{'className': 'fa fa-info', 'title': 1, 'text': '1'}],
       'children': []},
      {'label': 'Boufarik',
       'value': 'Boufarik',
       'region': 'Africa',
       'country': 'Algeria',
       'level': 'division',
       'actions': [{'className': 'fa fa-info', 'title': 2, 'text': '2'}],
       'children': []}]},
    {'label': 'Democratic Republic of the Congo',
     'value': 'Democratic Republic of the Congo',
     'region': 'Africa',
     'level': 'countr