In [1]:
from aurora import download
import numpy as np
import plotly.express as px

In [2]:
# some helper functions

def add_jitter(df):
    """Return a copy of ``df`` with small random offsets added to its coordinates.

    Reads the ``coordinate_x`` and ``coordinate_y`` columns and adds four
    columns: ``jitter_x``/``jitter_y`` (the random offsets) and
    ``coord_jx``/``coord_jy`` (original coordinate plus offset).

    Each row gets its own random magnitude *and* its own random sign, so
    points sharing a coordinate are scattered in all directions instead of
    all being pushed the same way.

    The input frame is left untouched; callers must use the return value.
    Randomness comes from the global ``np.random`` state, so call
    ``np.random.seed(...)`` beforehand for reproducible output.
    """
    n_rows = len(df)
    # Work on a copy: ``df`` is frequently a filtered slice of another frame,
    # and writing columns into it mutates caller state (and can trigger
    # pandas' SettingWithCopyWarning).
    jittered = df.copy()

    # ``size=n_rows`` matters: without it np.random.choice returns a single
    # scalar and every row would receive the same sign.
    sign_y = np.random.choice([-1, 1], size=n_rows)
    sign_x = np.random.choice([-1, 1], size=n_rows)

    # randint's upper bound is exclusive, so magnitudes are 0..98 steps.
    jittered['jitter_y'] = 0.00001 * np.random.randint(0, 99, n_rows) * sign_y
    jittered['jitter_x'] = 0.00002 * np.random.randint(0, 99, n_rows) * sign_x
    jittered['coord_jx'] = jittered['coordinate_x'] + jittered['jitter_x']
    jittered['coord_jy'] = jittered['coordinate_y'] + jittered['jitter_y']
    return jittered

def plot(df, hover_data, hover_name, write_html=None):
    """Draw a density map of the (jittered) coordinates in ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``coordinate_x`` and ``coordinate_y``; it is passed
        through ``add_jitter`` before plotting.
    hover_data, hover_name
        Forwarded unchanged to ``px.density_mapbox``.
    write_html : str, path-like, writeable object or None
        When truthy, the figure is written there as HTML instead of being
        displayed. For path targets the parent directory is created first so
        a missing output folder does not abort the notebook run.
        When falsy, the figure is shown inline via ``fig.show()``.
    """
    import os
    from pathlib import Path

    df_jitter = add_jitter(df)
    fig = px.density_mapbox(df_jitter,
                            lat="coord_jy", lon="coord_jx",
                            hover_data=hover_data,
                            hover_name=hover_name,
                            radius=3, zoom=1)
    fig.update_layout(
        mapbox_style="open-street-map",
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
    )

    if write_html:
        # Only path-like targets have a directory to prepare; file-like
        # objects are handed straight to plotly.
        if isinstance(write_html, (str, os.PathLike)):
            Path(write_html).parent.mkdir(parents=True, exist_ok=True)
        fig.write_html(write_html)
    else:
        fig.show()

## all geocoding results

In [3]:
# Pull every geocoding result, discard rows whose coordinates fall outside
# the valid longitude/latitude ranges, and write the density map to disk.
query = 'select * from biosample_geocode'
df = download(query)

lon = df['coordinate_x']
lat = df['coordinate_y']
out_of_bounds = (lon < -180) | (lon > 180) | (lat < -90) | (lat > 90)
df = df[~out_of_bounds]

plot(
    df,
    hover_data=["coordinate_y", "coordinate_x"],
    hover_name="geo_text_extracted",
    write_html="plots/geocoded_all.html",
)

### potentially useless labels

In [4]:
import pycountry
# Lower-cased official country names known to pycountry.
country_names = set(entry.name.lower() for entry in pycountry.countries)

# Extra lower-cased place keywords accepted in addition to the pycountry names.
other_valid_geo_keywords = {
    'usa', 'russia', 'korea', 'nigeria', 'venezuela', 'iran', 'tanzania',
    'england', 'cape verde', 'czech republic', 'taiwan', 'kosovo',
    'macedonia', 'bolivia',
}

# Everything that counts as a recognisable geographic keyword.
all_valid_geo_keywords = country_names.union(other_valid_geo_keywords)

def has_country(text, keywords=None):
    """Return True if ``text`` contains any known geographic keyword.

    Parameters
    ----------
    text : object
        The value to inspect. Anything that is not a ``str`` (e.g. ``None``
        or a NaN from a missing database value) is treated as containing no
        keyword and yields ``False`` instead of raising.
    keywords : iterable of str, optional
        Lower-cased keywords to search for. Defaults to the module-level
        ``all_valid_geo_keywords``.

    Notes
    -----
    Matching is a case-insensitive *substring* test: the keywords are stored
    lower-cased, so the text is lower-cased before comparison.
    NOTE(review): because this is a substring test, a short keyword can match
    inside a longer unrelated word; confirm that is acceptable.
    """
    if keywords is None:
        keywords = all_valid_geo_keywords
    if not isinstance(text, str):
        return False
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)

# Keep only the rows whose extracted text mentions no valid geographic keyword.
mentions_keyword = df['geo_text_extracted'].apply(has_country)
df = df[~mentions_keyword]

plot(
    df,
    hover_data=["coordinate_y", "coordinate_x"],
    hover_name="geo_text_extracted",
    write_html="plots/geocoded_no_keyword.html",
)

## RdRP-pos geospatial info

In [5]:
# Coordinates of SRA runs that also appear in rdrp_pos (inner join on sra_id).
query = '''
select sgc.sra_id, biosample_id, release_date, coordinate_x, coordinate_y, from_text
from srarun_geo_coordinates sgc 
inner join rdrp_pos rp on (sgc.sra_id = rp.sra_id)
'''
df = download(query)

# Discard rows whose longitude or latitude lies outside the valid range.
bad_lon = (df['coordinate_x'] < -180) | (df['coordinate_x'] > 180)
bad_lat = (df['coordinate_y'] < -90) | (df['coordinate_y'] > 90)
df = df[~(bad_lon | bad_lat)]

plot(
    df,
    hover_data=["coordinate_y", "coordinate_x", 'from_text', 'biosample_id'],
    hover_name="sra_id",
    write_html="plots/geocoded_rdrp_pos.html",
)