In [1]:
import datetime
from pathlib import Path

In [3]:
import pandas as pd

In [4]:
# Stamp embedded in the survey export's filename (looks like YYYYMMDD — confirm).
INGEST_DATETIME = "20200315"

In [5]:
# Current working directory; the survey CSV is resolved relative to it.
p = Path.cwd()

In [6]:
# Load the raw survey export for the configured ingest stamp.
df = pd.read_csv(p / f"jogl-opencovid19-initial-survey-{INGEST_DATETIME}.csv")

In [7]:
# Short snake_case names for the survey columns; they are assigned to the
# loaded frame positionally, so the order here must match the CSV's.
column_names = [
    "timestamp",
    "name",
    "email",
    "whatsapp",
    "city",
    "affiliation",
    "field",
    "skills",
    "has_biolab",
    "how_found_opencovid",
    "why_interested",
    "questions_concerns",
    "subobjective_interest",
    "other_interest",
    "resource_needs",
    "resource_availability",
    "has_google_group_access",
    "google_account",
    "has_slack",
    "comments",
]

In [10]:
# Keep a handle on the survey's original headers, then swap in the short names.
# The assignment is positional: it relies on the CSV's column order matching
# `column_names`.
old_columns = df.columns
df.columns = column_names

## Geolocate/geocode locations

In [11]:
import geopy
from geopy.extra.rate_limiter import RateLimiter

In [15]:
# Nominatim geocoding client, identified to the service by a custom user agent.
geolocator = geopy.Nominatim(user_agent="jogl-covid19")
# Rate-limited wrapper around `geocode`: at least 2 seconds between calls.
geocoder = RateLimiter(geolocator.geocode, min_delay_seconds=2)

In [104]:
# One row per distinct free-text city answer, keeping the first occurrence's
# row label from the survey frame.
location_df = (
    df[["city"]]
    .drop_duplicates()
    .rename(columns={"city": "original_city"})
)

In [105]:
# Reuse geocodes cached by an earlier run so only new cities need a lookup.
# NOTE: unpickling can execute arbitrary code; only load a cache file that this
# notebook wrote itself.
if Path("geocodes.pickle").exists():
    geocodes_df = pd.read_pickle("geocodes.pickle")
    location_df = location_df.merge(geocodes_df, how="left", on="original_city")
else:
    # First run, no cache yet: create the empty `geocode` column that the
    # following cells fill in, and renumber rows 0..n-1 the same way the merge
    # branch does so both paths produce the same index.
    location_df = location_df.reset_index(drop=True).assign(geocode=None)

In [106]:
# Inspect the city table after the cache merge (if a cache was found).
location_df

Unnamed: 0,original_city,geocode
0,"Seattle, WA, USA","(Seattle, King County, Washington, United Stat..."
1,"Victoria, BC","(Victoria, Capital Regional District, British ..."
2,"Bellevue, WA, USA","(Bellevue, King County, Washington, United Sta..."
3,"Coventry, UK","(Coventry, West Midlands Combined Authority, W..."
4,USA Baltimore MD,"(Hanover Street Bridge, Port Covington, Baltim..."
...,...,...
104,Toronto,"(Toronto, Golden Horseshoe, Ontario, M6K 1X9, ..."
105,Hanover,"(Hannover, Region Hannover, Niedersachsen, Deu..."
106,"Vancouver, Canada","(Vancouver, Metro Vancouver Regional District,..."
107,Madrid / Shenzhen,


In [107]:
# Look up only the cities that still have no geocode; rows that already have
# one are left untouched.
location_df.loc[location_df["geocode"].isna(), "geocode"] = (
    location_df.loc[location_df["geocode"].isna(), "original_city"]
    .apply(geocoder)
)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Europe in March and April',), **{}).
Traceback (most recent call last):
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/urllib/request.py", line 1319, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1230, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1276, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1225, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1004, in _send_output
    self.send(msg)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 944, in send
    self.connec

In [108]:
# Re-inspect the table after geocoding the cities that had no geocode yet.
location_df

Unnamed: 0,original_city,geocode
0,"Seattle, WA, USA","(Seattle, King County, Washington, United Stat..."
1,"Victoria, BC","(Victoria, Capital Regional District, British ..."
2,"Bellevue, WA, USA","(Bellevue, King County, Washington, United Sta..."
3,"Coventry, UK","(Coventry, West Midlands Combined Authority, W..."
4,USA Baltimore MD,"(Hanover Street Bridge, Port Covington, Baltim..."
...,...,...
104,Toronto,"(Toronto, Golden Horseshoe, Ontario, M6K 1X9, ..."
105,Hanover,"(Hannover, Region Hannover, Niedersachsen, Deu..."
106,"Vancouver, Canada","(Vancouver, Metro Vancouver Regional District,..."
107,Madrid / Shenzhen,


In [53]:
# Keep the geocoder's raw payload next to each resolved location; rows without
# a geocode end up with a missing value after index alignment.
location_df["geocode_raw"] = (
    location_df.loc[location_df["geocode"].notna(), "geocode"]
    .apply(lambda location: location.raw)
)

In [61]:
# Expand each raw geocoder payload into columns, one row per resolved city.
# The payloads are passed as a plain list: handing `from_records` the Series
# itself makes pandas inspect `data[0]`, a label lookup that can fail when the
# row labelled 0 is not among the geocoded rows.
geocodes = pd.DataFrame.from_records(
    location_df.loc[lambda d: ~d.geocode.isna(), "geocode_raw"].tolist()
)

In [62]:
# `geocodes` carries a fresh 0..n-1 index while the filtered `location_df` rows
# keep their own labels, so assigning the Series directly would align on index
# and shift or drop queries. Both are built from the same filter in the same
# order, so attach the values by position instead. A length mismatch now raises
# rather than silently misaligning.
geocoded_rows = location_df.loc[lambda d: ~d.geocode.isna()]
# "query" is only created further down the notebook, where it starts out as a
# copy of the original answer; fall back to that on a top-to-bottom run.
query_column = "query" if "query" in geocoded_rows.columns else "original_city"
geocodes["query"] = geocoded_rows[query_column].to_numpy()

In [65]:
# Export the flattened geocoder records; the row index is not written.
geocodes.to_csv("geocodes.csv", index=False)

In [69]:
# Persist the resolved (answer, geocode) pairs as the cache file that the
# cache-loading cell looks for on the next run.
(
    location_df
    .loc[location_df["geocode"].notna(), ["original_city", "geocode"]]
    .to_pickle("geocodes.pickle")
)

## Location cleaning by hand

In [111]:
# Answers the geocoder could not resolve; these need manual correction.
location_df.loc[location_df["geocode"].isna()]

Unnamed: 0,original_city,geocode,query
20,Europe in March and April,,Europe in March and April
24,Stockholm+Berlin,,Stockholm+Berlin
55,"Cologne/Hamburg, Germany",,"Cologne/Hamburg, Germany"
98,"Johanessburg, South Africa",,"Johanessburg, South Africa"
107,Madrid / Shenzhen,,Madrid / Shenzhen


In [112]:
# Start with the query equal to the raw answer; the unresolved answers get
# hand-written replacements in the next cell.
location_df["query"] = location_df["original_city"]

In [113]:
def correct_location(loc):
    """Translate a free-text city answer into a list of geocoder queries.

    Answers that failed to geocode (a typo, a vague phrase, or several places
    in one field) map to one or more hand-written replacement queries. Any
    other answer is returned unchanged, wrapped in a single-element list.
    """
    manual_fixes = {
        "Europe in March and April": ["Europe"],
        "Johanessburg, South Africa": ["Johannesburg, South Africa"],
        "Stockholm+Berlin": ["Stockholm", "Berlin"],
        "Cologne/Hamburg, Germany": ["Cologne, Germany", "Hamburg, Germany"],
        "Madrid / Shenzhen": ["Madrid", "Shenzhen"],
    }
    return manual_fixes.get(loc, [loc])

# Replace each answer with its list of queries (a one-element list when no
# correction applies).
location_df["query"] = location_df["original_city"].apply(correct_location)

In [114]:
# One row per query: an answer naming several places becomes several rows that
# share the same index label.
location_df = location_df.explode("query")

In [117]:
# Spot-check near-duplicate spellings of the same city. `na=False` treats a
# missing answer as "no match" instead of producing an NA mask that `.loc`
# rejects; `regex=False` makes the search a plain substring match.
location_df.loc[
    lambda d: d.original_city.str.contains("Barce", na=False, regex=False), :
]

Unnamed: 0,original_city,geocode,query
91,Barcelona,"(Barcelona, Barcelonès, Barcelona, Catalunya, ...",Barcelona
92,Barcelon,"(Barcelon, 3, Pipinstraße, Georgsviertel, Alts...",Barcelon


In [118]:
# Look at the survey rows behind the near-duplicate city spellings.
# `na=False` keeps respondents who left the city blank from breaking the mask;
# `regex=False` makes the search a plain substring match.
# NOTE: this displays respondents' personal details — clear the output before
# sharing or committing the notebook.
df.loc[lambda d: d.city.str.contains("Barce", na=False, regex=False), :]

Unnamed: 0,timestamp,name,email,whatsapp,city,affiliation,field,skills,has_biolab,how_found_opencovid,why_interested,questions_concerns,subobjective_interest,other_interest,resource_needs,resource_availability,has_google_group_access,google_account,has_slack,comments
102,3/14/2020 11:50:21,Bruno Lusic,brunolusic@gmail.com,447591112092,Barcelona,Imperial College,"Economics, finance and big data","Big data, investments and finance",No,,"Contribute with experience, capital and networ...",,"Sub-Objective 3 - Testing the Protocols, Sub-O...",Funding and logistics,,,No,,Yes,
103,3/14/2020 12:15:46,Bruno Lusic,brunolusic@gmail.com,7591112092,Barcelon,Imperial College London,"Economics, finance and big data","Big data, finance and investments",No,,"Contribute with experience, capital and networ...",,"Sub-Objective 3 - Testing the Protocols, Sub-O...",Funding and logistics,,,No,,Yes,


In [116]:
# Rows still lacking a geocode, now showing their corrected queries.
location_df.loc[location_df["geocode"].isna()]

Unnamed: 0,original_city,geocode,query
20,Europe in March and April,,Europe
24,Stockholm+Berlin,,Stockholm
24,Stockholm+Berlin,,Berlin
55,"Cologne/Hamburg, Germany",,"Cologne, Germany"
55,"Cologne/Hamburg, Germany",,"Hamburg, Germany"
98,"Johanessburg, South Africa",,"Johannesburg, South Africa"
107,Madrid / Shenzhen,,Madrid
107,Madrid / Shenzhen,,Shenzhen


In [120]:
# Second geocoding pass: look up the hand-corrected queries for the rows that
# are still unresolved.
no_geocode = location_df.geocode.isna()
# Both sides use the same mask, so the looked-up values line up with the rows
# being filled even though exploded rows share index labels.
location_df.loc[no_geocode, "geocode"] = location_df.loc[no_geocode, "query"].apply(geocoder)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Johannesburg, South Africa',), **{}).
Traceback (most recent call last):
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/urllib/request.py", line 1319, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1230, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1276, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1225, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 1004, in _send_output
    self.send(msg)
  File "/home/jyh/.pyenv/versions/3.8.1/lib/python3.8/http/client.py", line 944, in send
    self.conne

In [158]:
# Flatten the raw geocoder payloads into columns, keeping `location_df`'s row
# labels. Queries the geocoder could not resolve are left as missing values;
# skip them so `.raw` is never read from a missing entry.
geocoded = location_df["geocode"].dropna()
geo_df = pd.DataFrame.from_records(
    [g.raw for g in geocoded],
    index=geocoded.index,
)

In [180]:
import folium
from folium.plugins import FastMarkerCluster, Fullscreen

# Plot the geocoded cities on an interactive, clustered map.
m = folium.Map(
    #no_wrap=True,
    crs='EPSG3857'
    )

# The raw geocoder payload is expected to carry coordinates as strings
# (TODO confirm against the Nominatim output format); convert them to floats
# and drop rows without coordinates so the markers get real [lat, lon] numbers.
loc_list = (geo_df[['lat', 'lon']]
            .astype(float)
            .dropna()
            .values
            .tolist())
FastMarkerCluster(loc_list).add_to(m)

# Alternative rendering (disabled): one circle marker per location with the
# query text as popup.
#for i in range(len(geo_df)):
#    folium.CircleMarker(location=loc_list[i], 
#                        radius=10,
#                        fill=True,
#                        color='#40B8AF',
#                        popup=location_df["query"].tolist()[i],
#                        ).add_to(m)

Fullscreen(position='bottomright').add_to(m)

# Last expression: render the map as the cell output.
m