In [1]:
import pandas as pd
import requests

# beautifulsoup4 could scrape the page more precisely, but pd.read_html() is enough here.
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://www.worldometers.info/coronavirus/", timeout=30)
# Fail fast on an HTTP error instead of letting read_html() parse an error page.
r.raise_for_status()

# read_html() returns a list of DataFrames parsed from the HTML, so despite its
# name `df` is a list; the following cells index it as df[0] and df[1].
df = pd.read_html(r.text)

In [2]:
# Inspect what read_html() handed back: the container type first, then its length.
print(type(df), len(df), sep="\n")

<class 'list'>
2


In [3]:
# Show the first element of the list returned by pd.read_html().
df[0]

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop
0,World,2359332,+28566,161950.0,+1903,606675.0,1590707,54526.0,303.00,20.8,,
1,USA,740151,+1359,39068.0,+54,68456.0,632627,13551.0,2236.00,118.0,3732979.0,11278.0
2,Spain,195944,+1528,20453.0,+410,77357.0,98134,7371.0,4191.00,437.0,930230.0,19896.0
3,Italy,175925,,23227.0,,44927.0,107771,2733.0,2910.00,384.0,1305833.0,21598.0
4,France,151793,,19323.0,,35983.0,96487,5833.0,2325.00,296.0,463662.0,7103.0
...,...,...,...,...,...,...,...,...,...,...,...,...
209,Anguilla,3,,,,1.0,2,,200.00,,,
210,Saint Pierre Miquelon,1,,,,,1,,173.00,,,
211,Yemen,1,,,,,1,,0.03,,,
212,China,82735,+16,4632.0,,77062.0,1041,85.0,57.00,3.0,,


In [4]:
# Show the second element of the list, for comparison with df[0].
df[1]

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop
0,World,2330766,+81903,160047.0,+5909,596482.0,1574237,55265.0,299.00,20.5,,
1,China,82719,+27,4632.0,,77029.0,1058,85.0,57.00,3.0,,
2,USA,738792,+29057,39014.0,+1867,68269.0,631509,13551.0,2232.00,118.0,3722145.0,11245.0
3,Spain,194416,+3577,20043.0,+41,74797.0,99576,7371.0,4158.00,429.0,930230.0,19896.0
4,Italy,175925,+3491,23227.0,+482,44927.0,107771,2733.0,2910.00,384.0,1305833.0,21598.0
...,...,...,...,...,...,...,...,...,...,...,...,...
209,Anguilla,3,,,,1.0,2,,200.00,,,
210,Caribbean Netherlands,3,,,,,3,,114.00,,10.0,381.0
211,Saint Pierre Miquelon,1,,,,,1,,173.00,,,
212,Yemen,1,,,,,1,,0.03,,,


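WHAT? The same table twice? Not an exact replica, though: both DataFrames have the same columns, but the values differ (e.g. the World TotalCases is 2359332 in `df[0]` and 2330766 in `df[1]`).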

In [5]:
# Keep only the country name and the two totals, and give them shorter column names.
df_virus_data = (
    df[0]
    .loc[:, ["Country,Other", "TotalCases", "TotalDeaths"]]
    .rename(
        columns={
            "Country,Other": "Country",
            "TotalCases": "Cases",
            "TotalDeaths": "Deaths",
        }
    )
)
df_virus_data

Unnamed: 0,Country,Cases,Deaths
0,World,2359332,161950.0
1,USA,740151,39068.0
2,Spain,195944,20453.0
3,Italy,175925,23227.0
4,France,151793,19323.0
...,...,...,...
209,Anguilla,3,
210,Saint Pierre Miquelon,1,
211,Yemen,1,
212,China,82735,4632.0


In [6]:
# Remove the 'World' and 'Total:' rows with a single boolean mask.
df_virus_data = df_virus_data[~df_virus_data['Country'].isin(['World', 'Total:'])]
df_virus_data

Unnamed: 0,Country,Cases,Deaths
1,USA,740151,39068.0
2,Spain,195944,20453.0
3,Italy,175925,23227.0
4,France,151793,19323.0
5,Germany,144348,4547.0
...,...,...,...
208,South Sudan,4,
209,Anguilla,3,
210,Saint Pierre Miquelon,1,
211,Yemen,1,


In [7]:
# Source: https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.6.zip
# Read the city table from worldcities.csv in the working directory
# (presumably extracted from the archive above — confirm).
df_geolocation = pd.read_csv('./worldcities.csv')

In [8]:
# Keep only rows whose 'capital' value is 'primary'; without this filter,
# administrative capitals such as Sao Paulo and Shanghai would be included too.
df_geolocation = df_geolocation.loc[df_geolocation['capital'].eq('primary')]
df_geolocation

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6850,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
2,Mexico City,Mexico City,19.4424,-99.1310,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
9,Dhaka,Dhaka,23.7231,90.4086,Bangladesh,BD,BGD,Dhaka,primary,12797394.0,1050529279
10,Buenos Aires,Buenos Aires,-34.6025,-58.3975,Argentina,AR,ARG,"Buenos Aires, Ciudad Autónoma de",primary,12795000.0,1032717330
12,Cairo,Cairo,30.0500,31.2500,Egypt,EG,EGY,Al Qāhirah,primary,11893000.0,1818253931
...,...,...,...,...,...,...,...,...,...,...,...
600,Al Quds,Al Quds,31.7764,35.2269,West Bank,XW,XWB,,primary,,1934000006
601,Philipsburg,Philipsburg,18.0255,-63.0450,Sint Maarten,SX,SXM,,primary,,1534859858
602,Gitega,Gitega,-3.4271,29.9246,Burundi,BI,BDI,Gitega,primary,,1108778000
603,Pristina,Pristina,42.6666,21.1724,Kosovo,XK,XKS,Prishtinë,primary,,1901760068


In [9]:
# Check that each country name in the virus table also appears in the geolocation table.
virus_country_list = df_virus_data['Country'].tolist()

geolocation_country_list = df_geolocation['country'].tolist()
# A set gives constant-time lookups instead of rescanning the whole list for every name.
geolocation_country_set = set(geolocation_country_list)

# NOTE: despite the "cities" in its (misspelled) name, this list holds country names;
# the identifier is kept unchanged so anything else that refers to it keeps working.
cities_not_inlcuded_list = [
    virus_country
    for virus_country in virus_country_list
    if virus_country not in geolocation_country_set
]

print("*** NOT included in the geolocation database ***")
print(tuple(cities_not_inlcuded_list)) # Printed as a tuple rather than as the list itself

# Spellings the geolocation file presumably uses for the first few mismatches
# (confirm against worldcities.csv):
# United States
# United Kingdom
# Korea, South
# United Arab Emirates
# Bosnia And Herzegovina

*** NOT included in the geolocation database ***
('USA', 'UK', 'S. Korea', 'UAE', 'Bosnia and Herzegovina', 'North Macedonia', 'Hong Kong', 'Ivory Coast', 'Diamond Princess', 'Channel Islands', 'Palestine', 'Réunion', 'DRC', 'Isle of Man', 'Mayotte', 'Faeroe Islands', 'Martinique', 'Guadeloupe', 'Congo', 'Gibraltar', 'Trinidad and Tobago', 'Myanmar', 'French Guiana', 'Bermuda', 'Cayman Islands', 'Bahamas', 'French Polynesia', 'Macao', 'Saint Martin', 'Antigua and Barbuda', 'Eswatini', 'New Caledonia', 'Saint Kitts and Nevis', 'CAR', 'St. Vincent Grenadines', 'Turks and Caicos', 'Falkland Islands', 'Greenland', 'Montserrat', 'MS Zaandam', 'Gambia', 'Vatican City', 'St. Barth', 'Western Sahara', 'Caribbean Netherlands', 'British Virgin Islands', 'Sao Tome and Principe', 'Anguilla', 'Saint Pierre Miquelon')


In [10]:
# Left-join the geolocation columns onto the virus table by country name.
# Joining through `on=` keeps df_virus_data's own row index in the result.
# NOTE(review): a country with more than one 'primary' row in df_geolocation would
# produce duplicate rows here — confirm the geolocation data has one row per country.
df_joined = df_virus_data.join(
    other=df_geolocation.set_index('country'),
    on='Country',
    how='left',
)
df_joined

Unnamed: 0,Country,Cases,Deaths,city,city_ascii,lat,lng,iso2,iso3,admin_name,capital,population,id
1,USA,740151,39068.0,,,,,,,,,,
2,Spain,195944,20453.0,Madrid,Madrid,40.4000,-3.6834,ES,ESP,Madrid,primary,5567000.0,1.724617e+09
3,Italy,175925,23227.0,Rome,Rome,41.8960,12.4833,IT,ITA,Lazio,primary,3339000.0,1.380383e+09
4,France,151793,19323.0,Paris,Paris,48.8667,2.3333,FR,FRA,Île-de-France,primary,9904000.0,1.250015e+09
5,Germany,144348,4547.0,Berlin,Berlin,52.5218,13.4015,DE,DEU,Berlin,primary,3406000.0,1.276451e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,South Sudan,4,,Juba,Juba,4.8300,31.5800,SS,SSD,Central Equatoria,primary,111975.0,1.728444e+09
209,Anguilla,3,,,,,,,,,,,
210,Saint Pierre Miquelon,1,,,,,,,,,,,
211,Yemen,1,,Sanaa,Sanaa,15.3547,44.2066,YE,YEM,Amānat al ‘Āşimah,primary,2008000.0,1.887751e+09


In [17]:
import datetime as dt

# Timestamp down to the second; %z contributes nothing because now() is a naive datetime.
time_stamp = dt.datetime.now().strftime('%Y%m%d%H%M%S%z')

# Embed the timestamp in the file name: the upload done later through gis.content.add
# seemingly requires a name that has not been used before.
joined_data_file_name = 'joined_data{}.csv'.format(time_stamp)
df_joined.to_csv(joined_data_file_name, index=False)

In [12]:
#import os
#import zipfile
#from copy import deepcopy
#import arcpy
# from arcgis.features import GeoAccessor
import ruamel.yaml as yaml
from arcgis.gis import GIS


# Read the ArcGIS login settings from a local YAML file rather than hard-coding them.
with open("arcgis_login.yaml") as stream:
    try:
        arc_gis_login_dict = yaml.safe_load(stream)
    except yaml.YAMLError:
        # Re-raise after reporting: swallowing the error would leave arc_gis_login_dict
        # undefined, and the failure would only show up later as a NameError.
        print("YAML reading error")
        raise

  pd.datetime,


In [13]:
# Log in to ArcGIS with the URL, user name and password stored under "dev_account".
dev_account = arc_gis_login_dict["dev_account"]
gis = GIS(dev_account['url'], dev_account['username'], dev_account['password'])

In [16]:
# List the folders of the user behind the current `gis` connection.
gis.users.me.folders

[{'username': 'tezzy_o',
  'id': '24dea461778d46e08c66875d785d3be8',
  'title': 'CoronaVirus',
  'created': 1587078553000}]

In [18]:
import os

folder_name = 'CoronaVirus'
file_extension = 'CSV' # Must be all UPPER case, not 'csv': otherwise "Item type not valid."
# Working directory with a trailing separator, obtained in pure Python instead of the
# `!pwd` shell escape (which only works under IPython with a POSIX shell).
full_path_name = os.path.join(os.getcwd(), '')

corona_virus_props = {"title":"Corona_Virus_"+time_stamp,
                   "type":file_extension,
                   "tags":"Corona Virus, World, Cases, Deaths",
                   "snippet":"Corona virus cases and deaths by country.",
                   # Adjacent literals are concatenated by the parser; a backslash
                   # continuation inside the string would embed the next line's
                   # indentation in the description text.
                   "description":"Data downloaded from worldometers: COVID-19 "
                                 "Corona Virus Pandemic https://www.worldometers.info/coronavirus/"}

# Create the folder only when the user does not already have one with this title.
# (Not sure why we need this, as ArcGIS Online does not show folders at all.)
if folder_name not in [folder['title'] for folder in gis.users.me.folders]:
    gis.content.create_folder(folder_name)

# Add the CSV written earlier as a content item in that folder, then publish it.
corona_virus_props_data_item = gis.content.add(item_properties=corona_virus_props,
                                               data=full_path_name+joined_data_file_name,
                                               folder=folder_name)
corona_virus_props_item = corona_virus_props_data_item.publish()
corona_virus_props_item.id

Item type not valid.


RuntimeError: Item type not valid.
(Error Code: 400)

In [16]:
# Run the content analysis on the published item and show the
# 'publishParameters' entry of the result.
source_info = gis.content.analyze(
    item=corona_virus_props_item.id,
    file_type=file_extension,
    location_type='none',
)
source_info['publishParameters']

Unable to analyze item.
Item type'url' is not supported for anzlyzing, The item needs to be of type file


RuntimeError: Unable to analyze item.
Item type'url' is not supported for anzlyzing, The item needs to be of type file
(Error Code: 400)

In [37]:
# Write the joined table to a fixed file name, leaving out the index column.
df_joined.to_csv('./country_list.csv', index=False)

In [58]:
import arcgis
from arcgis.gis import GIS
from IPython.display import display

# Connect to ArcGIS Online as an anonymous user.
# NOTE: this rebinds `gis`, replacing the logged-in connection created in an earlier cell.
gis = GIS()
# Search through the connection created just above: the former `ago_gis` name is not
# defined in any visible cell, so the call would raise NameError on a fresh kernel.
# search_subset = gis.content.search("world") # , item_type = "Feature Layer")
search_subset = gis.content.search('world', outside_org=True)

# Build the map widget. A bare `world_map` expression in the middle of a cell
# displays nothing, so it is omitted here.
world_map = gis.map('world', 1)

layer = gis.content.get('f126c266cae74be4a06873b43684d294')

world_map.add_layer(layer)
# for it in search_subset:
#     display(it)

# subset_item = search_subset[0]
# subset_item

In [64]:
# Set the map centre and zoom level, then display the widget as the cell output.
world_map.center = [7.793813, -173.102401] # [latitude, longitude]
world_map.zoom = 2
world_map

MapView(jupyter_target='notebook', layout=Layout(height='400px', width='100%'), ready=True, zoom=2.0)

In [67]:

# Import the WebMap class from the arcgis.mapping module.
from arcgis.mapping import WebMap

# Fetch the web map item through a fresh anonymous connection ...
gis = GIS()
webmap = gis.content.get('41281c51f9de45edaf1c8ed44bb10e30')

# ... and wrap it in a WebMap, which is displayed as the cell output.
la_parks_trails = WebMap(webmap)
la_parks_trails



MapView(hide_mode_switch=True, layout=Layout(height='400px', width='100%'))

In [68]:
# Take the layers of the web map and report how many there are.
operational_layers = la_parks_trails.layers
n = len(operational_layers)
print("The webmap has {} layers.".format(n))

The webmap has 4 layers.


In [69]:
# Print each layer's id followed, on a tab-indented line, by its URL.
# NOTE: the loop variable rebinds the notebook-level name `layer` assigned in an earlier cell.
for layer in operational_layers:
    print("{}\n\t{}".format(layer['id'], layer['url']))

Trailheads_8053
	https://services3.arcgis.com/GVgbJbqm8hXASVYi/arcgis/rest/services/Trailheads/FeatureServer/0
Trails_7558_8861
	https://services3.arcgis.com/GVgbJbqm8hXASVYi/arcgis/rest/services/Trails/FeatureServer/0
Trails_7558
	https://services3.arcgis.com/GVgbJbqm8hXASVYi/arcgis/rest/services/Trails/FeatureServer/0
Parks_and_Open_Space_8268
	https://services3.arcgis.com/GVgbJbqm8hXASVYi/arcgis/rest/services/Parks_and_Open_Space/FeatureServer/0


In [None]:
# Add the initial CSV file as a content item (the publishing step is not done in this cell).
# NOTE(review): `now_ts` and `my_csv` are not defined in any cell visible here, and this
# cell has no execution count — confirm where they come from before running it.
item_prop = {'title':'USA Capitals spreadsheet ' + now_ts}
csv_item = gis.content.add(item_properties=item_prop, data=my_csv)
csv_item