In [87]:
import pandas as pd
import numpy as np
import pycountry 
import pycountry_convert as pc
import plotly.express as px
import plotly
%matplotlib inline

In [88]:
path = 'data/kaggle/week4/'
data = pd.read_csv(path + 'train.csv',
                   parse_dates=['Date'],
                   dtype={
                       'Id': 'int',
                       'Province_State': 'str',
                       'Country_Region': 'str',
                       'ConfirmedCases': 'float',
                       'Fatalities': 'float'
                   })
data.replace([np.inf, -np.inf], np.nan)
print(data.shape)
data.sample(n=10).head(10)

(24727, 6)


Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
3173,4574,British Columbia,Canada,2020-02-04,1.0,0.0
10307,14858,,Grenada,2020-02-28,0.0,0.0
16505,23786,,Saint Vincent and the Grenadines,2020-04-04,7.0,0.0
23660,34126,Falkland Islands (Malvinas),United Kingdom,2020-03-01,0.0,0.0
9498,13699,Saint Barthelemy,France,2020-02-09,0.0,0.0
24337,35118,,Vietnam,2020-01-27,2.0,0.0
17878,25789,,Sweden,2020-02-15,1.0,0.0
19158,27629,Colorado,US,2020-03-02,0.0,0.0
11221,16192,,Iran,2020-01-25,0.0,0.0
13084,18860,,Madagascar,2020-03-11,0.0,0.0


In [89]:
country_data = data.groupby(['Country_Region', 'Date'], as_index=False)['ConfirmedCases', 'Fatalities'].sum()

In [90]:
d = {}
def get_iso(country):
        if country in d:
            return d[country]
        try:
            o = pycountry.countries.get(name=country)
            if o is None:
                c = pycountry.countries.search_fuzzy(country)
                o = c[0]
            d[country] = o.alpha_3
            
            return o.alpha_3
        except:
            if 'Congo' in country:
                country = 'Congo'
            elif country == 'Korea, South' or country == 'South Korea':
                country = 'Korea, Republic of'
            elif country == 'Taiwan*':
                country = 'Taiwan'
            elif country == 'Burma':
                country = 'Myanmar'
            elif country == 'West Bank and Gaza':
                country = 'Gaza'
            else:
                return country
            os = pycountry.countries.search_fuzzy(country)
            d[country] = os[0].alpha_3
            
            return os[0].alpha_3       

In [91]:
country_data['iso'] = country_data['Country_Region'].apply(lambda c: get_iso(c))
country_data['Date'] = country_data['Date'].astype(str)

In [95]:
#get_iso() is slow
country_data.to_feather('data/country_data')
#country_data = pd.read_feather('data/country_data')
country_data

Unnamed: 0,Country_Region,Date,ConfirmedCases,Fatalities,iso,ln(ConfirmedCases),ln(Fatalitites)
0,Afghanistan,2020-01-22,0.0,0.0,AFG,0.000000,0.000000
1,Afghanistan,2020-01-23,0.0,0.0,AFG,0.000000,0.000000
2,Afghanistan,2020-01-24,0.0,0.0,AFG,0.000000,0.000000
3,Afghanistan,2020-01-25,0.0,0.0,AFG,0.000000,0.000000
4,Afghanistan,2020-01-26,0.0,0.0,AFG,0.000000,0.000000
...,...,...,...,...,...,...,...
14531,Zimbabwe,2020-04-05,9.0,1.0,ZWE,2.302585,0.693147
14532,Zimbabwe,2020-04-06,10.0,1.0,ZWE,2.397895,0.693147
14533,Zimbabwe,2020-04-07,11.0,2.0,ZWE,2.484907,1.098612
14534,Zimbabwe,2020-04-08,11.0,3.0,ZWE,2.484907,1.386294


In [96]:
country_data['ln(ConfirmedCases)'] = np.log(country_data['ConfirmedCases']+1)
country_data['ln(Fatalitites)'] = np.log(country_data['Fatalities']+1)

# Visualization
* Choropleth map
* plotlines 

hover over parts of the vis to see details

### By Country

In [97]:
px.choropleth(country_data, 
              locations="iso", 
              color="ln(ConfirmedCases)", 
              hover_name="Country_Region", 
              hover_data=["ConfirmedCases", 'Fatalities'] ,
              animation_frame="Date",
              color_continuous_scale=px.colors.sequential.tempo, 
              title='Total ln(Confirmed Cases) growth World Map')

In [19]:
top_10_country = country_data[country_data['Date']=='2020-04-09'].nlargest(10, 'ConfirmedCases')['Country_Region']
top_10_country = list(top_10_country)

In [21]:
px.line(country_data[country_data['Country_Region'].isin(top_10_country)], 
            x='Date', y='ln(ConfirmedCases)', color='Country_Region', hover_data=['Fatalities'])

## North America

In [83]:
import datetime
north_df = data[data['Country_Region'].isin(['Canada', 'US'])]
north_df_last = data[data['Date'] > datetime.datetime(2020, 3, 15)]
north_df_last

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
54,55,,Afghanistan,2020-03-16,21.0,0.0
55,56,,Afghanistan,2020-03-17,22.0,0.0
56,57,,Afghanistan,2020-03-18,22.0,0.0
57,58,,Afghanistan,2020-03-19,22.0,0.0
58,59,,Afghanistan,2020-03-20,24.0,0.0
...,...,...,...,...,...,...
24722,35643,,Zimbabwe,2020-04-05,9.0,1.0
24723,35644,,Zimbabwe,2020-04-06,10.0,1.0
24724,35645,,Zimbabwe,2020-04-07,11.0,2.0
24725,35646,,Zimbabwe,2020-04-08,11.0,3.0


### Canada 

In [85]:
provinces_data_last = north_df_last.groupby(['Date', 'Country_Region', 'Province_State'],
                                 as_index=False)['ConfirmedCases', 'Fatalities'].sum()
rovinces_data = north_df.groupby(['Date', 'Country_Region', 'Province_State'],
                                 as_index=False)['ConfirmedCases', 'Fatalities'].sum()
can_df = provinces_data_last[provinces_data['Country_Region'] == 'Canada']
can_df.tail(10)

Unnamed: 0,Date,Country_Region,Province_State,ConfirmedCases,Fatalities
3202,2020-04-09,Canada,Manitoba,221.0,3.0
3203,2020-04-09,Canada,New Brunswick,108.0,0.0
3204,2020-04-09,Canada,Newfoundland and Labrador,232.0,2.0
3205,2020-04-09,Canada,Northwest Territories,5.0,0.0
3206,2020-04-09,Canada,Nova Scotia,342.0,1.0
3207,2020-04-09,Canada,Ontario,5759.0,200.0
3208,2020-04-09,Canada,Prince Edward Island,25.0,0.0
3209,2020-04-09,Canada,Quebec,10912.0,216.0
3210,2020-04-09,Canada,Saskatchewan,271.0,3.0
3211,2020-04-09,Canada,Yukon,7.0,0.0


In [86]:
px.line(can_df, x='Date', y='ConfirmedCases', color='Province_State', hover_data=['Fatalities'])

In [81]:
import plotly.graph_objects as go


fig = go.Figure(data=go.Choropleth(
    locations=df['code'], # Spatial coordinates
    z = df['total exports'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Reds',
    colorbar_title = "Millions USD",
))

fig.update_layout(
    title_text = '2011 US Agriculture Exports by State',
    geo_scope='usa', # limite map scope to USA
)

fig.show()

px.choropleth(country_data, 
              locations=, 
              color="ln(ConfirmedCases)", 
              hover_name="Country_Region", 
              hover_data=["ConfirmedCases", 'Fatalities'] ,
              animation_frame="Date",
              color_continuous_scale=px.colors.sequential.tempo, 
              title='Total ln(Confirmed Cases) growth World Map')

In [82]:
provinces_data['Date'][:58]

0    2020-03-16
1    2020-03-16
2    2020-03-16
3    2020-03-16
4    2020-03-16
5    2020-03-16
6    2020-03-16
7    2020-03-16
8    2020-03-16
9    2020-03-16
10   2020-03-16
11   2020-03-16
12   2020-03-16
13   2020-03-16
14   2020-03-16
15   2020-03-16
16   2020-03-16
17   2020-03-16
18   2020-03-16
19   2020-03-16
20   2020-03-16
21   2020-03-16
22   2020-03-16
23   2020-03-16
24   2020-03-16
25   2020-03-16
26   2020-03-16
27   2020-03-16
28   2020-03-16
29   2020-03-16
30   2020-03-16
31   2020-03-16
32   2020-03-16
33   2020-03-16
34   2020-03-16
35   2020-03-16
36   2020-03-16
37   2020-03-16
38   2020-03-16
39   2020-03-16
40   2020-03-16
41   2020-03-16
42   2020-03-16
43   2020-03-16
44   2020-03-16
45   2020-03-16
46   2020-03-16
47   2020-03-16
48   2020-03-16
49   2020-03-16
50   2020-03-16
51   2020-03-16
52   2020-03-16
53   2020-03-16
54   2020-03-16
55   2020-03-16
56   2020-03-16
57   2020-03-16
Name: Date, dtype: datetime64[ns]

### US

In [None]:
states_code = {}
for i 

In [75]:
px.choropleth(country_data, 
              locations="iso", 
              color="ln(ConfirmedCases)", 
              hover_name="Country_Region", 
              hover_data=["ConfirmedCases", 'Fatalities'] ,
              animation_frame="Date",
              color_continuous_scale=px.colors.sequential.tempo, 
              title='Total ln(Confirmed Cases) growth World Map')

In [71]:
a, b = set(us_df), set(df['state'])
a - b

{'District of Columbia', 'Guam', 'Puerto Rico', 'Virgin Islands'}

In [72]:
b - a

set()

In [73]:
len(a)

54

In [74]:
len(b)

50