In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from math import radians, cos, sin, asin, sqrt
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
locator = Nominatim(user_agent="myGeocoder",timeout=20)

In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

Read files

In [3]:
# Get the paths where all the files are stored
current_path = os.getcwd()
train_path = current_path + '/train.csv'
test_path = current_path + '/test.csv'

In [4]:
# read csv files
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print('shape of train data:',train_df.shape)
print('shape of test data:',test_df.shape)

shape of train data: (16756, 8)
shape of test data: (12212, 6)


In [5]:
train_df.head()

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [6]:
test_df['Date'].value_counts()

2020-03-22    284
2020-03-24    284
2020-03-28    284
2020-04-15    284
2020-03-21    284
2020-04-08    284
2020-04-13    284
2020-04-02    284
2020-03-12    284
2020-03-16    284
2020-03-23    284
2020-04-06    284
2020-04-05    284
2020-04-21    284
2020-04-11    284
2020-03-31    284
2020-04-14    284
2020-04-04    284
2020-04-07    284
2020-04-17    284
2020-04-19    284
2020-03-19    284
2020-04-10    284
2020-03-20    284
2020-03-17    284
2020-04-12    284
2020-04-03    284
2020-04-22    284
2020-04-20    284
2020-03-14    284
2020-03-25    284
2020-03-29    284
2020-03-30    284
2020-04-01    284
2020-04-23    284
2020-03-27    284
2020-04-09    284
2020-03-26    284
2020-03-13    284
2020-04-18    284
2020-03-18    284
2020-03-15    284
2020-04-16    284
Name: Date, dtype: int64

Perform data transformation

In [7]:
# Convert confirmed cases and fatalities columns to int as they are counts
train_df['ConfirmedCases'] = train_df['ConfirmedCases'].astype(int)
train_df['Fatalities'] = train_df['Fatalities'].astype(int)

In [8]:
# Convert string date time to datetime object
train_df['Modified_Date'] = pd.to_datetime(train_df['Date'])

In [9]:
# Extract month from datetime object
train_df["month"] = train_df['Modified_Date'].map(lambda x: x.month)

In [10]:
train_df

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month
0,1,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,2020-01-22,1
1,2,,Afghanistan,33.0000,65.0000,2020-01-23,0,0,2020-01-23,1
2,3,,Afghanistan,33.0000,65.0000,2020-01-24,0,0,2020-01-24,1
3,4,,Afghanistan,33.0000,65.0000,2020-01-25,0,0,2020-01-25,1
4,5,,Afghanistan,33.0000,65.0000,2020-01-26,0,0,2020-01-26,1
...,...,...,...,...,...,...,...,...,...,...
16751,26374,,Zambia,-15.4167,28.2833,2020-03-16,0,0,2020-03-16,3
16752,26375,,Zambia,-15.4167,28.2833,2020-03-17,0,0,2020-03-17,3
16753,26376,,Zambia,-15.4167,28.2833,2020-03-18,2,0,2020-03-18,3
16754,26377,,Zambia,-15.4167,28.2833,2020-03-19,2,0,2020-03-19,3


In [21]:
train_df.isnull().sum()

Id                   0
Province/State    9086
Country/Region       0
Lat                 59
Long                59
Date                 0
ConfirmedCases       0
Fatalities           0
Modified_Date        0
month                0
geom                 0
new_state         9086
dtype: int64

In [11]:
import ssl 

# Disable SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

In [12]:
# Concatenate latitude and longitude feature together
train_df["geom"] = train_df["Lat"].map(str) + ',' + train_df["Long"].map(str)

In [58]:
# Get new state for dataframe
train_df['new_state'] = train_df['Province/State']

In [14]:
# Get dataframe with null states
null_state_df = train_df[train_df['Province/State'].isnull()==True]
# get all the unique geom from null province dataframe
unique_geom = null_state_df['geom'].unique()

In [72]:
#Get the not null values for province/state
not_null_state_df = train_df[train_df['Province/State'].isnull()==False]
print("Shape of the not null states dataframe",not_null_state_df.shape)
print("Shape of the original dataframe",train_df.shape)
print("Null values of the not null states dataframe",not_null_state_df.isnull().sum())

Shape of the not null states dataframe (7670, 12)
Shape of the original dataframe (16756, 12)
Null values of the not null states dataframe Id                0
Province/State    0
Country/Region    0
Lat               0
Long              0
Date              0
ConfirmedCases    0
Fatalities        0
Modified_Date     0
month             0
geom              0
new_state         0
dtype: int64


In [56]:
not_null_state_df["new_state"]  

Index(['Id', 'Province/State', 'Country/Region', 'Lat', 'Long', 'Date',
       'ConfirmedCases', 'Fatalities', 'Modified_Date', 'month', 'geom',
       'new_state'],
      dtype='object')

In [15]:
train_df['geom'].nunique()

276

In [16]:
len(unique_geom)

153

In [17]:
# Get state from the given latitude and longitude
def get_state(coordinates):
    """
    Get states from the given coordinates.
    If the location obtained from the service is None or
    if the inputs are nan's then output is None
    
    Input Args:
    coordinates(str): String with format '<lat>,<long>'
    
    Output Args:
    state(str): String value consisting of state name or Nones
    """
    state = None
    if coordinates == 'nan,nan':
        return state
    location = locator.reverse(coordinates,language='en')
    # If the location is found from the geoenconder
    if location[0] is not None:
        state = location.raw['address'].get('state','')
    return state

In [25]:
# create dictionary of geom to new state using geoencoder
geom_to_state_dict = {}
for i,geom in enumerate(unique_geom):
    state = get_state(geom)
    # If state returned is empty string then default it to None
    if state == "":
        state = None
    geom_to_state_dict[geom] = state

In [26]:
geom_to_state_dict

{'33.0,65.0': 'Helmand',
 '41.1533,20.1683': 'Central Albania',
 '28.0339,1.6596': 'Adrar',
 '42.5063,1.5218': None,
 '17.0608,-61.7964': None,
 '-38.4161,-63.6167': 'La Pampa',
 '40.0691,45.0382': 'Ararat Province',
 'nan,nan': None,
 '47.5162,14.5501': 'Styria',
 '40.1431,47.5769': 'Aghjabadi District',
 '26.0275,50.55': 'Southern Governorate',
 '23.685,90.3563': 'Dhaka Division',
 '13.1939,-59.5432': None,
 '53.7098,27.9534': 'Minsk Region',
 '50.8333,4.0': 'Flanders',
 '9.3077,2.3158': 'Borgou',
 '27.5142,90.4336': 'Trongsa District',
 '-16.2902,-63.5887': 'Santa Cruz',
 '43.9159,17.6791': 'Federation of Bosnia and Herzegovina',
 '-14.235,-51.9253': 'Mato Grosso',
 '4.5353,114.7277': 'Tutong District',
 '42.7339,25.4858': 'Stara Zagora',
 '12.2383,-1.5616': None,
 '11.55,104.9167': 'Phnom Penh',
 '3.8480000000000003,11.5021': 'Centre',
 '6.6111,20.9394': 'Ouaka',
 '-35.6751,-71.543': 'Maule Region',
 '4.5709,-74.2973': 'Cundinamarca',
 '-4.0383,21.7587': 'Kasai',
 '9.7489,-83.7534'

In [27]:
null_state_df['new_state'] = null_state_df['geom'].apply(lambda x:geom_to_state_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
# Apply tansformations from dictionary to geom column
null_state_df

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month,geom,new_state
0,1,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,2020-01-22,1,"33.0,65.0",Helmand
1,2,,Afghanistan,33.0000,65.0000,2020-01-23,0,0,2020-01-23,1,"33.0,65.0",Helmand
2,3,,Afghanistan,33.0000,65.0000,2020-01-24,0,0,2020-01-24,1,"33.0,65.0",Helmand
3,4,,Afghanistan,33.0000,65.0000,2020-01-25,0,0,2020-01-25,1,"33.0,65.0",Helmand
4,5,,Afghanistan,33.0000,65.0000,2020-01-26,0,0,2020-01-26,1,"33.0,65.0",Helmand
...,...,...,...,...,...,...,...,...,...,...,...,...
16751,26374,,Zambia,-15.4167,28.2833,2020-03-16,0,0,2020-03-16,3,"-15.4167,28.2833",Lusaka Province
16752,26375,,Zambia,-15.4167,28.2833,2020-03-17,0,0,2020-03-17,3,"-15.4167,28.2833",Lusaka Province
16753,26376,,Zambia,-15.4167,28.2833,2020-03-18,2,0,2020-03-18,3,"-15.4167,28.2833",Lusaka Province
16754,26377,,Zambia,-15.4167,28.2833,2020-03-19,2,0,2020-03-19,3,"-15.4167,28.2833",Lusaka Province


In [52]:
null_state_df.isnull().sum()

Id                   0
Province/State    9086
Country/Region       0
Lat                 59
Long                59
Date                 0
ConfirmedCases       0
Fatalities           0
Modified_Date        0
month                0
geom                 0
new_state         1770
dtype: int64

In [44]:
null_state_df['new_state'].unique()

array(['Helmand', 'Central Albania', 'Adrar', None, 'La Pampa',
       'Ararat Province', 'Styria', 'Aghjabadi District',
       'Southern Governorate', 'Dhaka Division', 'Minsk Region',
       'Flanders', 'Borgou', 'Trongsa District', 'Santa Cruz',
       'Federation of Bosnia and Herzegovina', 'Mato Grosso',
       'Tutong District', 'Stara Zagora', 'Phnom Penh', 'Centre', 'Ouaka',
       'Maule Region', 'Cundinamarca', 'Kasai', 'Cartago Province',
       'Vallée du Bandama', 'Villa Clara', 'Cyprus', 'Southeast',
       'Tadjourah', 'Monte Plata', 'Morona Santiago',
       'New Valley Governorate', 'Litoral', 'Manzini', 'Oromia Region',
       'Mainland Finland', 'French Guiana', 'Ogooué-Lolo', 'Lower River',
       'Imereti', 'Hesse', 'Bono East Region',
       'Thessaly - Central Greece', 'Guadeloupe', 'Guam', 'Alta Verapaz',
       'Guernsey', 'Kankan', 'Upper Demerara-Berbice', 'Olancho',
       'Maharashtra', 'Central Kalimantan', 'Isfahan Province',
       'Al Anbar', 'South Di

In [73]:
# Merge the null and not null dataframe
train_df_2 = null_state_df.append(not_null_state_df)
print("Shape of the appended dataframe",train_df_2.shape)
print("Null values of the appended dataframe",train_df_2.isnull().sum())

Shape of the appended dataframe (16756, 12)
Null values of the appended dataframe Id                   0
Province/State    9086
Country/Region       0
Lat                 59
Long                59
Date                 0
ConfirmedCases       0
Fatalities           0
Modified_Date        0
month                0
geom                 0
new_state         1770
dtype: int64


In [79]:
# Remove the rows with null new_state columns
final_df = train_df_2[train_df_2['new_state'].isnull()==False]

In [81]:
final_df.isnull().sum()

Id                   0
Province/State    7316
Country/Region       0
Lat                  0
Long                 0
Date                 0
ConfirmedCases       0
Fatalities           0
Modified_Date        0
month                0
geom                 0
new_state            0
dtype: int64

In [82]:
unique_country = final_df['Country/Region'].unique()
unique_state = final_df['new_state'].unique()

In [83]:
# Create a dictionary to tag string columns to numbers for countries column
unique_country_to_index_dict = {}
index_to_unique_country_dict = {}
for i, val in enumerate(unique_country):
    unique_country_to_index_dict[val] = i
    index_to_unique_country_dict[i] = val

In [84]:
# Create a dictionary to tag string columns to numbers for state column
unique_state_to_index_dict = {}
index_to_unique_state_dict = {}
for i, val in enumerate(unique_state):
    unique_state_to_index_dict[val] = i
    index_to_unique_state_dict[i] = val

In [86]:
# Apply the transformations from dictionary to columns
final_df['country_index'] = final_df['Country/Region'].apply(lambda x:unique_country_to_index_dict[x])
final_df['state_index'] = final_df['new_state'].apply(lambda x:unique_state_to_index_dict[x])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [87]:
def single_pt_haversine(lat, lng, degrees=True):
    """
    'Single-point' Haversine: Calculates the great circle distance
    between a point on Earth and the (0, 0) lat-long coordinate
    """
    r = 6371 # Earth's radius (km). Have r = 3956 if you want miles

    # Convert decimal degrees to radians
    if degrees:
        lat, lng = map(radians, [lat, lng])

    # 'Single-point' Haversine formula
    a = sin(lat/2)**2 + cos(lat) * sin(lng/2)**2
    # Modifying this part and removing the scaling factor
    # d = 2 * r * asin(sqrt(a)) 
    d = asin(sqrt(a)) 

    return d

In [92]:
# Convert lat long feature to a single feature and fill nan values in the subsequent column
#train_df['lat-long']= [str(x) + '' + str(y) for x, y in zip(train_df['Lat'], train_df['Long'])]
final_df['harvesine-lat-long'] = [single_pt_haversine(x, y) for x, y in zip(final_df['Lat'], final_df['Long'])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [99]:
final_df

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month,geom,new_state,country_index,state_index,harvesine-lat-long
0,1,,Afghanistan,33.0000,65.000,2020-01-22,0,0,2020-01-22,1,"33.0,65.0",Helmand,0,0,0.604242
1,2,,Afghanistan,33.0000,65.000,2020-01-23,0,0,2020-01-23,1,"33.0,65.0",Helmand,0,0,0.604242
2,3,,Afghanistan,33.0000,65.000,2020-01-24,0,0,2020-01-24,1,"33.0,65.0",Helmand,0,0,0.604242
3,4,,Afghanistan,33.0000,65.000,2020-01-25,0,0,2020-01-25,1,"33.0,65.0",Helmand,0,0,0.604242
4,5,,Afghanistan,33.0000,65.000,2020-01-26,0,0,2020-01-26,1,"33.0,65.0",Helmand,0,0,0.604242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16456,25909,United Kingdom,United Kingdom,55.3781,-3.436,2020-03-16,1543,55,2020-03-16,3,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885
16457,25910,United Kingdom,United Kingdom,55.3781,-3.436,2020-03-17,1950,55,2020-03-17,3,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885
16458,25911,United Kingdom,United Kingdom,55.3781,-3.436,2020-03-18,2626,71,2020-03-18,3,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885
16459,25912,United Kingdom,United Kingdom,55.3781,-3.436,2020-03-19,2689,137,2020-03-19,3,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885


In [93]:
final_df['harvesine-lat-long'].unique()

array([0.60424194, 0.39292762, 0.24503553, 0.60758611, 0.49972456,
       0.42915248, 0.51449234, 0.48155829, 0.78824556, 0.51031987,
       0.44459561, 0.08367982, 0.78875406, 0.56483729, 0.40717739,
       0.46505044, 1.00046728, 0.42296247, 0.91287568, 0.10577109,
       0.19123107, 0.65534665, 0.64881328, 0.19296686, 0.73167655,
       0.08160416, 0.70454435, 0.40975852, 0.44985377, 0.38307348,
       0.62181112, 0.6823323 , 0.33935877, 0.08823287, 0.35128376,
       0.36071964, 0.58290544, 0.46449317, 0.10155028, 0.1768694 ,
       0.50157571, 0.45002397, 0.06991513, 0.38300007, 0.5479141 ,
       1.24464696, 0.78733633, 0.43196613, 0.12091711, 0.51384422,
       0.75375142, 0.68772704, 0.99412984, 0.51759178, 0.46158872,
       0.39619164, 0.38682008, 1.10790965, 0.40657689, 0.65276282,
       0.33079405, 1.04609009, 0.47283588, 0.68589706, 0.52544774,
       0.41625477, 0.09944621, 0.5106525 , 0.9815506 , 0.63920662,
       0.33460888, 0.54148401, 0.52114042, 0.40640765, 0.88562

In [94]:
final_df[final_df['harvesine-lat-long'].isnull()==True]

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month,geom,new_state,country_index,state_index,harvesine-lat-long


In [95]:
# Extract data based on months
month1_df = final_df[final_df['month'] == 1]
month2_df = final_df[final_df['month'] == 2]
month3_df = final_df[final_df['month'] == 3]

In [96]:
month1_df

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month,geom,new_state,country_index,state_index,harvesine-lat-long
0,1,,Afghanistan,33.0000,65.000,2020-01-22,0,0,2020-01-22,1,"33.0,65.0",Helmand,0,0,0.604242
1,2,,Afghanistan,33.0000,65.000,2020-01-23,0,0,2020-01-23,1,"33.0,65.0",Helmand,0,0,0.604242
2,3,,Afghanistan,33.0000,65.000,2020-01-24,0,0,2020-01-24,1,"33.0,65.0",Helmand,0,0,0.604242
3,4,,Afghanistan,33.0000,65.000,2020-01-25,0,0,2020-01-25,1,"33.0,65.0",Helmand,0,0,0.604242
4,5,,Afghanistan,33.0000,65.000,2020-01-26,0,0,2020-01-26,1,"33.0,65.0",Helmand,0,0,0.604242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16407,25860,United Kingdom,United Kingdom,55.3781,-3.436,2020-01-27,0,0,2020-01-27,1,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885
16408,25861,United Kingdom,United Kingdom,55.3781,-3.436,2020-01-28,0,0,2020-01-28,1,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885
16409,25862,United Kingdom,United Kingdom,55.3781,-3.436,2020-01-29,0,0,2020-01-29,1,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885
16410,25863,United Kingdom,United Kingdom,55.3781,-3.436,2020-01-30,0,0,2020-01-30,1,"55.3781,-3.4360000000000004",United Kingdom,132,245,0.483885


In [97]:
month3_df[month3_df['ConfirmedCases']>0]['Country/Region'].value_counts()

China                       660
US                          585
Australia                   140
Canada                      126
France                       91
United Kingdom               59
Denmark                      37
Netherlands                  35
Korea, South                 20
Nepal                        20
New Zealand                  20
Russia                       20
Cambodia                     20
Thailand                     20
India                        20
Finland                      20
Georgia                      20
Cruise Ship                  20
Czechia                      20
Qatar                        20
Belarus                      20
Afghanistan                  20
Greece                       20
Vietnam                      20
Belgium                      20
Spain                        20
Switzerland                  20
Italy                        20
Algeria                      20
Nigeria                      20
Oman                         20
Azerbaij

In [98]:
month1_df['Country/Region'].value_counts()

US                          580
China                       330
Canada                      110
Australia                    90
France                       80
United Kingdom               50
Netherlands                  30
Denmark                      20
Uruguay                      10
Serbia                       10
North Macedonia              10
Guatemala                    10
The Gambia                   10
Nigeria                      10
Lebanon                      10
Portugal                     10
Ecuador                      10
French Guiana                10
Israel                       10
Japan                        10
Chile                        10
Tanzania                     10
India                        10
Moldova                      10
Jordan                       10
Saint Lucia                  10
Ghana                        10
Finland                      10
Tunisia                      10
Maldives                     10
Guadeloupe                   10
Somalia 