In [1]:
# Load library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from plotly.subplots import make_subplots

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from math import sqrt

In [2]:
import os
for root, dirs, files in os.walk('/home/gridedge23/Swapnil/Notebook/Covid19_global_forecasting'):
    for filename in files:
        #print(os.path.join(dirname, filename))
        print(filename)

submission.csv
test.csv
Forecast_result.csv
Covid19_Forecasting_Challenge.ipynb
train.csv
Covid19_Forecasting_Challenge-checkpoint.ipynb


In [3]:
# Read the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission = pd.read_csv('submission.csv')

In [4]:
# Check data
print(train_df.shape)
train_df.tail()

(25979, 6)


Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
25974,35647,,Zimbabwe,2020-04-09,11.0,3.0
25975,35648,,Zimbabwe,2020-04-10,13.0,3.0
25976,35649,,Zimbabwe,2020-04-11,14.0,3.0
25977,35650,,Zimbabwe,2020-04-12,14.0,3.0
25978,35651,,Zimbabwe,2020-04-13,17.0,3.0


In [5]:
print(test_df.shape)
test_df.head()

(13459, 4)


Unnamed: 0,ForecastId,Province_State,Country_Region,Date
0,1,,Afghanistan,2020-04-02
1,2,,Afghanistan,2020-04-03
2,3,,Afghanistan,2020-04-04
3,4,,Afghanistan,2020-04-05
4,5,,Afghanistan,2020-04-06


In [6]:
# Check for null values
train_df.isna().sum()

Id                    0
Province_State    14940
Country_Region        0
Date                  0
ConfirmedCases        0
Fatalities            0
dtype: int64

In [7]:
# Check for null values
test_df.isna().sum()

ForecastId           0
Province_State    7740
Country_Region       0
Date                 0
dtype: int64

In [8]:
train_df['Province_State'].unique()

array([nan, 'Australian Capital Territory', 'New South Wales',
       'Northern Territory', 'Queensland', 'South Australia', 'Tasmania',
       'Victoria', 'Western Australia', 'Alberta', 'British Columbia',
       'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
       'Northwest Territories', 'Nova Scotia', 'Ontario',
       'Prince Edward Island', 'Quebec', 'Saskatchewan', 'Yukon', 'Anhui',
       'Beijing', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong', 'Guangxi',
       'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan', 'Hong Kong',
       'Hubei', 'Hunan', 'Inner Mongolia', 'Jiangsu', 'Jiangxi', 'Jilin',
       'Liaoning', 'Macau', 'Ningxia', 'Qinghai', 'Shaanxi', 'Shandong',
       'Shanghai', 'Shanxi', 'Sichuan', 'Tianjin', 'Tibet', 'Xinjiang',
       'Yunnan', 'Zhejiang', 'Faroe Islands', 'Greenland',
       'French Guiana', 'French Polynesia', 'Guadeloupe', 'Martinique',
       'Mayotte', 'New Caledonia', 'Reunion', 'Saint Barthelemy',
       'Saint Pierre and Miqu

In [9]:
# Combining two data frame
all_data = pd.concat([train_df,test_df],axis=0,sort=False)
#all_data.tail()

# Fill Nan Values
all_data['Province_State'].fillna("None", inplace=True)
all_data['ConfirmedCases'].fillna(0, inplace=True)
all_data['Fatalities'].fillna(0, inplace=True)
all_data['Id'].fillna(-1, inplace=True)
all_data['ForecastId'].fillna(-1, inplace=True)


In [10]:
all_data.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,ForecastId
0,1.0,,Afghanistan,2020-01-22,0.0,0.0,-1.0
1,2.0,,Afghanistan,2020-01-23,0.0,0.0,-1.0
2,3.0,,Afghanistan,2020-01-24,0.0,0.0,-1.0
3,4.0,,Afghanistan,2020-01-25,0.0,0.0,-1.0
4,5.0,,Afghanistan,2020-01-26,0.0,0.0,-1.0


In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#all_data['Province_State'] = le.fit_transform(all_data['Province_State'])
#all_data['Country_Region'] = le.fit_transform(all_data['Country_Region'])

all_data['Date'] = pd.to_datetime(all_data['Date'])
all_data['Day_num'] = le.fit_transform(all_data.Date)
all_data['Day'] = all_data['Date'].dt.day
all_data['Month'] = all_data['Date'].dt.month
all_data['Year'] = all_data['Date'].dt.year



In [12]:
all_data.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,ForecastId,Day_num,Day,Month,Year
0,1.0,,Afghanistan,2020-01-22,0.0,0.0,-1.0,0,22,1,2020
1,2.0,,Afghanistan,2020-01-23,0.0,0.0,-1.0,1,23,1,2020
2,3.0,,Afghanistan,2020-01-24,0.0,0.0,-1.0,2,24,1,2020
3,4.0,,Afghanistan,2020-01-25,0.0,0.0,-1.0,3,25,1,2020
4,5.0,,Afghanistan,2020-01-26,0.0,0.0,-1.0,4,26,1,2020


In [30]:
# Create train and test data
train = all_data[all_data['ForecastId']==-1.0]
test = all_data[all_data['ForecastId']!=-1.0]

In [31]:
print(train.shape)
train.head()

(25979, 11)


Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,ForecastId,Day_num,Day,Month,Year
0,1.0,,Afghanistan,2020-01-22,0.0,0.0,-1.0,0,22,1,2020
1,2.0,,Afghanistan,2020-01-23,0.0,0.0,-1.0,1,23,1,2020
2,3.0,,Afghanistan,2020-01-24,0.0,0.0,-1.0,2,24,1,2020
3,4.0,,Afghanistan,2020-01-25,0.0,0.0,-1.0,3,25,1,2020
4,5.0,,Afghanistan,2020-01-26,0.0,0.0,-1.0,4,26,1,2020


In [94]:
# Total cases over the world 
temp = train.groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['ConfirmedCases', 'Fatalities'],
                 var_name='case', value_name='count')

fig = px.line(temp, x="Date", y="count", color='case',
              title='Total cases over the Date ', color_discrete_sequence = ['cyan', 'red'])
fig.show()

In [90]:
# Maximum confirmed and fatalities case on 2020-04-13
country_max = train.groupby(['Date','Country_Region'])['ConfirmedCases', 'Fatalities'].max().reset_index().sort_values(by='ConfirmedCases',ascending=False).groupby('Country_Region').max().reset_index().sort_values(by='ConfirmedCases',ascending=False)
country_max[:20].style.background_gradient(cmap='viridis_r')

Unnamed: 0,Country_Region,Date,ConfirmedCases,Fatalities
171,US,2020-04-13 00:00:00,195749,10058
156,Spain,2020-04-13 00:00:00,170099,17756
84,Italy,2020-04-13 00:00:00,159516,20465
61,France,2020-04-13 00:00:00,136779,14967
65,Germany,2020-04-13 00:00:00,130072,3194
175,United Kingdom,2020-04-13 00:00:00,88621,11329
80,Iran,2020-04-13 00:00:00,73303,4585
36,China,2020-04-13 00:00:00,67803,3221
170,Turkey,2020-04-13 00:00:00,61049,1296
16,Belgium,2020-04-13 00:00:00,30589,3903


In [32]:
# Getting Top country cases 
Top_country = train.groupby('Country_Region')['ConfirmedCases','Fatalities'].max().reset_index().sort_values(by='ConfirmedCases',ascending=False).head(15)

# confirmed - deaths
fig_c = px.bar(Top_country.sort_values('ConfirmedCases'), x="ConfirmedCases", y="Country_Region", 
               text='ConfirmedCases', orientation='h', color_discrete_sequence = ['cyan'])

fig_d = px.bar(Top_country.sort_values('Fatalities'), x="Fatalities", y="Country_Region", 
               text='Fatalities', orientation='h', color_discrete_sequence = ['red'])


fig = make_subplots(rows=1, cols=2, shared_xaxes=False, horizontal_spacing=0.14, vertical_spacing=0.08,
                    subplot_titles=('Confirmedcases', 'Fatalities'))

fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)


In [64]:
# Rise of Confirmed Cases around top 10 countries

countries = Top_country.Country_Region.unique().tolist()
df_plot = train.loc[(train.Country_Region.isin(countries[0:10])) & (train.Date >= '2020-03-01')][['Date', 'Country_Region', 'ConfirmedCases', 'Fatalities']].groupby(['Date', 'Country_Region']).max().reset_index()
df_plot = df_plot.groupby(['Date', 'Country_Region']).sum().sort_values(by='ConfirmedCases', ascending=False).reset_index()
#df_plot

fig = px.bar(df_plot, x="Date", y="ConfirmedCases", color="Country_Region", barmode="stack",)
fig.update_layout(title='Rise of Confirmed Cases around top 10 countries', annotations=[dict(x='2020-03-22', y=150, xref="x", yref="y", text="Corona Rise exponentially from here", showarrow=True, arrowhead=1, ax=-150, ay=-150)])
fig.show()


In [65]:
# Dsitribution over the world

formated_gdf = train.groupby(['Date', 'Country_Region'])['ConfirmedCases', 'Fatalities'].max().reset_index()
formated_gdf['Date'] = formated_gdf['Date'].dt.strftime('%m/%d/%Y')
#formated_gdf['size'] = formated_gdf['ConfirmedCases'].pow(0.3)

fig = px.scatter_geo(formated_gdf, locations="Country_Region", locationmode='country names', 
                     color="ConfirmedCases", hover_name="Country_Region", 
                     range_color= [0, 1500], 
                     projection="natural earth", animation_frame="Date", 
                     title='COVID-19: Spread Over World', color_continuous_scale="portland")

fig.show()

In [18]:
# Apply label encoding
train['Province_State'] = le.fit_transform(train['Province_State'])
train['Country_Region'] = le.fit_transform(train['Country_Region'])

test['Province_State'] = le.fit_transform(test['Province_State'])
test['Country_Region'] = le.fit_transform(test['Country_Region'])

# Creating train data
X = train.drop(columns=['Id','ConfirmedCases','Fatalities','Date','ForecastId'],axis=1)
cases = train.ConfirmedCases
fatalities = train.Fatalities

x_test = test.drop(columns=['Id','ConfirmedCases','Fatalities','Date','ForecastId'],axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [19]:
model = XGBRegressor(n_estimators = 1000 , random_state = 0 , max_depth = 15)
model.fit(X,cases)
cases_pred = model.predict(x_test)

model1 = XGBRegressor(n_estimators = 1000 , random_state = 0 , max_depth = 15)
model1.fit(X,fatalities)
fatalities_pred = model1.predict(x_test)



Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version





In [21]:
model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=15, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [22]:
# Getting Accuracy value
MSE = mean_squared_error(cases.iloc[0:13459],cases_pred)
RMSE = sqrt(mean_squared_error(cases.iloc[0:13459],cases_pred))
MAE = mean_absolute_error(cases.iloc[0:13459],cases_pred)
R2 = r2_score(cases.iloc[0:13459],cases_pred)

print('Mean squared error :', MSE)
print('Root mean squared error :',RMSE)
print('Mean absolute error :', MAE)
print('R squared :',R2)

Mean squared error : 552617901.9467754
Root mean squared error : 23507.826397750505
Mean absolute error : 6668.169310175872
R squared : -6.398942469416935


In [127]:
x_test.shape,test_df.shape

((13459, 6), (13459, 4))

In [23]:
# Predicted Result
test_df_predict = test_df.copy()
test_df_predict['Confirmedcase'] = cases_pred
test_df_predict['Fatalities'] = fatalities_pred
test_df_predict = test_df_predict.drop('Province_State',axis=1)
test_df_predict.to_csv('Forecast_result.csv')

In [134]:
test_df_predict.head(15)

Unnamed: 0,ForecastId,Country_Region,Date,Confirmedcase,Fatalities
0,1,Afghanistan,2020-04-02,272.994507,5.99653
1,2,Afghanistan,2020-04-03,281.001801,6.000327
2,3,Afghanistan,2020-04-04,299.004059,7.001165
3,4,Afghanistan,2020-04-05,349.000305,6.997691
4,5,Afghanistan,2020-04-06,366.999847,11.002952
5,6,Afghanistan,2020-04-07,422.998779,13.998834
6,7,Afghanistan,2020-04-08,444.000305,13.998931
7,8,Afghanistan,2020-04-09,483.999359,15.00174
8,9,Afghanistan,2020-04-10,521.000732,15.000447
9,10,Afghanistan,2020-04-11,554.998596,17.999535


In [124]:
US = test_df_predict[test_df_predict['Country_Region']=='US']
US.groupby('Date')['Confirmedcase','Fatalities'].sum().reset_index()

Unnamed: 0,Date,Confirmedcase,Fatalities
0,2020-04-02,243440.96875,5921.984863
1,2020-04-03,275425.96875,7083.00293
2,2020-04-04,308693.03125,8402.994141
3,2020-04-05,336911.96875,9615.022461
4,2020-04-06,366507.03125,10778.999023
5,2020-04-07,396063.125,12716.993164
6,2020-04-08,428888.9375,14690.001953
7,2020-04-09,461273.9375,16472.986328
8,2020-04-10,496372.1875,18581.021484
9,2020-04-11,526232.875,20457.986328


In [120]:
# Forecasting Comparison by date
temp = test_df_predict.groupby('Date')['Confirmedcase', 'Fatalities'].max().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Confirmedcase', 'Fatalities'],
                 var_name='case', value_name='count')

fig = px.area(temp, x="Date", y="count", color='case',
              title='Forecasting cases over the Date ', color_discrete_sequence = ['cyan', 'red'])
fig.show()

In [119]:
test_df_predict.groupby(['Date','Country_Region'])['Confirmedcase','Fatalities'].max().reset_index().head(10)
                                                                                                    

#test_df_predict.sort_values(by='Confirmedcase',ascending=False).head(10)

Unnamed: 0,Date,Country_Region,Confirmedcase,Fatalities
0,2020-04-02,Afghanistan,272.994507,5.99653
1,2020-04-02,Albania,277.005341,16.003819
2,2020-04-02,Algeria,986.005493,85.998611
3,2020-04-02,Andorra,427.980103,15.007656
4,2020-04-02,Angola,8.00996,1.999878
5,2020-04-02,Antigua and Barbuda,9.011209,0.000715
6,2020-04-02,Argentina,1132.979614,35.988228
7,2020-04-02,Armenia,663.017517,6.999049
8,2020-04-02,Australia,2298.006836,10.001014
9,2020-04-02,Austria,11129.036133,157.995422


In [102]:
test_df_predict.groupby('Country_Region')['Confirmedcase', 'Fatalities'].sum().reset_index().sort_values(by='Confirmedcase',ascending=False).head(15)


Unnamed: 0,Country_Region,Confirmedcase,Fatalities
171,US,26233700.0,899603.9375
156,Spain,7000430.0,721846.3125
84,Italy,6600535.0,842538.4375
61,France,5553830.0,590999.75
65,Germany,5351928.0,124925.53125
36,China,3573037.0,143722.765625
175,United Kingdom,3505410.0,435058.8125
80,Iran,3027100.0,189006.0
170,Turkey,2344600.0,49727.351562
16,Belgium,1224812.0,148175.0


In [103]:
# Top 10 forecast result
top_country = test_df_predict.groupby('Country_Region')['Confirmedcase', 'Fatalities'].max().reset_index().sort_values(by='Confirmedcase',ascending=False).head(15)

# confirmed - Fatalities
fig_c = px.bar(top_country.sort_values('Confirmedcase'), x="Confirmedcase", y="Country_Region", 
               text='Confirmedcase', orientation='h', color_discrete_sequence = ['cyan'])

fig_d = px.bar(top_country.sort_values('Fatalities'), x="Fatalities", y="Country_Region", 
               text='Fatalities', orientation='h', color_discrete_sequence = ['red'])


fig = make_subplots(rows=1, cols=2, shared_xaxes=False, horizontal_spacing=0.14, vertical_spacing=0.08,
                    subplot_titles=('Confirmedcase', 'Fatalities'),)

fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)

In [118]:
countries = top_country.Country_Region.unique().tolist()
df_plot = test_df_predict.loc[(test_df_predict.Country_Region.isin(countries[0:10])) & (test_df_predict.Date >= '2020-04-02')] [['Date', 'Country_Region', 'Confirmedcase', 'Fatalities']].groupby(['Date', 'Country_Region']).max().reset_index()
df_plot = df_plot.groupby(['Date', 'Country_Region']).sum().sort_values(by='Confirmedcase', ascending=False).reset_index()
#df_plot

fig = px.bar(df_plot, x="Date", y="Confirmedcase", color="Country_Region", barmode="stack",)
fig.update_layout(title='Top 10 countries Confirmedcase')
fig.show()


In [117]:
countries = top_country.Country_Region.unique().tolist()
df_plot = test_df_predict.loc[(test_df_predict.Country_Region.isin(countries[0:10])) & (test_df_predict.Date >= '2020-04-02')] [['Date', 'Country_Region', 'Confirmedcase', 'Fatalities']].groupby(['Date', 'Country_Region']).max().reset_index()
df_plot = df_plot.groupby(['Date', 'Country_Region']).sum().reset_index()
#df_plot

fig = px.bar(df_plot, x="Date", y="Fatalities", color="Country_Region", barmode="stack")
fig.update_layout(title='Top 10 countries Fatalities')
fig.show()


In [240]:
# Appending result to submission file
cases_pred = [round(value) for value in cases_pred ]
fatalities_pred = [round(value) for value in fatalities_pred ]

submission['ConfirmedCases'] = cases_pred
submission['Fatalities'] = fatalities_pred
submission.to_csv('submission.csv')