In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = 'plotly_dark'
from plotly.subplots import make_subplots
from datetime import datetime


In [7]:
train = pd.read_csv('train.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])
cleaned = pd.read_csv('covid_19_clean_complete.csv', parse_dates=['Date'])

In [3]:
train.head()

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [8]:
cleaned.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Thailand,15.0,101.0,2020-01-22,2.0,0.0,0.0
1,,Japan,36.0,138.0,2020-01-22,2.0,0.0,0.0
2,,Singapore,1.2833,103.8333,2020-01-22,0.0,0.0,0.0
3,,Nepal,28.1667,84.25,2020-01-22,0.0,0.0,0.0
4,,Malaysia,2.5,112.5,2020-01-22,0.0,0.0,0.0


In [9]:
train.rename(columns={
    'Id': 'id',
    'Province/State': 'province',
    'Country/Region': 'country',
    'Lat': 'lat', 
    'Long': 'long',
    'Date': 'date',
    'ConfirmedCases': 'confirmed',
    'Fatalities': 'deaths'
}, inplace=True)

cleaned.rename(columns={
    'Id': 'id',
    'Province/State': 'province',
    'Country/Region': 'country',
    'Lat': 'lat',
    'Long': 'long',
    'Date': 'date',
    'Confirmed': 'confirmed',
    'Deaths': 'deaths',
    'Recovered': 'recovered'
}, inplace=True)

In [10]:
train.head()

Unnamed: 0,id,province,country,lat,long,date,confirmed,deaths
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [11]:
cleaned.head()

Unnamed: 0,province,country,lat,long,date,confirmed,deaths,recovered
0,,Thailand,15.0,101.0,2020-01-22,2.0,0.0,0.0
1,,Japan,36.0,138.0,2020-01-22,2.0,0.0,0.0
2,,Singapore,1.2833,103.8333,2020-01-22,0.0,0.0,0.0
3,,Nepal,28.1667,84.25,2020-01-22,0.0,0.0,0.0
4,,Malaysia,2.5,112.5,2020-01-22,0.0,0.0,0.0


In [32]:
train.drop(['id'], axis=1)

train['province'] = train['province'].fillna(' ')
train['recovered'] = cleaned['recovered']
train['active'] = train['confirmed'] - train['deaths'] - train['recovered']
case_group = ['confirmed', 'active', 'deaths', 'recovered']
train[case_group] = train[case_group].fillna(0)

In [19]:
train.head()

Unnamed: 0,id,province,country,lat,long,date,confirmed,deaths,active,recovered
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0,0.0,0.0


In [20]:
train['country'].nunique()

163

In [21]:
train[train['province'] != ' ']['country'].unique()

array(['Australia', 'Canada', 'China', 'Cruise Ship', 'Denmark', 'France',
       'Netherlands', 'US', 'United Kingdom'], dtype=object)

In [None]:
total_confirmed = train.groupby('date')['date', 'confirmed', 'deaths'].sum().reset_index()
china_total = train[train['country'] == 'China'].reset_index()
china_group = china_total.groupby('date')['date', 'confirmed', 'deaths'].sum().reset_index()
rest = train[~train['country'].isin(['China'])].reset_index()
rest_group = rest.groupby('date')['date', 'confirmed', 'deaths'].sum().reset_index()

In [None]:
china_group

In [None]:
rest_group

In [None]:
fig = px.line(total_confirmed, x='date', y='confirmed', title='World confirmed over time')
fig.show()

fig = px.line(rest_group, x='date', y='confirmed', title='Rest of world except china')
fig.show()

fig = px.line(china_group, x='date', y='confirmed', title='China')
fig.show()

In [22]:
world_total = train[train['date'] == max(train['date'])]
world_total_group = world_total.groupby('country')['confirmed', 'deaths'].sum().reset_index()

In [23]:
fig = px.bar(
    world_total_group.sort_values('confirmed', ascending=False)[:15][::-1], 
    x='confirmed', 
    y='country', 
    orientation='h', 
    text='confirmed', 
    title='World infection in order')

fig.show()

In [34]:
cleaned['active'] = cleaned['confirmed'] - cleaned['deaths'] - cleaned['recovered']

cleaned[case_group] = cleaned[case_group].fillna(0)

In [35]:
cases = cleaned.groupby('date')['confirmed', 'active', 'deaths', 'recovered'].sum().reset_index()

fig = make_subplots(rows=1, cols=4, subplot_titles=('Confirmed', 'Active', 'Deaths', 'Recovered'))

trace1 = go.Scatter(
    x=cases['date'],
    y=cases['confirmed'],
    name='Confirmed',
    line_color='blue',
    mode='lines+markers')
trace2 = go.Scatter(
    x=cases['date'],
    y=cases['active'],
    name='Active',
    line_color='red',
    mode='lines+markers')
trace3 = go.Scatter(
    x=cases['date'],
    y=cases['deaths'],
    name='Deaths',
    line_color='yellow',
    mode='lines+markers')
trace4 = go.Scatter(
    x=cases['date'],
    y=cases['recovered'],
    name='Recovered',
    line_color='green',
    mode='lines+markers')

fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig.append_trace(trace3, 1, 3)
fig.append_trace(trace4, 1, 4)

fig.update_layout(
    title='World Cases over time')

fig.show()


In [37]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV