# Introduction

My goal is to predict the number of confirmed COVID-19 cases in Japan in April 2020.

# Import libraries

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import KFold
from sklearn.inspection import plot_partial_dependence, partial_dependence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_curve, roc_auc_score
import xgboost as xgb
from six import StringIO

# Import datasets

In [60]:
# Load the three CSV inputs in one pass: the enriched COVID-19 table,
# the test table and the example submission file.
data, data_test, data_example = (
    pd.read_csv(csv_name)
    for csv_name in ("enriched_covid19.csv", "test.csv", "submission.csv")
)

# Check the data

In [61]:
# Size of the loaded frame as (rows, columns).
data.shape

(18816, 38)

In [62]:
# Per-column summary: non-null counts and dtypes.
# In the visible part of the output only Province_State has missing values
# (8000 non-null out of 18816 rows), and Date is still a plain object column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18816 entries, 0 to 18815
Data columns (total 38 columns):
Id                18816 non-null int64
Country_Region    18816 non-null object
Province_State    8000 non-null object
Date              18816 non-null object
ConfirmedCases    18816 non-null float64
Fatalities        18816 non-null float64
age_0-4           18816 non-null float64
age_5-9           18816 non-null float64
age_10-14         18816 non-null float64
age_15-19         18816 non-null float64
age_20-24         18816 non-null float64
age_25-29         18816 non-null float64
age_30-34         18816 non-null float64
age_35-39         18816 non-null float64
age_40-44         18816 non-null float64
age_45-49         18816 non-null float64
age_50-54         18816 non-null float64
age_55-59         18816 non-null float64
age_60-64         18816 non-null float64
age_65-69         18816 non-null float64
age_70-74         18816 non-null float64
age_75-79         18816 non-null flo

In [63]:
# Preview the first five rows of the frame.
data.head()

Unnamed: 0,Id,Country_Region,Province_State,Date,ConfirmedCases,Fatalities,age_0-4,age_5-9,age_10-14,age_15-19,...,smokers_perc,density,urbanpop,hospibed,lung,femalelung,malelung,restrictions,quarantine,schools
0,1,Afghanistan,,2020-01-22,0.0,0.0,0.145717,0.139133,0.133376,0.118922,...,21.389448,60.0,25.0,0.5,37.62,36.31,39.33,0,0,0
1,2,Afghanistan,,2020-01-23,0.0,0.0,0.145717,0.139133,0.133376,0.118922,...,21.389448,60.0,25.0,0.5,37.62,36.31,39.33,0,0,0
2,3,Afghanistan,,2020-01-24,0.0,0.0,0.145717,0.139133,0.133376,0.118922,...,21.389448,60.0,25.0,0.5,37.62,36.31,39.33,0,0,0
3,4,Afghanistan,,2020-01-25,0.0,0.0,0.145717,0.139133,0.133376,0.118922,...,21.389448,60.0,25.0,0.5,37.62,36.31,39.33,0,0,0
4,5,Afghanistan,,2020-01-26,0.0,0.0,0.145717,0.139133,0.133376,0.118922,...,21.389448,60.0,25.0,0.5,37.62,36.31,39.33,0,0,0


In [64]:
# Preview the last five rows of the frame.
data.tail()

Unnamed: 0,Id,Country_Region,Province_State,Date,ConfirmedCases,Fatalities,age_0-4,age_5-9,age_10-14,age_15-19,...,smokers_perc,density,urbanpop,hospibed,lung,femalelung,malelung,restrictions,quarantine,schools
18811,29360,Zimbabwe,,2020-03-21,3.0,0.0,0.141119,0.14874,0.129252,0.110267,...,15.8,95.0,67.0,2.7,20.61,16.065,26.7,0,0,0
18812,29361,Zimbabwe,,2020-03-22,3.0,0.0,0.141119,0.14874,0.129252,0.110267,...,15.8,95.0,67.0,2.7,20.61,16.065,26.7,0,0,0
18813,29362,Zimbabwe,,2020-03-23,3.0,1.0,0.141119,0.14874,0.129252,0.110267,...,15.8,95.0,67.0,2.7,20.61,16.065,26.7,0,0,0
18814,29363,Zimbabwe,,2020-03-24,3.0,1.0,0.141119,0.14874,0.129252,0.110267,...,15.8,95.0,67.0,2.7,20.61,16.065,26.7,0,0,0
18815,29364,Zimbabwe,,2020-03-25,3.0,1.0,0.141119,0.14874,0.129252,0.110267,...,15.8,95.0,67.0,2.7,20.61,16.065,26.7,0,0,0


# Preprocess the data

In [65]:
# Use the Date column as the row index.
# Date is not parsed here: it keeps the string values read from the CSV.
# The guard keeps this cell re-runnable. After the first run "Date" is the
# index and no longer a column, so calling set_index("Date") a second time
# would raise a KeyError.
if "Date" in data.columns:
    data = data.set_index("Date")

In [66]:
# Number of rows per Country_Region label.
# In the visible output every label has 64 rows, and provinces appear as
# "<country>_<province>" labels (for example "US_Oregon", "China_Henan").
data["Country_Region"].value_counts()

US_Rhode Island                     64
Mexico                              64
China_Hong Kong                     64
Belgium                             64
US_Colorado                         64
Chad                                64
Malaysia                            64
Angola                              64
Canada_Ontario                      64
Canada_Quebec                       64
Canada_Newfoundland and Labrador    64
Cuba                                64
United Kingdom_Montserrat           64
Papua New Guinea                    64
Vietnam                             64
Mali                                64
US_New Hampshire                    64
Saint Vincent and the Grenadines    64
Austria                             64
US_North Dakota                     64
China_Henan                         64
US_Oregon                           64
Niger                               64
Costa Rica                          64
Uruguay                             64
Bangladesh               

In [85]:
# Focus on the major countries
def _select_region(frame, region):
    """Return the rows of ``frame`` whose ``Country_Region`` equals ``region``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that has a ``Country_Region`` column.
    region : str
        Label to match. The comparison is an exact, case-sensitive equality
        test; no normalisation is applied.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Empty when the label does not occur in ``frame``.
    """
    subset = frame[frame["Country_Region"] == region]
    if subset.empty:
        # An empty selection raises no error on its own. Report it here so a
        # mistyped label does not go unnoticed until it shows up as a column
        # full of NaN in a table built from these slices.
        print("WARNING: no rows with Country_Region == {!r}".format(region))
    return subset


# One frame per region of interest; every variable name is kept so that cells
# relying on them keep working.
# NOTE(review): confirm that "South_Korea" is the label actually used in
# Country_Region. In the visible value_counts output the underscore only joins
# a country with a province (e.g. "US_Oregon"), so this label may match nothing.
US_California = _select_region(data, "US_California")
Italy = _select_region(data, "Italy")
France = _select_region(data, "France")
Spain = _select_region(data, "Spain")
Germany = _select_region(data, "Germany")
Portugal = _select_region(data, "Portugal")
Iran = _select_region(data, "Iran")
South_Korea = _select_region(data, "South_Korea")
Japan = _select_region(data, "Japan")
Singapore = _select_region(data, "Singapore")
Malaysia = _select_region(data, "Malaysia")

In [80]:
# Wide table of confirmed cases: one column per country, with rows aligned on
# the index of the per-country frames. Column order follows the list below.
Confirmed = pd.DataFrame({
    label: frame["ConfirmedCases"]
    for label, frame in [
        ("Italy", Italy),
        ("France", France),
        ("Spain", Spain),
        ("Germany", Germany),
        ("Portugal", Portugal),
        ("Iran", Iran),
        ("South_Korea", South_Korea),
        ("Japan", Japan),
        ("Singapore", Singapore),
        ("Malaysia", Malaysia),
    ]
})