In [102]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
import warnings
import time
import statsmodels.api as sm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Read the workbook and replace all missing cells with 0 in a single chain,
# then preview the first rows.
COVID = pd.read_excel('./owid-covid-data.xlsx').fillna(0)
COVID.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0


## 한국 기준

In [103]:
# Rows of COVID whose `location` equals 'South Korea'.
kor_rows = COVID['location'].eq('South Korea')
COVID_KOR = COVID.loc[kor_rows]

In [104]:
# Columns used as model inputs for the South Korea subset.
kor_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_KOR[kor_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_KOR['new_deaths']

In [105]:
# Fit a seeded gradient-boosting regressor on all of X, y (no hold-out here).
# fit() returns the estimator itself, so construction and training are chained.
kor_gb_reg = GradientBoostingRegressor(random_state=13).fit(X, y)
kor_gb_reg

GradientBoostingRegressor(random_state=13)

## GBM의 score(결정계수 R²) 확인

In [106]:
# Hold out 20% of the rows; the split itself is seeded.
# NOTE(review): train_test_split shuffles by default, so rows from neighbouring
# dates end up on both sides of the split — confirm a random split is intended
# for this dated data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Seed the estimator as well as the split so the two scores printed below are
# reproducible from a fresh kernel (the original estimator was unseeded).
model = GradientBoostingRegressor(random_state=13)
model.fit(X_train, y_train)

# score() on a regressor is R^2: training split first, then the held-out split.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.9945211611775414
0.8649495555583884


## RMSE값 확인

In [108]:
# RMSE comparison of three regressors. Every model is scored on the same X, y
# it was fitted on, so these are in-sample errors, not generalisation errors.

# Gradient boosting (kor_gb_reg was fitted on X, y in an earlier cell);
# predictions are rounded to whole numbers before scoring.
kor_pred_GBM = kor_gb_reg.predict(X).round(0)
GBM_rmse = np.sqrt(mean_squared_error(y, kor_pred_GBM))

# Ordinary least squares. X is passed as-is (no constant column is added),
# so the fitted model has no intercept term.
lm = sm.OLS(y, X).fit()
kor_pred_lm = lm.predict(X).round(0)
LM_rmse = np.sqrt(mean_squared_error(y, kor_pred_lm))

# XGBoost. 'reg:squarederror' is the current name of the squared-error
# objective; the previous spelling 'reg:linear' is deprecated.
xg_reg = XGBRegressor(objective='reg:squarederror',
                      colsample_bytree=0.3,
                      learning_rate=0.1,
                      max_depth=5,
                      alpha=10,
                      n_estimators=10)

xg_reg.fit(X, y)
kor_pred_XGB = xg_reg.predict(X)
# These are South Korea predictions; the original cell bound them to a
# Europe-prefixed name. The old name stays bound to the same array in case a
# later cell still refers to it.
eur_pred_XGB = kor_pred_XGB
# NOTE(review): unlike the two models above, the XGBoost predictions are not
# rounded before scoring — confirm whether that asymmetry is intended.
XGB_rmse = np.sqrt(mean_squared_error(y, kor_pred_XGB))

print('GBM_rmse : ', GBM_rmse)
print('LM_rmse : ', LM_rmse)
print('XGB_rmse : ', XGB_rmse)

GBM_rmse :  5.697631715011469
LM_rmse :  19.28894102389732
XGB_rmse :  31.50831104975677


## 학습 데이터 기준 RMSE 값이 가장 낮은 GBM을 모델로 선정했다.

In [97]:
# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_KOR = COVID_KOR[['date', 'location', 'new_deaths']].copy()
COVID_KOR['predict_death'] = kor_pred_GBM

# A negative death count is not meaningful: floor the predictions at zero,
# touching only the prediction column.
COVID_KOR.loc[COVID_KOR['predict_death'] <= 0, 'predict_death'] = 0

## 미국 기준

In [110]:
# Rows of COVID whose `iso_code` equals 'USA'.
usa_rows = COVID['iso_code'].eq('USA')
COVID_USA = COVID.loc[usa_rows]

In [112]:
# Columns used as model inputs for the USA subset.
usa_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_USA[usa_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_USA['new_deaths']

In [113]:
# Fit a seeded gradient-boosting regressor on the full USA feature matrix.
usa_gb_reg = GradientBoostingRegressor(random_state=13)
usa_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
usa_pred_GBM = usa_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_USA = COVID_USA[['date', 'location', 'new_deaths']].copy()
COVID_USA['predict_death'] = usa_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_USA.loc[COVID_USA['predict_death'] <= 0, 'predict_death'] = 0

## 일본 기준

In [125]:
# Rows of COVID whose `iso_code` equals 'JPN'.
jpn_rows = COVID['iso_code'].eq('JPN')
COVID_JPN = COVID.loc[jpn_rows]

In [126]:
# Columns used as model inputs for the Japan subset.
jpn_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_JPN[jpn_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_JPN['new_deaths']

In [127]:
# Fit a seeded gradient-boosting regressor on the full Japan feature matrix.
jpn_gb_reg = GradientBoostingRegressor(random_state=13)
jpn_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
jpn_pred_GBM = jpn_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_JPN = COVID_JPN[['date', 'location', 'new_deaths']].copy()
COVID_JPN['predict_death'] = jpn_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_JPN.loc[COVID_JPN['predict_death'] <= 0, 'predict_death'] = 0

## 유럽 기준

In [128]:
# Rows of COVID whose `location` equals 'Europe'.
eur_rows = COVID['location'].eq('Europe')
COVID_EUR = COVID.loc[eur_rows]

In [129]:
# Columns used as model inputs for the Europe subset.
eur_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_EUR[eur_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_EUR['new_deaths']

In [130]:
# Fit a seeded gradient-boosting regressor on the full Europe feature matrix.
eur_gb_reg = GradientBoostingRegressor(random_state=13)
eur_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
eur_pred_GBM = eur_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_EUR = COVID_EUR[['date', 'location', 'new_deaths']].copy()
COVID_EUR['predict_death'] = eur_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_EUR.loc[COVID_EUR['predict_death'] <= 0, 'predict_death'] = 0

## 아시아 기준

In [132]:
# Rows of COVID whose `location` equals 'Asia'.
asia_rows = COVID['location'].eq('Asia')
COVID_ASIA = COVID.loc[asia_rows]

In [133]:
# Columns used as model inputs for the Asia subset.
asia_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_ASIA[asia_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_ASIA['new_deaths']

In [134]:
# Fit a seeded gradient-boosting regressor on the full Asia feature matrix.
asia_gb_reg = GradientBoostingRegressor(random_state=13)
asia_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
asia_pred_GBM = asia_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_ASIA = COVID_ASIA[['date', 'location', 'new_deaths']].copy()
COVID_ASIA['predict_death'] = asia_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_ASIA.loc[COVID_ASIA['predict_death'] <= 0, 'predict_death'] = 0

## (소득) 3분위 중 1분위 기준: 고소득 국가(OWID_HIC)

In [136]:
# Rows of COVID whose `iso_code` equals 'OWID_HIC'.
hic_rows = COVID['iso_code'].eq('OWID_HIC')
COVID_HIC = COVID.loc[hic_rows]

In [137]:
# Columns used as model inputs for the OWID_HIC subset.
hic_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_HIC[hic_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_HIC['new_deaths']

In [138]:
# Fit a seeded gradient-boosting regressor on the full OWID_HIC feature matrix.
hic_gb_reg = GradientBoostingRegressor(random_state=13)
hic_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
hic_pred_GBM = hic_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_HIC = COVID_HIC[['date', 'location', 'new_deaths']].copy()
COVID_HIC['predict_death'] = hic_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_HIC.loc[COVID_HIC['predict_death'] <= 0, 'predict_death'] = 0

## (소득) 3분위 중 2분위 기준: 중상위소득 국가(OWID_UMC)

In [142]:
# Rows of COVID whose `iso_code` equals 'OWID_UMC'.
umc_rows = COVID['iso_code'].eq('OWID_UMC')
COVID_UMC = COVID.loc[umc_rows]

In [143]:
# Columns used as model inputs for the OWID_UMC subset.
umc_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_UMC[umc_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_UMC['new_deaths']

In [144]:
# Fit a seeded gradient-boosting regressor on the full OWID_UMC feature matrix.
umc_gb_reg = GradientBoostingRegressor(random_state=13)
umc_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
umc_pred_GBM = umc_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_UMC = COVID_UMC[['date', 'location', 'new_deaths']].copy()
COVID_UMC['predict_death'] = umc_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_UMC.loc[COVID_UMC['predict_death'] <= 0, 'predict_death'] = 0

## (소득) 3분위 중 3분위 기준: 중하위소득 국가(OWID_LMC)

In [145]:
# Rows of COVID whose `iso_code` equals 'OWID_LMC'.
lmc_rows = COVID['iso_code'].eq('OWID_LMC')
COVID_LMC = COVID.loc[lmc_rows]

In [146]:
# Columns used as model inputs for the OWID_LMC subset.
lmc_feature_cols = [
    'total_cases',
    'excess_mortality_cumulative_absolute',
    'icu_patients',
    'total_vaccinations',
    'stringency_index',
    'new_cases',
    'weekly_hosp_admissions',
    'weekly_icu_admissions',
    'positive_rate',
    'reproduction_rate',
    'new_tests',
    'total_tests',
]
X = COVID_LMC[lmc_feature_cols]
# Regression target: the `new_deaths` column.
y = COVID_LMC['new_deaths']

In [147]:
# Fit a seeded gradient-boosting regressor on the full OWID_LMC feature matrix.
lmc_gb_reg = GradientBoostingRegressor(random_state=13)
lmc_gb_reg.fit(X, y)

# Predictions for the same rows the model was fitted on, rounded to integers.
lmc_pred_GBM = lmc_gb_reg.predict(X).round(0)

# Reduce to the reporting columns. .copy() makes this an independent frame, so
# the column added below is not written onto a slice of another DataFrame.
COVID_LMC = COVID_LMC[['date', 'location', 'new_deaths']].copy()
COVID_LMC['predict_death'] = lmc_pred_GBM

# Floor non-positive predictions at zero. Only `predict_death` is written: a
# bare boolean-mask assignment (`df[mask] = 0`) would overwrite date, location
# and new_deaths in those rows as well.
COVID_LMC.loc[COVID_LMC['predict_death'] <= 0, 'predict_death'] = 0