In [1]:
# Importing libraries
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight') 
# Above is a special style template for matplotlib, highly useful for visualizing time series data
%matplotlib inline
# --- plotly ---

import statsmodels.api as sm
from numpy.random import normal, seed
from scipy.stats import norm
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.arima_model import ARIMA
import math
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error


# 1. Load data

In [2]:
# Country-level population table; cleaned and normalised in the next cell.
population_ = pd.read_csv("data/population_by_country_2020.csv")

# keep_default_na=False stops pandas from turning its default NA markers
# (including empty cells) into NaN, so columns that contain blanks are loaded
# as text. The deal_* helpers convert the columns they need with pd.to_numeric.
all_df = pd.read_csv(
    "data/owid-covid-data.csv",
    keep_default_na=False,
)

In [3]:
# --- Clean and normalise the population table -------------------------------
# Steps: (1) re-spell some country names, (2) rename columns to snake_case,
# (3) parse text columns into floats, (4) keep the raw head-count in
# `r_population`, (5) divide every numeric feature by its column maximum.

# (1) Spellings in the population table -> spellings used as the `country` key.
_COUNTRY_RENAMES = {
    'United States': "US",
    'South Korea': "Korea, South",
    'Sao Tome & Principe': "Sao Tome and Principe",
    'Taiwan': "Taiwan*",
    'Myanmar': "Burma",
    'Congo': "Congo (Brazzaville)",
    'DR Congo': "Congo (Kinshasa)",
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Czech Republic (Czechia)': "Czechia",
    'Saint Kitts & Nevis': "Saint Kitts and Nevis",
    'St. Vincent & Grenadines': "Saint Vincent and the Grenadines",
}
# Whole-value replacement (not substring), equivalent to one .loc assignment per name.
population_['Country (or dependency)'] = (
    population_['Country (or dependency)'].replace(_COUNTRY_RENAMES)
)

# (2) 'rert_rate' is a misspelling of the 'Fert. Rate' column; it is kept because
# deal_global_df reads the column under that name.
population_ = population_.rename(columns={
    'Population (2020)': 'population', 'Country (or dependency)': 'country',
    'Yearly Change': 'yearly_change', 'Net Change': 'net_change',
    'Density (P/Km²)': 'density', 'Land Area (Km²)': 'land_area',
    'Migrants (net)': 'migrants', 'Fert. Rate': 'rert_rate', 'Med. Age': 'med_age',
    'Urban Pop %': 'urban_pop', 'World Share': 'world_share',
})


def _text_to_float(column, na_to=None):
    """Parse a text column such as '1.5 %' into floats.

    Spaces and '%' signs are removed. When ``na_to`` is given, the literal
    'N.A.' is replaced by that string first. ``astype(str)`` makes this safe
    for cells that are not strings (the previous per-cell ``str.replace``
    raised AttributeError on them).
    """
    text = column.astype(str).str.replace(" ", "", regex=False)
    if na_to is not None:
        text = text.str.replace("N.A.", na_to, regex=False)
    return text.str.replace("%", "", regex=False).astype("float")


# (3) Text -> float. 'N.A.' is imputed as zero for urban_pop, rert_rate and med_age.
population_["yearly_change"] = _text_to_float(population_["yearly_change"])
population_["urban_pop"] = _text_to_float(population_["urban_pop"], na_to="0")
population_["world_share"] = _text_to_float(population_["world_share"])
population_["rert_rate"] = _text_to_float(population_["rert_rate"], na_to="0.0")
population_["med_age"] = _text_to_float(population_["med_age"], na_to="0.0")

# Plain assignment instead of `population_.col.fillna(0, inplace=True)`: the
# chained in-place form does not update the frame under pandas Copy-on-Write.
population_["rert_rate"] = population_["rert_rate"].fillna(0)
population_["migrants"] = population_["migrants"].fillna(0)

for col in ("net_change", "density", "land_area", "population"):
    population_[col] = population_[col].astype("float")

# (4) Preserve the unscaled population before it is normalised below. The guard
# stops a second run of this cell from overwriting it with already-scaled values.
if "r_population" not in population_.columns:
    population_["r_population"] = population_["population"]

# (5) Scale each feature by its column maximum. Series.max() skips NaN, whereas
# the built-in max() gives an order-dependent result when NaN is present.
for col in ("population", "yearly_change", "urban_pop", "world_share", "net_change",
            "density", "land_area", "rert_rate", "med_age", "migrants"):
    population_[col] = population_[col] / population_[col].max()

In [4]:
def deal_country_data(population_, train_df, oname, nname):
    """Build the daily case/fatality frame for one country, joined with population features.

    Parameters
    ----------
    population_ : pandas.DataFrame
        Population table with a ``country`` column; all of its columns are
        left-joined onto the result.
    train_df : pandas.DataFrame
        Source table. Must contain ``location``, ``date``, ``new_cases``,
        ``new_deaths``, ``population_density``, ``median_age``,
        ``aged_65_older`` and ``aged_70_older``. It is not modified.
    oname : str
        Value of ``train_df["location"]`` to select.
    nname : str
        Name written to the ``country`` column; this is the join key against
        ``population_``.

    Returns
    -------
    pandas.DataFrame
        One row per selected source row (exact duplicates removed) with columns
        ``country, date, confirmed, fatalities, population_density, median_age,
        aged_65_older, aged_70_older`` followed by the ``population_`` columns.

    Notes
    -----
    * Missing ``confirmed`` / ``fatalities`` values are filled with 0 *before*
      numeric conversion, so anything ``pd.to_numeric`` itself turns into NaN
      stays NaN in the result.
    * Negative daily counts are replaced by their absolute value.
    """
    # .copy(): relabel an independent frame instead of assigning into a
    # boolean-filtered slice of train_df (SettingWithCopyWarning).
    data_df = train_df[train_df["location"] == oname].copy()
    data_df["location"] = nname

    # Source column -> output column, in output order.
    column_map = {
        "location": "country",
        "date": "date",
        "new_cases": "confirmed",
        "new_deaths": "fatalities",
        "population_density": "population_density",
        "median_age": "median_age",
        "aged_65_older": "aged_65_older",
        "aged_70_older": "aged_70_older",
    }
    df = data_df[list(column_map)].rename(columns=column_map)

    country_df = df.merge(population_, how="left", on=['country']).drop_duplicates()

    # Assign the result back: the chained `country_df.col.fillna(0, inplace=True)`
    # form is silently ineffective under pandas Copy-on-Write.
    for col in ("confirmed", "fatalities"):
        country_df[col] = country_df[col].fillna(0)

    for col in ("confirmed", "fatalities", "population_density",
                "median_age", "aged_65_older", "aged_70_older"):
        country_df[col] = pd.to_numeric(country_df[col])

    # Negative daily counts are flipped to positive.
    country_df["confirmed"] = country_df["confirmed"].abs()
    country_df["fatalities"] = country_df["fatalities"].abs()

    return country_df

def deal_global_df(population_, train_df, oname, nname):
    """Build the daily case/fatality frame for an aggregate row (e.g. "World").

    The selected rows are relabelled as ``nname`` and every row receives the
    same set of population features: the column-wise *mean* of ``population_``.

    Parameters
    ----------
    population_ : pandas.DataFrame
        Population table providing ``population, yearly_change, net_change,
        density, land_area, migrants, rert_rate, med_age, urban_pop,
        world_share``.
    train_df : pandas.DataFrame
        Source table with ``location``, ``date``, ``new_cases`` and
        ``new_deaths``. It is not modified.
    oname : str
        Value of ``train_df["location"]`` to select.
    nname : str
        Name written to the ``country`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``country, date, confirmed, fatalities`` followed by the ten
        averaged population features; exact duplicate rows are removed.

    Notes
    -----
    Missing counts are filled with 0 before numeric conversion, and negative
    counts are replaced by their absolute value.
    """
    # .copy(): avoid assigning into a boolean-filtered slice of train_df.
    data_df = train_df[train_df["location"] == oname].copy()
    data_df["location"] = nname

    column_map = {
        "location": "country",
        "date": "date",
        "new_cases": "confirmed",
        "new_deaths": "fatalities",
    }
    country_df = data_df[list(column_map)].rename(columns=column_map)

    for col in ("confirmed", "fatalities"):
        # Assignment rather than chained inplace fillna (a no-op under Copy-on-Write).
        country_df[col] = country_df[col].fillna(0)
        country_df[col] = pd.to_numeric(country_df[col])
        country_df[col] = country_df[col].abs()

    # One-row frame of averaged population features, keyed by the new name.
    feature_cols = ("population", "yearly_change", "net_change", "density",
                    "land_area", "migrants", "rert_rate", "med_age",
                    "urban_pop", "world_share")
    averages = {"country": nname}
    averages.update({col: population_[col].mean() for col in feature_cols})
    df = pd.DataFrame([averages])

    return country_df.merge(df, how="left", on=['country']).drop_duplicates()

def log2_(df):
    """Return a double-log-transformed copy of the rows with enough cases and deaths.

    Despite the name, the transform is the natural logarithm applied twice,
    ``ln(ln(x))``, not a base-2 logarithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``confirmed`` and ``fatalities`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Rows where both ``confirmed >= 3`` and ``fatalities >= 3`` (original
        index labels kept), with both columns replaced by ``ln(ln(value))`` and
        a new ``day`` column numbering the kept rows 0, 1, 2, ...
    """
    # The >= 3 threshold guarantees ln(x) > 1, so the outer log is defined and positive.
    keep = (df["confirmed"] >= 3) & (df["fatalities"] >= 3)
    # .copy(): the column assignments below must not target a filtered slice of
    # the caller's frame (SettingWithCopyWarning).
    df = df[keep].copy()
    for col in ("confirmed", "fatalities"):
        df[col] = np.log(np.log(df[col]))
    df["day"] = list(range(df.shape[0]))
    return df

# NOTE(review): this definition is shadowed by the second `plot_data` defined
# at the end of this same cell, so it is never the one that runs. The two
# differ only in that the later one opens a 15x7 figure first.
def plot_data(t):
    """Plot the `confirmed` and `fatalities` columns of `t` against row position."""
    plt.plot(range(0, t.shape[0]), t["confirmed"])
    plt.plot(range(0, t.shape[0]), t["fatalities"])
    plt.title('Confirmed & Fatalities Data')
    plt.ylabel('confirmed & Fatalities')
    plt.xlabel('Epoch')
    plt.show()

def saveCsvDelNa(df, name):
    """Write ``df`` to ./data/<name>.csv, leaving out every row that has a missing value.

    A row is dropped if *any* of its columns is NaN. The caller's frame is not
    modified, and the index is written as the first CSV column.
    """
    target = f"./data/{name}.csv"
    complete_rows = df.dropna(axis=0, how='any')
    complete_rows.to_csv(target)
    
def saveCsvRepNa(df, name):
    """Write ``df`` to ./data/<name>.csv with missing case/fatality counts set to 0.

    Only ``confirmed`` and ``fatalities`` are filled, and both are cast to
    float32; NaN in other columns is written as-is. The index is written as the
    first CSV column.

    The caller's frame is left untouched. The previous version assigned the
    filled columns back onto ``df``, which is why call sites pass ``df.copy()``.
    """
    filled = df.assign(
        confirmed=df["confirmed"].fillna(0).astype(np.float32),
        fatalities=df["fatalities"].fillna(0).astype(np.float32),
    )
    filled.to_csv("./data/" + name + ".csv")
    
def saveCsv(df, name):
    """Write ``df`` unchanged (index included) to ./data/<name>.csv."""
    target = f"./data/{name}.csv"
    df.to_csv(target)
    
    
def addCsv(df):  # cumulative sum
    """Return a copy of ``df`` with daily counts turned into running totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``confirmed`` and ``fatalities`` columns holding per-row
        increments. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the rows that have a missing value in *any*
        column, in which ``confirmed`` and ``fatalities`` hold the float32
        cumulative sum over the remaining rows, in row order.
    """
    temp_df = df.dropna(axis=0, how='any').copy()
    for col in ("confirmed", "fatalities"):
        increments = temp_df[col].astype(np.float32).to_numpy()
        # np.cumsum accumulates left-to-right in float32, the same arithmetic as
        # the explicit `a[i+1] += a[i]` loop it replaces.
        temp_df[col] = np.cumsum(increments)
    return temp_df

def plot_data(t):
    """Plot the ``confirmed`` and ``fatalities`` columns of ``t`` on one 15x7 figure.

    The x-axis is the row position (0 .. len(t)-1), not the frame's index or
    date column. Both series share a single y-axis, so they are drawn on the
    same scale. The figure is shown immediately and nothing is returned.
    """
    positions = range(0, t.shape[0])
    # The figure handle is not needed afterwards, so it is not bound to a name.
    plt.figure(figsize=(15, 7))
    # Label each series so the two lines can be told apart in the legend.
    plt.plot(positions, t["confirmed"], label="confirmed")
    plt.plot(positions, t["fatalities"], label="fatalities")
    plt.title('Confirmed & Fatalities Data')
    plt.ylabel('confirmed & Fatalities')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

In [5]:

# Per-country frames. For these countries the source `location` value is also
# the join key used against the population table.
canada_df, india_df, russia_df, uk_df = (
    deal_country_data(population_, all_df, country, country)
    for country in ("Canada", "India", "Russia", "United Kingdom")
)



In [15]:
# Show the last 30 rows of the UK frame.
uk_df.tail(30)

Unnamed: 0,country,date,confirmed,fatalities,population_density,median_age,aged_65_older,aged_70_older,population,yearly_change,net_change,density,land_area,migrants,rert_rate,med_age,urban_pop,world_share,r_population
421,United Kingdom,2021-03-27,3909.0,58.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
422,United Kingdom,2021-03-28,3947.0,21.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
423,United Kingdom,2021-03-29,4783.0,23.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
424,United Kingdom,2021-03-30,4071.0,55.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
425,United Kingdom,2021-03-31,4115.0,43.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
426,United Kingdom,2021-04-01,4565.0,51.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
427,United Kingdom,2021-04-02,3422.0,52.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
428,United Kingdom,2021-04-03,3424.0,10.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
429,United Kingdom,2021-04-04,2405.0,10.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0
430,United Kingdom,2021-04-05,2831.0,28.0,272.898,40.8,18.517,12.527,0.047152,0.138021,0.02619,0.010669,0.014773,0.272987,0.257143,0.833333,0.83,0.047103,67814098.0


In [9]:
# Write each frame to ./data/<name>.csv, omitting rows that have any missing value.
for frame, file_stem in (
    (canada_df, "Canada"),
    (india_df, "India"),
    (russia_df, "Russia"),
    (uk_df, "United Kingdom"),
):
    saveCsvDelNa(frame, file_stem)

In [8]:
# Aggregate "World" rows, relabelled "global" and paired with averaged population features.
global_df = deal_global_df(population_, all_df, "World", "global")

# The source calls the country "United States"; the population table was
# re-keyed to "US", so the frame is relabelled to match.
us_df = deal_country_data(population_, all_df, "United States", "US")

# Remaining countries use the same name on both sides of the join.
# uk_df was already built in an earlier cell and is simply rebuilt here.
# NOTE(review): Italy, France and Japan carried a trailing "####" marker in the
# original cell; its meaning is not recorded.
italy_df, france_df, japan_df, china_df, spain_df, uk_df, germany_df = (
    deal_country_data(population_, all_df, country, country)
    for country in ("Italy", "France", "Japan", "China", "Spain", "United Kingdom", "Germany")
)


In [325]:
# saveCsv(global_df, 'global')
# saveCsv(us_df, 'us')

# "<country>_confirmed.csv": every row kept, missing case/fatality counts written as 0.
# A copy is passed so the in-memory frames are not changed by the fill.
# (The original cell wrote italy_confirmed twice; each file is now written once.)
for frame, file_stem in (
    (italy_df, 'italy'),
    (france_df, 'france'),
    (japan_df, 'japan'),
    (spain_df, 'spain'),
    (uk_df, 'uk'),
    (germany_df, 'germany'),
):
    saveCsvRepNa(frame.copy(), file_stem + '_confirmed')

# "<country>.csv": rows with any missing value are omitted. No copy is needed
# because saveCsvDelNa drops rows on a new frame and never modifies its input.
for frame, file_stem in (
    (italy_df, 'italy'),
    (spain_df, 'spain'),
    (uk_df, 'uk'),
    (germany_df, 'germany'),
):
    saveCsvDelNa(frame, file_stem)

In [301]:
# Spot-check addCsv on three consecutive rows (positions 180-182) of the Spain
# frame: the output's confirmed/fatalities are running totals over those rows only.
addCsv(spain_df.iloc[180:183])

Unnamed: 0,country,date,confirmed,fatalities,population_density,median_age,aged_65_older,aged_70_older,population,yearly_change,net_change,density,land_area,migrants,rert_rate,med_age,urban_pop,world_share,r_population
180,Spain,2020-07-30,2789.0,2.0,93.105,45.5,19.436,13.799,0.032507,0.010417,0.001325,0.003569,0.030458,0.041893,0.185714,0.9375,0.8,0.032485,46751175.0
181,Spain,2020-07-31,5881.0,4.0,93.105,45.5,19.436,13.799,0.032507,0.010417,0.001325,0.003569,0.030458,0.041893,0.185714,0.9375,0.8,0.032485,46751175.0
182,Spain,2020-08-01,5881.0,4.0,93.105,45.5,19.436,13.799,0.032507,0.010417,0.001325,0.003569,0.030458,0.041893,0.185714,0.9375,0.8,0.032485,46751175.0


In [326]:
# NOTE(review): the variable is named `spain_df_`, but the file read is
# data/italy.csv (the recorded output shows Italy rows) — confirm which country
# was intended before relying on this frame.
# The file was saved with its index, so the old index comes back as an extra
# unnamed first column.
spain_df_ = pd.read_csv("data/italy.csv")
spain_df_

Unnamed: 0.1,Unnamed: 0,country,date,confirmed,fatalities,population_density,median_age,aged_65_older,aged_70_older,population,yearly_change,net_change,density,land_area,migrants,rert_rate,med_age,urban_pop,world_share,r_population
0,21,Italy,2020-02-21,17.0,1.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
1,22,Italy,2020-02-22,42.0,1.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
2,23,Italy,2020-02-23,93.0,1.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
3,24,Italy,2020-02-24,74.0,4.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
4,25,Italy,2020-02-25,93.0,3.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,446,Italy,2021-04-21,13836.0,364.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
426,447,Italy,2021-04-22,16046.0,360.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
427,448,Italy,2021-04-23,14758.0,342.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0
428,449,Italy,2021-04-24,13814.0,322.0,205.859,47.9,23.021,16.24,0.042052,-0.039062,-0.006495,0.007822,0.017961,0.155993,0.185714,0.979167,0.69,0.042231,60479424.0


In [312]:
# Total of the daily confirmed counts in the UK frame (NaN entries are skipped).
uk_df["confirmed"].sum()

4430017.0