# EDA for COVID-19 Dataset

In [43]:
import os
import cv2
import folium
import imgkit
import branca
import akshare
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Global matplotlib look-and-feel for every figure drawn in this notebook.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases no longer accept the old names, so use the new name when the
# installed version lists it and fall back to the legacy name otherwise.
_darkgrid_style = next(
    (style for style in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
     if style in plt.style.available),
    "seaborn-darkgrid",
)
plt.style.use(_darkgrid_style)
# SimHei is a Chinese-capable font, so CJK labels render as glyphs.
plt.rcParams["font.sans-serif"]=["SimHei"]
# Use an ASCII hyphen for negative tick labels instead of the Unicode minus sign
# (presumably because the font above lacks that glyph -- confirm).
plt.rcParams["axes.unicode_minus"] = False

In [3]:
# Download the COVID-19 history through akshare and keep a CSV snapshot on disk.
epidemic_df = akshare.covid_19_history()
epidemic_df.to_csv("data/epidemic_data.csv", index=False)

# Summarise what was downloaded: the dates on the first and last rows, and the
# number of distinct values in the "country" column.
report_dates = epidemic_df["date"]
first_date, last_date = report_dates.iloc[0], report_dates.iloc[-1]
n_reporting = len(epidemic_df["country"].unique())
print("Reports date from {} to {}.".format(first_date, last_date))
print("There {} countries report COVID-19.".format(n_reporting))

Reports date from 2019-12-01 to 2020-04-09.
There 209 countries report COVID-19.


### Data Preprocessing

In [4]:
# Lookup table from the "Chinese" name column to the "Alpha3" code column
# (presumably ISO 3166-1 alpha-3 codes -- confirm against the spreadsheet).
# header=1: the column names are taken from the sheet's second row.
name_to_code = pd.read_excel("data/name_to_code.xls", header = 1)
# Keep only rows whose "Chinese" value tests as not upper-case; rows where the
# test yields a missing value are dropped as well.
is_not_upper = name_to_code["Chinese"].str.isupper().eq(False)
name_to_code = name_to_code.loc[is_not_upper]
# On duplicate names the last row wins.
n2c_dict = dict(zip(name_to_code["Chinese"], name_to_code["Alpha3"]))

In [5]:
# Translate country names to their codes; names missing from n2c_dict become NaN.
country_codes = epidemic_df["country"].map(lambda x: n2c_dict.get(x, np.nan))
# Swap in the codes, drop the sub-national columns, then discard rows with any
# missing value (including unmapped countries) and exact duplicate rows.
epidemic_df = (
    epidemic_df
    .assign(country=country_codes)
    .drop(columns=["countryCode", "province", "provinceCode", "city", "cityCode"])
    .dropna()
    .drop_duplicates()
)

In [39]:
# Every distinct code in the lookup table, in order of first appearance.
countries = name_to_code["Alpha3"].unique().tolist()
n_country = len(countries)
# Distinct report dates; later cells reuse them as file-name stems.
date = list(epidemic_df["date"].unique())
# Template frame with one all-zero row per country code, used to pad dates on
# which a country has no record.
metric_columns = ("confirmed", "suspected", "cured", "dead")
zeros = pd.DataFrame({
    "country": countries,
    **{metric: [0] * n_country for metric in metric_columns},
})
def fill_data(df):
    """Pad one date's records with a zero row for every country not present.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single date. Must contain a "country" column; a
        "date" column is removed when present.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by the rows of the module-level ``zeros``
        frame, keeping only the first row per country, so a reported row
        takes precedence over its zero placeholder.
    """
    # errors="ignore": depending on the pandas version, groupby(...).apply may
    # pass the group frame without the grouping column, in which case an
    # unconditional drop would raise KeyError.
    reported = df.drop(columns="date", errors="ignore")
    return pd.concat([reported, zeros]).drop_duplicates("country")
# Run fill_data on each date's rows, then regroup on the outermost index level
# of the result so that iterating `groups` yields one (key, frame) pair per date.
filled_by_date = epidemic_df.groupby("date").apply(fill_data)
groups = filled_by_date.groupby(level=0)

### World Map Visualization

In [103]:
# GeoJSON file with the country outlines drawn on every map.
country_geo = "world-countries.json"
# country -> confirmed lookup (on repeated countries the last row wins).
# NOTE(review): not referenced by the cells visible here.
map_dict = epidemic_df.set_index("country")["confirmed"].to_dict()
# Largest confirmed count anywhere in the data; anchors the top of the scale.
MAX = max(epidemic_df["confirmed"])
# Five colours placed at 0, 0.01 %, 50 %, 80 % and 100 % of MAX.
palette = ["#fffbf9", "#fef2cd", "#ff6349", "#b90702", "#a6311f"]
breakpoints = [0, 0.0001*MAX, 0.5*MAX, 0.8*MAX, MAX]
color_scale = branca.colormap.LinearColormap(colors=palette, index=breakpoints)

In [104]:
def get_color(group, feature):
    """Return the fill colour for one GeoJSON feature.

    Looks up ``feature["id"]`` in the "country" column of ``group`` and maps
    the first matching "confirmed" value through the module-level
    ``color_scale``. Features with no matching row get "#ffffffff".
    """
    matches = group.loc[group["country"] == feature["id"], "confirmed"].values
    if len(matches) == 0:
        return "#ffffffff"
    return color_scale(matches[0])

In [105]:
# Render one choropleth HTML page per entry of `groups`.
html_dir = "data/html_data"
# Create the output folder up front: saving a map into a directory that does
# not exist fails, which is what happens on a fresh checkout.
os.makedirs(html_dir, exist_ok=True)
for name, group in groups:
    world_map=folium.Map(location=[0,0], zoom_start=2)
    folium.features.GeoJson(
        data=country_geo,
        name=name,
        # `group=group` pins this iteration's frame to the callback, so the
        # colours stay tied to this date even if the callback is invoked after
        # the loop variable has been rebound.
        style_function = lambda feature, group=group: {
            "fillColor": get_color(group, feature),
            "fillOpacity": 0.7,
            "color": "black",
            "weight": 0.5,
        },
        overlay=False,
        show=False,
        control=False,
        smooth_factor=0.5
    ).add_to(world_map)
    # One file per group key, e.g. data/html_data/<name>.html
    world_map.save(os.path.join(html_dir, name+".html"))

In [110]:
# Screenshot every per-date HTML map with a Selenium-driven Chrome session.
image_dir = os.path.abspath("data/image_data")
# save_screenshot signals failure through its return value instead of raising,
# so make sure the target folder exists and check the result explicitly.
os.makedirs(image_dir, exist_ok=True)
# NOTE(review): Selenium 4 deprecates `executable_path` in favour of a Service
# object -- confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=os.path.abspath("chromedriver"))
try:
    for d in date:
        driver.get("file://"+os.path.abspath("data/html_data/"+d+".html"))
        target = os.path.join(image_dir, d+".png")
        if not driver.save_screenshot(target):
            raise OSError("Could not save screenshot to " + target)
finally:
    # Always shut the browser down, even when a page fails, so no Chrome
    # process is left running.
    driver.quit()

In [111]:
# Stamp each screenshot with its date, crop it, and write the result to
# data/processed_image_data/.
processed_dir = "data/processed_image_data"
# cv2.imwrite returns False rather than raising when it cannot write, so
# create the folder first and check the result below.
os.makedirs(processed_dir, exist_ok=True)
for d in date:
    src_path = os.path.abspath("data/image_data/"+d+".png")
    img = cv2.imread(src_path)
    # cv2.imread returns None for a missing or unreadable file; fail with a
    # message that names the file.
    if img is None:
        raise FileNotFoundError("Cannot read screenshot: " + src_path)
    # Draw the date string at pixel (500, 50), font scale 1.0, thickness 2.
    cv2.putText(img, d, (500, 50), cv2.FONT_HERSHEY_SIMPLEX, 1., (50,50,150), 2)
    # Crop 20 px off the top, 50 px off the bottom and 87 px off each side
    # (presumably to trim page margins and map controls -- confirm).
    dst_path = processed_dir+"/"+d+".png"
    if not cv2.imwrite(dst_path, img[20:-50,87:-87]):
        raise OSError("Could not write processed image: " + dst_path)

### Convert images to video

In [114]:
# Load the processed frames in date order and encode them into project.mp4
# at 3 frames per second.
img_array = []
size = None
for d in date:
    frame_path = "data/processed_image_data/"+d+".png"
    img = cv2.imread(frame_path)
    # cv2.imread returns None for a missing or unreadable file; report which
    # frame is at fault.
    if img is None:
        raise FileNotFoundError("Cannot read frame: " + frame_path)
    height, width, layers = img.shape
    # VideoWriter expects (width, height); the last frame's size is used.
    size = (width,height)
    img_array.append(img)

# Without frames there is no frame size to open the writer with.
if not img_array:
    raise ValueError("No frames to encode: the date list is empty.")

# NOTE(review): the 'DIVX' fourcc combined with an .mp4 container may trigger a
# codec fallback on some OpenCV builds -- confirm the output plays as expected.
out = cv2.VideoWriter('project.mp4', cv2.VideoWriter_fourcc(*'DIVX'), 3, size)
try:
    for frame in img_array:
        out.write(frame)
finally:
    # Release even if a write fails so the file handle is closed.
    out.release()