# LAB 01: MỐI QUAN HỆ CỦA DỮ LIỆU

## 1. Thông tin nhóm

| MSSV     | HỌ VÀ TÊN          |
|:--------:|:-------------------|
| 20127323 | Võ Nhật Tân        |
| 20127447 | Ngô Đức Bảo        |
| 20127275 | Lê Nguyễn Nhật Phú |
| 20127681 | Nguyễn Thiên Phúc  |


## 2. Lấy thông tin từ website

In [3]:
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import date, datetime

# Print every float inside a NumPy array with exactly two decimal places.
np.set_printoptions(formatter={'float': '{:0.2f}'.format})

In [4]:
# Scrape the country table from Worldometers and store it as a raw CSV
# named after the "Last updated" date shown on the page.
from io import StringIO  # local import: read_html expects a file-like object

driver = webdriver.Chrome()
try:
    driver.maximize_window()

    driver.get("https://www.worldometers.info/coronavirus/#main_table")

    # Wait up to 10 seconds for the table to be present in the DOM.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "main_table_countries_today"))
    )

    # The element right after #page-top holds a text of the form
    # "Last updated: <Month> <day>, <year>, <HH:MM> <TZ>"; its date names the file.
    updated_date = driver.find_element(By.CSS_SELECTOR, 'div#page-top+div')
    today = datetime.strptime(updated_date.text, "Last updated: %B %d, %Y, %H:%M %Z")

    # Rewrite "N/A" cells to "-" so the preprocessing step can recognise them.
    pre_process_html = element.get_attribute('outerHTML').replace('>N/A<', '>-<')
finally:
    # Always close the browser, even when the wait times out or parsing fails,
    # so no Chrome process is left running.
    driver.quit()

# Passing a literal HTML string to read_html is deprecated (pandas >= 2.1);
# wrapping it in StringIO works on old and new versions alike.
df = pd.read_html(StringIO(pre_process_html), keep_default_na=False)[0]
df.to_csv(f'./raw_data/{today.strftime("%d-%m-%Y")+"_raw"}.csv', index=False, encoding='UTF-8')

## 3. Tiền xử lý dữ liệu

In [None]:
from os import listdir

In [None]:
def pre_process_file(file_path):
    """Load one raw scraped CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Rename the header that contains a non-breaking space.
      2. Drop the '#' and 'Serious, Critical' columns.
      3. Drop the last row (the total row) and reset the index.
      4. Fill empty cells with 0, then turn the '-' placeholder into NaN.
      5. Cast every non-numeric column except 'Country, Other' to float.
      6. Verify that the per-million columns match the values recomputed
         from the absolute counts and the population.

    Parameters
    ----------
    file_path : str
        Path of the raw CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    ValueError
        If 'Tot Cases/ 1M pop' or 'Deaths/ 1M pop' disagrees with the
        recomputed value on any checked row.
    """
    df = pd.read_csv(file_path)

    # Rename the abnormal column name (it contains a non-breaking space, \xa0)
    df.rename(columns={'Tot\xa0Cases/ 1M pop': 'Tot Cases/ 1M pop'}, inplace=True)

    # Drop the unnecessary columns
    df.drop(columns=['#', 'Serious, Critical'], inplace=True)

    # Drop the total row (last row of the table)
    df = df.drop(labels=[len(df) - 1]).reset_index(drop=True)

    # Empty cells become 0 first; only afterwards is the NA marker ('-')
    # replaced by np.nan, so the two kinds of "missing" stay distinguishable.
    df.fillna(0, inplace=True)
    df.replace(to_replace='-', value=np.nan, inplace=True)

    # Cast the remaining text columns to float. Columns are selected by
    # "not numeric" rather than by the object dtype so the cast also applies
    # when text columns are stored with a dedicated string dtype.
    for column_name in df.columns:
        if column_name == 'Country, Other':
            continue
        if pd.api.types.is_numeric_dtype(df[column_name]):
            continue
        df[column_name] = df[column_name].astype('float')

    # Check if the data has inconsistent values.
    # NOTE(review): the slice skips the first row and also the last remaining
    # row, although the total row was already dropped above - confirm that
    # leaving the last row unchecked is intended.
    sub_df = df.iloc[1:-1, :]

    # (per-million column, absolute-count column it is derived from)
    per_million_checks = (
        ('Tot Cases/ 1M pop', 'Total Cases'),
        ('Deaths/ 1M pop', 'Total Deaths'),
    )
    for rate_column, count_column in per_million_checks:
        # A population of 0 yields +/-inf; those rows are expected to hold 0.
        expected = (
            (sub_df[count_column] / sub_df['Population'] * 10**6)
            .round()
            .replace([np.inf, -np.inf], 0)
        )
        if not np.all(sub_df[rate_column] == expected):
            # Name the column that actually failed the check.
            raise ValueError(f"Data in the {rate_column} column is not consistent")

    return df

In [None]:
# Apply the preprocessing to every raw CSV in raw_data and save the result
# to the data folder under the same name without the '_raw' suffix.
# Only '.csv' entries are taken: listdir returns every directory entry,
# including hidden files or sub-directories that read_csv cannot load.
files = [f for f in listdir('./raw_data/') if f.endswith('.csv')]

for file in files:
    url = './raw_data/' + file

    df = pre_process_file(url)
    df.to_csv('./data/' + file.replace('_raw', ''), index=False, encoding='UTF-8')

## 4. Trực quan hóa dữ liệu