# LAB 01: MỐI QUAN HỆ CỦA DỮ LIỆU

## 1. Thông tin nhóm

| MSSV     | HỌ VÀ TÊN          |
|:--------:|:-------------------|
| 20127323 | Võ Nhật Tân        |
| 20127447 | Ngô Đức Bảo        |
| 20127275 | Lê Nguyễn Nhật Phú |
| 20127681 | Nguyễn Thiên Phúc  |


## 2. Lấy thông tin từ website

In [8]:
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

# Print NumPy floats with two decimal places.
np.set_printoptions(formatter=dict(float='{:0.2f}'.format))

In [9]:
def pre_process_html(table_html):
    """Clean scraped table markup before it is handed to ``pandas.read_html``.

    The function performs three steps:

    1. Cells whose entire content is ``N/A`` are rewritten to ``-``.
    2. Every ``<th>`` and every ``<td>`` that carries a non-empty
       ``data-continent`` attribute gets an inline ``style`` attribute.
    3. The content of each ``<th>`` is replaced by its plain text, which
       drops any markup nested inside the header cell.

    Args:
        table_html: HTML source containing one or more ``<table>`` elements.

    Returns:
        The processed markup as pretty-printed by BeautifulSoup.
    """
    # NOTE(review): '-' presumably serves as an explicit "not available"
    # marker that stays distinct from empty cells downstream - confirm.
    table_html = table_html.replace('>N/A<', '>-<')

    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ between machines. 'html.parser' ships with Python.
    table = BeautifulSoup(table_html, 'html.parser')

    def show_data_continent(tag):
        # Header cells always match; data cells match only when they carry
        # a non-empty data-continent attribute.
        if tag.name == 'th':
            return True

        return tag.name == 'td' and tag.attrs.get('data-continent', "") != ""

    for cell in table.find_all(show_data_continent):
        cell['style'] = "color: yellow"

    # Assigning .string replaces the children of the header cell with a
    # single text node holding its concatenated text.
    for header in table.find_all('th'):
        header.string = header.text

    return table.prettify()

In [10]:
# Scrape the three daily tables (today, yesterday, two days ago) from
# worldometers and keep their raw HTML for the next cell.
driver = webdriver.Chrome()
try:
    driver.maximize_window()

    driver.get("https://www.worldometers.info/coronavirus/#main_table")
    # Wait (up to 10 s) until the main table is present in the DOM.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "main_table_countries_today")))

    # Parse the "Last updated: ..." banner; `today` later names the CSV files.
    updated_date = driver.find_element(By.CSS_SELECTOR, 'div#page-top+div')
    today = datetime.strptime(updated_date.text, "Last updated: %B %d, %Y, %H:%M %Z")

    # Today's table
    table_element_today = driver.find_element(By.ID, 'main_table_countries_today')
    table_html_today = table_element_today.get_attribute('outerHTML')

    # Yesterday's table
    driver.find_element(By.ID, 'nav-yesterday-tab').click()
    table_element_yesterday = driver.find_element(By.ID, 'main_table_countries_yesterday')
    table_html_yesterday = table_element_yesterday.get_attribute('outerHTML')

    # Table from two days ago
    driver.find_element(By.ID, 'nav-yesterday2-tab').click()
    table_element_yesterday2 = driver.find_element(By.ID, 'main_table_countries_yesterday2')
    table_html_yesterday2 = table_element_yesterday2.get_attribute('outerHTML')
finally:
    # Always shut the browser down, even when a lookup, the wait or the date
    # parsing above raises; otherwise the Chrome process is left running.
    driver.quit()

In [11]:
# Parse the three scraped tables and store each one as a raw CSV named after
# the day it describes (today, today - 1, today - 2).
from io import StringIO
from pathlib import Path

table_html = pre_process_html('\n'.join([table_html_today, table_html_yesterday, table_html_yesterday2]))

# pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
# object is accepted by old and new versions alike.
# keep_default_na=False keeps empty cells as '' instead of turning them into NaN.
df_list = pd.read_html(StringIO(table_html), keep_default_na=False)

# Fail early, before anything is written, if a table went missing.
if len(df_list) < 3:
    raise ValueError(f"Expected 3 tables in the scraped HTML, found {len(df_list)}")

# to_csv does not create missing directories.
raw_dir = Path('./raw_data')
raw_dir.mkdir(parents=True, exist_ok=True)

for day, day_df in enumerate(df_list[:3]):
    exp_day = today - timedelta(days=day)
    day_df.to_csv(raw_dir / f'{exp_day.strftime("%d-%m-%Y")}_raw.csv', index=False, encoding='UTF-8')

## 3. Tiền xử lý dữ liệu

In [12]:
from os import listdir

In [13]:
def pre_process_file(file_path):
    """Load one raw CSV, clean it and verify its per-million columns.

    Cleaning steps, in order:

    * rename the column whose name contains a non-breaking space
      (``'Tot\\xa0Cases/1M pop'``) to ``'Tot Cases/1M pop'``;
    * drop the ``'#'`` and ``'Serious,Critical'`` columns;
    * drop the last row (the total row) and rebuild the index;
    * fill missing values with 0, then turn the ``'-'`` marker into NaN;
    * cast every remaining object column except ``'Country,Other'`` and
      ``'Continent'`` to float.

    Afterwards ``'Tot Cases/1M pop'`` and ``'Deaths/1M pop'`` are recomputed
    from ``TotalCases`` / ``TotalDeaths`` and ``Population`` and compared
    with the stored values.

    Args:
        file_path: Path of the raw CSV file to read.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If a stored per-million column differs from the value
            recomputed from the totals and the population.
    """
    df = pd.read_csv(file_path)

    # Rename the column whose name contains a non-breaking space
    df.rename(columns={'Tot\xa0Cases/1M pop': 'Tot Cases/1M pop'}, inplace=True)

    # Drop the unnecessary columns
    df.drop(columns=['#', 'Serious,Critical'], inplace=True)

    # Drop the total row
    df = df.drop(labels=[len(df) - 1]).reset_index(drop=True)

    # Order matters: missing cells become 0 first, and only then is the
    # explicit '-' marker turned into NaN, so the two cases stay distinct.
    df.fillna(0, inplace=True)
    df.replace(to_replace='-', value=np.nan, inplace=True)

    # Cast the remaining text-typed columns (all but the two label columns)
    label_columns = ('Country,Other', 'Continent')
    for column_name in df.select_dtypes(include='object').columns:
        if column_name in label_columns:
            continue
        df[column_name] = df[column_name].astype('float')

    # Consistency check on every row except the first and the last one.
    # NOTE(review): presumably those two rows are aggregates rather than
    # countries - confirm against the raw files.
    sub_df = df.iloc[1:-1, :]

    def per_million(count):
        # A zero population yields +/-inf; such rows are expected to store 0.
        ratio = (count / sub_df['Population'] * 10**6).round()
        return ratio.replace([np.inf, -np.inf], 0)

    # NOTE(review): NaN never compares equal, so a row that is NaN in both
    # the stored and the recomputed value is reported as inconsistent.
    if not np.all(sub_df['Tot Cases/1M pop'] == per_million(sub_df['TotalCases'])):
        raise ValueError("Data in the Tot Cases/ 1M pop column is not consistent")

    if not np.all(sub_df['Deaths/1M pop'] == per_million(sub_df['TotalDeaths'])):
        # The message names the column that is actually being validated.
        raise ValueError("Data in the Deaths/1M pop column is not consistent")

    return df

In [14]:
# Run the preprocessing on every raw CSV in ./raw_data/ and save the cleaned
# result under ./data/ with the '_raw' suffix removed from the file name.
import os

# to_csv does not create missing directories.
os.makedirs('./data/', exist_ok=True)

# listdir returns entries in arbitrary order and may include unrelated
# entries (checkpoint folders, OS metadata); keep only the raw CSVs and
# process them in a stable order.
files = sorted(f for f in listdir('./raw_data/') if f.endswith('_raw.csv'))

for file in files:
    url = './raw_data/' + file
    print(url)
    df = pre_process_file(url)
    df.to_csv('./data/' + file.replace('_raw', ''), index=False, encoding='UTF-8')

./raw_data/01-03-2023_raw.csv
./raw_data/02-03-2023_raw.csv
./raw_data/03-03-2023_raw.csv
./raw_data/04-03-2023_raw.csv
./raw_data/05-03-2023_raw.csv
./raw_data/26-02-2023_raw.csv
./raw_data/27-02-2023_raw.csv
./raw_data/28-02-2023_raw.csv


## 4. Trực quan hóa dữ liệu