# <b>Mục tiêu</b>
- Thu thập dữ liệu về thời tiết tại TP HCM theo từng ngày từ năm 2010 đến năm 2021.

- Các trường dữ liệu sẽ thu thập bao gồm:
    + Weather Type: Loại thời tiết
    + Highest Temperature: Nhiệt độ cao nhất trong ngày ($^\circ C$)
    + Lowest Temperature: Nhiệt độ thấp nhất trong ngày ($^\circ C$)
    + Wind Speed: Tốc độ gió (km/h)
    + Rain: Lượng mưa (mm)
    + Humidity: Độ ẩm (%)
    + Cloud: Độ che phủ của mây (%)
    + Pressure: Áp suất không khí (mb)

# <b>Import</b>

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
%pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [2]:
from bs4 import BeautifulSoup

In [4]:
%pip install Selenium

Collecting Selenium
  Downloading selenium-4.6.0-py3-none-any.whl (5.2 MB)
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting sortedcontainers
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.0.4-py3-none-any.whl (14 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: sortedcontainers, outcome, h11, exceptiongroup, async-generator, wsproto, trio, trio-websocket, Selenium
Successfully installed Selenium-4.6.0 async-generator-1.10 exceptiongroup-1.0.4 h11-0.14.0 outcome-1.2.0 sortedcontainers-2.4.0 trio-0.22.0 trio-websocket-0.9.2 wsproto-1.2.0

In [5]:
%pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-3.8.5-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Note: you may need to restart the kernel to use updated packages.
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-0.21.0 webdriver-manager-3.8.5


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# <b>Thu thập dữ liệu</b>

In [4]:
# Weather-history page for Ho Chi Minh City on worldweatheronline.com.
url = 'https://www.worldweatheronline.com/ho-chi-minh-city-weather-history/vn.aspx'
# Days per month, January..December, with February fixed at 28.
# NOTE(review): leap years (e.g. 2012, 2016, 2020) would need 29 for February
# if this list is used to enumerate calendar days — confirm where it is consumed.
days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
# Number of months in a year (equals len(days)).
months = 12
# NOTE(review): presumably the first year of the collection range — not used in
# the visible scraping cell; confirm against later cells.
year = 2010
# Accumulator for scraped records; the scraping cell appends one dict per row.
weather_info_list = []

In [7]:
# Set up a headless ChromeDriver session and scrape the historical weather
# table for one calendar day (15 November), one record per loop iteration.
#
# Reads:   url (module-level), the Selenium names imported above, re.
# Writes:  appends one dict per table row to the module-level weather_info_list.
# Cleanup: the browser is always closed, even if navigation or scraping fails.
service = ChromeService(executable_path=ChromeDriverManager().install())
options = Options()
# The '--headless' argument alone enables headless mode; the former
# `options.headless = True` only added the same flag a second time and is
# deprecated in later Selenium releases.
options.add_argument('--headless')
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(service=service, options=options)

# Container holding the per-year result cells; every cell XPath below is
# built from this prefix.
history_xpath = '//*[@id="aspnetForm"]/section[2]/div/div/div[1]/div[6]/div[2]'

# Everything that touches the live browser sits inside the try block so that
# a failed page load or a missing element cannot leak the Chrome process.
try:
    driver.get(url)

    # Type the date to look up into the "past date" box and submit it.
    search_date = driver.find_element(by=By.XPATH, value='//*[@id="ctl00_MainContentHolder_txtPastDate"]')
    search_date.send_keys('15-11-2020')
    search_date.send_keys(Keys.RETURN)

    get_weather_button = driver.find_element(by=By.XPATH, value='//*[@id="ctl00_MainContentHolder_butShowPastWeather"]')
    get_weather_button.click()

    # Wait up to 10 seconds for the results container to be present in the DOM.
    weather_info = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, history_xpath))
    )

    # The cell index is the loop counter with one digit appended, so i=2
    # addresses div[21]..div[25], i=3 addresses div[31]..div[35], and so on.
    # The trailing digit picks the field: 1 year, 2 weather type,
    # 3 highest temperature, 4 lowest temperature, 5 wind speed.
    # NOTE: these XPaths start with '//' and therefore resolve against the
    # whole document, not relative to weather_info.
    for i in range(2, 14):
        # Year
        year_info = weather_info.find_element(by=By.XPATH, value=f'{history_xpath}/div[{i}1]')

        # Weather type (read from the icon's title attribute)
        weather_type_info = weather_info.find_element(by=By.XPATH, value=f'{history_xpath}/div[{i}2]/img')

        # Highest temperature
        highest_temp_info = weather_info.find_element(by=By.XPATH, value=f'{history_xpath}/div[{i}3]')

        # Lowest temperature
        lowest_temp_info = weather_info.find_element(by=By.XPATH, value=f'{history_xpath}/div[{i}4]')

        # Wind speed
        wind_speed_info = weather_info.find_element(by=By.XPATH, value=f'{history_xpath}/div[{i}5]')

        # Month and day are fixed to match the date typed into the search box;
        # numeric fields keep only the first run of digits from the cell text
        # and are stored as strings.
        weather_in_date = {
            'Date': '{}-{}-{}'.format(int(year_info.text), 11, 15),
            'Weather Type': weather_type_info.get_attribute("title"),
            'Highest Temperature': re.findall(r'\d+', highest_temp_info.text)[0],
            'Lowest Temperature': re.findall(r'\d+', lowest_temp_info.text)[0],
            'Wind Speed': re.findall(r'\d+', wind_speed_info.text)[0]
        }

        weather_info_list.append(weather_in_date)
finally:
    # Always shut the browser down, whether scraping succeeded or not.
    driver.quit()

In [13]:
weather_info_list

[{'Date': '2010-11-15',
  'Weather Type': 'Moderate or heavy rain shower',
  'Highest Temperature': '33',
  'Lowest Temperature': '24',
  'Wind Speed': '6'},
 {'Date': '2011-11-15',
  'Weather Type': 'Patchy rain possible',
  'Highest Temperature': '31',
  'Lowest Temperature': '23',
  'Wind Speed': '4'},
 {'Date': '2012-11-15',
  'Weather Type': 'Moderate or heavy rain shower',
  'Highest Temperature': '26',
  'Lowest Temperature': '23',
  'Wind Speed': '9'},
 {'Date': '2013-11-15',
  'Weather Type': 'Moderate rain',
  'Highest Temperature': '26',
  'Lowest Temperature': '23',
  'Wind Speed': '7'},
 {'Date': '2014-11-15',
  'Weather Type': 'Patchy rain possible',
  'Highest Temperature': '34',
  'Lowest Temperature': '24',
  'Wind Speed': '6'},
 {'Date': '2015-11-15',
  'Weather Type': 'Sunny',
  'Highest Temperature': '34',
  'Lowest Temperature': '25',
  'Wind Speed': '11'},
 {'Date': '2016-11-15',
  'Weather Type': 'Patchy rain possible',
  'Highest Temperature': '32',
  'Lowest Te

# <b>Lưu dữ liệu</b>