In [None]:
import concurrent.futures
from tqdm import tqdm
import os
import pandas as pd
import time
import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException

def scrape_weather_to_df(name, start_date=datetime.datetime(2025, 4, 2),
                         end_date=datetime.datetime(2025, 4, 5)):
    """Scrape weather-history rows for one location into a DataFrame.

    Opens ``https://www.worldweatheronline.com/{name}-weather-history/vn.aspx``
    in Chrome, then for each day in ``[start_date, end_date)`` fills in the
    past-date field, submits the form and reads rows from the results table.

    Args:
        name: Location slug inserted into the URL (e.g. ``"ha-noi"``).
        start_date: First day to request (inclusive). Defaults to 2025-04-02.
        end_date: Day at which to stop (exclusive). Defaults to 2025-04-05.

    Returns:
        A DataFrame with columns ``Date, Time, Weather, Temp, Rain, Cloud,
        Pressure, Wind, Gust``. Values are the text scraped from the page;
        the frame is empty (columns only) when nothing could be read.

    Raises:
        selenium.common.exceptions.WebDriverException: If the browser cannot
            be driven, the page cannot be loaded, or the results table is not
            found after submitting a date. The browser is closed in all cases.
    """
    record_keys = ['Time', 'Weather', 'Temp', 'Rain', 'Cloud', 'Pressure', 'Wind', 'Gust']
    columns = ["Date"] + record_keys

    # Chrome options that reduce resource usage.
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-notifications')
    options.add_argument('--blink-settings=imagesEnabled=false')  # Do not load images

    url = f"https://www.worldweatheronline.com/{name}-weather-history/vn.aspx"
    one_day = datetime.timedelta(days=1)
    data = []

    driver = webdriver.Chrome(options=options)
    # Everything after the driver is created runs under try/finally so the
    # browser process is released even if the initial page load fails.
    try:
        driver.get(url)

        # Best effort: accept the cookie banner if it shows up; its absence
        # is not an error, so a timeout here is deliberately ignored.
        try:
            allow_cookies = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyButtonAccept"))
            )
            allow_cookies.click()
            time.sleep(1)
        except (NoSuchElementException, TimeoutException):
            pass

        date = start_date
        while date < end_date:
            date_str = date.strftime('%Y-%m-%d')
            # Advance up front so every `continue` below moves to the next day.
            date += one_day

            try:
                input_date = WebDriverWait(driver, 7).until(
                    EC.presence_of_element_located((By.ID, 'ctl00_MainContentHolder_txtPastDate'))
                )
                # Set the value through JavaScript instead of typing into the field.
                driver.execute_script("arguments[0].value = arguments[1];", input_date, date_str)
                submit_date = driver.find_element(By.ID, 'ctl00_MainContentHolder_butShowPastWeather')
                submit_date.click()
            except WebDriverException:
                # The date form could not be used for this day; skip it.
                continue

            # Fixed pause to let the page update after the submit.
            time.sleep(1)

            # NOTE(review): absolute XPath is tied to the current page layout;
            # a missing table raises and aborts the whole scrape.
            tables = driver.find_element(
                By.XPATH,
                "/html/body/form/div[3]/section/div/div/div/div[3]/div[1]/div/div[3]/table/tbody",
            )

            # Presumably the first two rows are headers and the next eight
            # are the intraday readings -- TODO confirm against the page.
            for row in tables.find_elements(By.TAG_NAME, "tr")[2:10]:
                try:
                    cells = row.find_elements(By.CLASS_NAME, "days-details-row-item1")
                    rain = row.find_elements(By.CLASS_NAME, "days-rain-number")[0].text
                    weather = cells[1].find_element(By.TAG_NAME, "img").get_attribute("title")

                    data.append([
                        date_str,
                        cells[0].text.strip(),
                        weather,
                        cells[2].text.strip(),
                        rain,
                        cells[3].text.strip(),
                        cells[4].text.strip(),
                        cells[5].text.strip(),
                        cells[6].text.strip(),
                    ])
                except (IndexError, WebDriverException):
                    # Row does not have the expected cells (or went stale); skip it.
                    continue
    finally:
        driver.quit()

    return pd.DataFrame(data, columns=columns)

def process_location(name, output_dir):
    """Scrape one location and write its data to ``<output_dir>/<name>.csv``.

    Any exception raised while scraping or saving is caught and reported
    through the return value instead of being propagated.

    Args:
        name: Location slug passed to ``scrape_weather_to_df``.
        output_dir: Directory in which the CSV file is written.

    Returns:
        A ``(message, name, success)`` tuple, where ``message`` is the
        human-readable status line and ``success`` is a bool.
    """
    try:
        print(f"Đang crawl dữ liệu từ {name}")
        frame = scrape_weather_to_df(name)
        csv_path = os.path.join(output_dir, f"{name}.csv")
        frame.to_csv(csv_path, index=False)
        message = f"✅ Đã lưu dữ liệu {name} vào {csv_path}"
        succeeded = True
    except Exception as err:
        # Worker boundary: convert every failure into a status tuple.
        message = f" Lỗi khi crawl {name}: {err}"
        succeeded = False
    return message, name, succeeded

def main():
    """Crawl weather data for every configured location in parallel.

    Creates the output directory if needed, submits one ``process_location``
    task per location to a thread pool, prints each task's status message as
    it completes (with a tqdm progress bar), and finally prints a summary of
    how many locations succeeded and which ones failed.
    """
    output_dir = r"D:\hust_materials\DE\weather\Big_data_project\data_weather"
    os.makedirs(output_dir, exist_ok=True)

    name_tinh_process = ["ha-noi", "ho-chi-minh-city", "da-nang"]

    # Each worker drives its own Chrome instance.
    max_workers = 3

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit the tasks.
        future_to_name = {
            executor.submit(process_location, name, output_dir): name
            for name in name_tinh_process
        }

        # Show progress as tasks finish.
        with tqdm(total=len(name_tinh_process), desc=" Crawling weather data") as pbar:
            for future in concurrent.futures.as_completed(future_to_name):
                name = future_to_name[future]
                try:
                    message, location, success = future.result()
                    print(message)
                    results.append((location, success))
                except Exception as e:
                    # Defensive: the worker is expected to report errors via
                    # its return value, but never let one task stop the loop.
                    print(f" Lỗi với {name}: {e}")
                    results.append((name, False))
                pbar.update(1)

    # Summarize the collected results (previously gathered but never used).
    failed = [location for location, success in results if not success]
    print(f"✅ Đã crawl thành công: {len(results) - len(failed)}/{len(name_tinh_process)}")
    if failed:
        print(f"Các địa điểm bị lỗi: {', '.join(failed)}")
    
   
    

# Entry point: start the crawl only when this module is run directly
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Đang crawl dữ liệu từ ha-noi
Đang crawl dữ liệu từ ho-chi-minh
Đang crawl dữ liệu từ da-nang


🌤️ Crawling weather data:  33%|███▎      | 1/3 [00:42<01:25, 42.63s/it]

✅ Đã lưu dữ liệu ha-noi vào D:\hust_materials\DE\weather\Big_data_project\data_weather\ha-noi.csv


🌤️ Crawling weather data:  67%|██████▋   | 2/3 [00:44<00:18, 18.82s/it]

✅ Đã lưu dữ liệu ho-chi-minh vào D:\hust_materials\DE\weather\Big_data_project\data_weather\ho-chi-minh.csv


🌤️ Crawling weather data: 100%|██████████| 3/3 [00:47<00:00, 15.71s/it]

✅ Đã lưu dữ liệu da-nang vào D:\hust_materials\DE\weather\Big_data_project\data_weather\da-nang.csv

✅ Đã crawl thành công: 3/3



