# 📘 01_crawl_script.ipynb

Crawl Facebook posts for Coke, Pepsi, and Fanta from November 2024 to March 2025.

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from datetime import datetime

In [None]:
def crawl_facebook_posts(page_url, brand_name, scroll_times=30):
    """Collect post text and timestamps from a Facebook page.

    Opens ``page_url`` in a headless browser, scrolls to the bottom of the
    page ``scroll_times`` times so that more posts get loaded, then reads the
    message and date elements that are present in the DOM.

    Args:
        page_url: URL of the Facebook page to open.
        brand_name: Label stored in the ``'brand'`` field of every record.
        scroll_times: Number of scroll-to-bottom steps (2 s pause after each).

    Returns:
        A list of dicts with the keys ``'brand'``, ``'post_date'`` and
        ``'content'``. ``'post_date'`` is a ``'%Y-%m-%d %H:%M:%S'`` string in
        UTC, or ``"N/A"`` when the timestamp could not be read or parsed.
    """
    # Imported locally so this cell brings in the extra names it needs itself.
    from datetime import timezone
    from selenium.common.exceptions import WebDriverException

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    # NOTE(review): an Edge driver binary is handed to webdriver.Chrome here;
    # confirm which browser is intended and keep driver and class in sync.
    service = Service("msedgedriver.exe")  # or chromedriver.exe if you use Chrome

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(page_url)
        time.sleep(5)  # give the initial page load time to finish

        for _ in range(scroll_times):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # wait for the next batch of posts to load

        elements = driver.find_elements(By.XPATH, '//div[@data-ad-preview="message"]')
        dates = driver.find_elements(By.XPATH, '//a[contains(@href,"/posts/")]/abbr')

        # NOTE(review): messages and dates are paired purely by position;
        # presumably both lists come back in the same post order - verify.
        # zip() stops at the shorter list, so unmatched extras are ignored.
        posts = []
        for message_el, date_el in zip(elements, dates):
            content = message_el.text
            try:
                # presumably "data-utime" holds epoch seconds - TODO confirm
                timestamp = date_el.get_attribute("data-utime")
                post_time = datetime.fromtimestamp(
                    int(timestamp), tz=timezone.utc
                ).strftime('%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError, OverflowError, OSError, WebDriverException):
                # Missing/non-numeric attribute, out-of-range value, or the
                # element could not be read: keep the post, mark the date.
                post_time = "N/A"

            posts.append({
                'brand': brand_name,
                'post_date': post_time,
                'content': content
            })

        return posts
    finally:
        # Always shut the browser down, even if crawling raised.
        driver.quit()

In [None]:
# Brand label -> Facebook page URL to crawl (insertion order is crawl order).
pages = dict([
    ("Coke", "https://www.facebook.com/TCCCVN"),
    ("Pepsi", "https://www.facebook.com/Pepsivietnam"),
    ("Fanta", "https://www.facebook.com/fantavietnam"),
])

In [None]:
# Crawl every configured page in turn and pool the records in one flat list.
all_posts = []
for brand, url in pages.items():
    print(f"⏳ Crawling {brand}...")
    all_posts += crawl_facebook_posts(url, brand)

In [None]:
# Build the table with explicit columns so an empty crawl still yields a
# frame that has 'post_date' (otherwise the next line raises KeyError).
df = pd.DataFrame(all_posts, columns=['brand', 'post_date', 'content'])

# "N/A" and any other unparseable date strings become NaT.
df['post_date'] = pd.to_datetime(df['post_date'], errors='coerce')

# Keep November 2024 through March 2025. The upper bound is exclusive at
# 1 April so posts made at any time on 31 March are kept; comparing with
# <= "2025-03-31" would cut off at midnight at the start of that day.
# NaT rows compare False on both sides and are dropped here as well.
in_window = (df['post_date'] >= "2024-11-01") & (df['post_date'] < "2025-04-01")
df = df[in_window].dropna()

In [None]:
# Write one CSV per brand; each file holds only that brand's rows.
for brand_label, csv_path in (
    ("Coke", "../data/crawl_coke_data.csv"),
    ("Pepsi", "../data/crawl_pepsi_data.csv"),
    ("Fanta", "../data/crawl_fanta_data.csv"),
):
    brand_rows = df[df['brand'] == brand_label]
    brand_rows.to_csv(csv_path, index=False)

In [None]:
# Final status line once the preceding cells have run.
done_message = "✅ Data crawled and saved for all brands."
print(done_message)