# 📥 01_crawl_script.ipynb

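This notebook automatically crawls public Facebook posts from the three official Vietnamese brand pages of **Coca-Cola**, **Pepsi**, and **Fanta** using Selenium, then saves the date-filtered posts as one CSV file per brand.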

In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from datetime import datetime
import os

In [2]:
# Define crawling function
def crawl_facebook_posts(page_url, brand_name, scroll_times=30, driver_path='msedgedriver.exe'):
    """Collect the posts rendered on a public Facebook page after scrolling.

    Opens ``page_url`` in a headless browser session, scrolls to the bottom of
    the page ``scroll_times`` times so more posts are loaded, then reads every
    post message and its timestamp element from the DOM.

    Args:
        page_url: URL of the Facebook page to open.
        brand_name: Label stored in the ``'brand'`` field of every returned post.
        scroll_times: Number of scroll-to-bottom steps (2 second pause after each).
        driver_path: Path to the WebDriver executable handed to ``Service``.
            Defaults to the path that was previously hard-coded.

    Returns:
        A list of dicts with keys ``'brand'``, ``'post_date'`` (UTC time formatted
        as ``'%Y-%m-%d %H:%M:%S'``, or ``None`` when the timestamp could not be
        read or parsed) and ``'content'`` (the post text).

    Raises:
        Any exception raised while loading or reading the page propagates to
        the caller; the browser session is closed first.
    """
    # Local import so the block does not depend on the notebook's import cell.
    from datetime import timezone

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # NOTE(review): the default path names the Edge driver binary while the
    # session is created through webdriver.Chrome — confirm this pairing works
    # in your environment, or pass a chromedriver path via `driver_path`.
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    posts = []
    try:
        driver.get(page_url)
        time.sleep(5)  # give the initial page load time to finish

        for _ in range(scroll_times):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # wait for lazily loaded posts to render

        messages = driver.find_elements(By.XPATH, '//div[@data-ad-preview="message"]')
        dates = driver.find_elements(By.XPATH, '//a[contains(@href,"/posts/")]/abbr')

        # Messages and timestamps are paired by position; zip stops at the
        # shorter list. Assumes both lists follow the same post order — TODO confirm.
        for message, date_element in zip(messages, dates):
            content = message.text
            try:
                timestamp = date_element.get_attribute("data-utime")
                post_time = datetime.fromtimestamp(
                    int(timestamp), tz=timezone.utc
                ).strftime('%Y-%m-%d %H:%M:%S')
            except Exception:
                # Best effort: keep the post even when its timestamp is missing
                # or malformed, without swallowing KeyboardInterrupt/SystemExit.
                post_time = None
            posts.append({
                'brand': brand_name,
                'post_date': post_time,
                'content': content
            })
    finally:
        # Always release the browser, even when loading or scraping failed.
        driver.quit()

    return posts

In [3]:
# Facebook pages to crawl: brand label -> public page URL.
# The label is what the crawl loop passes on as the brand name of each post.
pages = dict(
    Coke='https://www.facebook.com/TCCCVN',
    Pepsi='https://www.facebook.com/Pepsivietnam',
    Fanta='https://www.facebook.com/fantavietnam',
)

In [4]:
# Crawl every configured brand page and pool the posts into one flat list.
all_data = []

for brand in pages:
    url = pages[brand]
    print(f"🚀 Crawling: {brand}")
    # Each call opens its own browser session and returns a list of post dicts.
    brand_posts = crawl_facebook_posts(url, brand)
    all_data += brand_posts

In [5]:
# Convert to DataFrame
# Explicit columns keep the frame well-formed even when the crawl returned no
# posts; a column-less frame would raise KeyError on df['post_date'] below.
df = pd.DataFrame(all_data, columns=['brand', 'post_date', 'content'])
# Unparseable or missing timestamps become NaT instead of raising.
df['post_date'] = pd.to_datetime(df['post_date'], errors='coerce')
# Keep posts from 2024-11-01 through the end of 2025-03-31. post_date carries a
# time of day, so the upper bound is the exclusive start of the next day:
# `<= '2025-03-31'` compares against midnight and would drop nearly all posts
# made on the last day of the window.
df = df[(df['post_date'] >= '2024-11-01') & (df['post_date'] < '2025-04-01')]
df = df.dropna(subset=['post_date', 'content'])
df.head()

In [6]:
# Write one CSV per brand into ../data (created on demand).
os.makedirs("../data", exist_ok=True)

# Brand label as stored in the 'brand' column -> destination file.
csv_targets = {
    'Coke': "../data/crawl_coke_data.csv",
    'Pepsi': "../data/crawl_pepsi_data.csv",
    'Fanta': "../data/crawl_fanta_data.csv",
}
for brand_label, csv_path in csv_targets.items():
    brand_rows = df[df['brand'] == brand_label]
    brand_rows.to_csv(csv_path, index=False)

print("✅ Data saved to /data")