# 📥 01_crawl_script.ipynb

Collect Facebook post data from the official pages of Coca-Cola, Pepsi, and Fanta in Vietnam
for the period from **November 2024 to March 2025**.

> ❗ Note: This notebook calls the Facebook Graph API v19.0 and requires a valid access token (set `ACCESS_TOKEN` in the configuration cell below).

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta

In [None]:
# Crawl settings
# Brand label (used in the output rows) -> Facebook page name used in the API URL
BRANDS = dict(
    CocaCola='TCCCVN',
    Fanta='fantavietnam',
    Pepsi='Pepsivietnam',
)

# Date window sent to the API as the since/until parameters
START_DATE, END_DATE = '2024-11-01', '2025-03-31'

# Placeholder value: replace with a real token before running the crawl
ACCESS_TOKEN = 'YOUR_FACEBOOK_ACCESS_TOKEN'

In [None]:
# Helper function to get posts for one page
def get_facebook_posts(page_name, start_date, end_date, timeout=30):
    """Fetch the posts of one Facebook page within a date range.

    Requests the Graph API v19.0 ``/{page_name}/posts`` edge and keeps
    following the ``paging.next`` link of each response until no further
    page is advertised.

    Args:
        page_name: Page identifier substituted into the request URL.
        start_date: Value sent as the ``since`` query parameter.
        end_date: Value sent as the ``until`` query parameter.
            NOTE(review): a bare date such as '2025-03-31' is presumably
            read as midnight at the start of that day, which would leave
            out posts published on the end date itself -- confirm against
            the Graph API documentation.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of post dicts, exactly as they appear in the ``data`` array
        of each response page. If a response carries no ``data`` key,
        paging stops and the posts collected so far are returned.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when a
            request fails at the network level or exceeds ``timeout``.
    """
    url = f"https://graph.facebook.com/v19.0/{page_name}/posts"
    params = {
        'access_token': ACCESS_TOKEN,
        'fields': 'id,message,created_time,shares,likes.summary(true),comments.summary(true)',
        'since': start_date,
        'until': end_date,
        'limit': 100
    }

    posts = []
    while True:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, params=params, timeout=timeout).json()
        if 'data' not in response:
            # Surface the API's own error payload so a rejected request is
            # distinguishable from a page that simply has no posts.
            error = response.get('error') if isinstance(response, dict) else None
            if error:
                print(f"Graph API error for {page_name}: {error}")
            break
        posts.extend(response['data'])
        next_url = response.get('paging', {}).get('next')
        if not next_url:
            break
        url = next_url
        params = {}  # next_url already has params embedded

    return posts

In [None]:
# Build one flat list of row dicts covering every brand's posts
all_data = []

for brand, page_name in BRANDS.items():
    print(f"Fetching posts for {brand}...")
    posts = get_facebook_posts(page_name, START_DATE, END_DATE)
    # Engagement counts are nested in the API response; a missing level
    # falls back to 0 rather than failing.
    all_data += [
        dict(
            brand=brand,
            id=post.get('id'),
            message=post.get('message', ''),
            created_time=post.get('created_time'),
            likes=post.get('likes', {}).get('summary', {}).get('total_count', 0),
            comments=post.get('comments', {}).get('summary', {}).get('total_count', 0),
            shares=post.get('shares', {}).get('count', 0),
        )
        for post in posts
    ]

In [None]:
# Save to CSV
# Naming the columns up front keeps the frame well-formed when no posts were
# collected (e.g. the API rejected the token); without it an empty frame has
# no 'created_time' column and the conversion below raises KeyError.
df = pd.DataFrame(
    all_data,
    columns=['brand', 'id', 'message', 'created_time', 'likes', 'comments', 'shares'],
)
# Parse the timestamp strings returned by the API into datetime values
df['created_time'] = pd.to_datetime(df['created_time'])
df.to_csv("../data/facebook_posts.csv", index=False)
# Last expression of the cell: shows a preview of the first rows
df.head()