# 01_web_crawling.ipynb
**Author: Hoang Ngoc Anh**

This notebook collects social media data from Facebook and Instagram for three beverage brands (Coke, Pepsi, and Fanta) and sets up the YouTube Data API client; YouTube data collection itself is not yet implemented.
- **Period**: November 1, 2024 – March 31, 2025  
- **Output**: Saved to `data/` directory

In [None]:
# Install necessary libraries
!pip install requests beautifulsoup4 pandas google-api-python-client
!pip install facebook-scraper instaloader

In [None]:
# Import libraries
import os
from datetime import datetime

import instaloader
import pandas as pd
from facebook_scraper import get_posts
from googleapiclient.discovery import build

## Collecting Data from Facebook (public page posts)

In [None]:
# Define brand Facebook pages (brand label -> public page handle)
brands = {
    "Coke": "TCCCVN",
    "Pepsi": "Pepsivietnam",
    "Fanta": "fantavietnam"
}

# Collection window: November 1, 2024 through March 31, 2025, inclusive.
# The upper bound is the *exclusive* midnight of April 1 so that posts published
# at any time on March 31 are kept; comparing with `<= datetime(2025, 3, 31)`
# would drop everything after 00:00 on that last day.
PERIOD_START = datetime(2024, 11, 1)
PERIOD_END_EXCLUSIVE = datetime(2025, 4, 1)

# One record per in-window post, across all brand pages.
facebook_data = []
for brand, page in brands.items():
    # pages=100 caps how many result pages are requested per brand page.
    for post in get_posts(page, pages=100):
        # Skip posts without a timestamp before comparing against the window.
        if post['time'] and PERIOD_START <= post['time'] < PERIOD_END_EXCLUSIVE:
            facebook_data.append({
                'brand': brand,
                'time': post['time'],
                'text': post['text'],
                'likes': post['likes'],
                'shares': post['shares'],
                'comments': post['comments']
            })

df_fb = pd.DataFrame(facebook_data)
# `to_csv` does not create missing directories, so make sure `data/` exists.
os.makedirs('data', exist_ok=True)
df_fb.to_csv('data/facebook_data.csv', index=False)
df_fb.head()

## Collecting Data from Instagram (using Instaloader - public profile only)

In [None]:
# Collection window: November 1, 2024 through March 31, 2025, inclusive.
# The upper bound is the *exclusive* midnight of April 1 so that posts published
# at any time on March 31 are kept; comparing with `<= datetime(2025, 3, 31)`
# would drop everything after 00:00 on that last day.
PERIOD_START = datetime(2024, 11, 1)
PERIOD_END_EXCLUSIVE = datetime(2025, 4, 1)

L = instaloader.Instaloader()
# One record per in-window post, across all profiles.
insta_data = []
# Note: the 'brand' field below stores the Instagram username from this list.
for brand in ["cocacola_vn", "pepsivietnam", "fantavietnam"]:
    profile = instaloader.Profile.from_username(L.context, brand)
    for post in profile.get_posts():
        # NOTE(review): this comparison assumes `post.date` is a timezone-naive
        # datetime — confirm against the installed instaloader version.
        if PERIOD_START <= post.date < PERIOD_END_EXCLUSIVE:
            insta_data.append({
                'brand': brand,
                'date': post.date,
                'caption': post.caption,
                'likes': post.likes,
                'comments': post.comments
            })

df_insta = pd.DataFrame(insta_data)
# `to_csv` does not create missing directories, so make sure `data/` exists.
os.makedirs('data', exist_ok=True)
df_insta.to_csv('data/instagram_data.csv', index=False)
df_insta.head()

## Collecting Data from YouTube (requires API Key)

In [None]:
# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable so
# the real key never has to be saved inside the notebook. The 'YOUR_API_KEY'
# placeholder is kept as a fallback for backward compatibility; it is not a
# usable key and must be replaced (or the variable set) before sending requests.
api_key = os.environ.get('YOUTUBE_API_KEY', 'YOUR_API_KEY')
# Build the YouTube Data API v3 client, authenticated with the developer key.
youtube = build('youtube', 'v3', developerKey=api_key)
# Further implementation with pagination and video search can be developed later