# Facebook Pages and Ads scraping 

The goal of this notebook is :

- First, to find all the pages on Facebook dealing with a specific topic by scraping Facebook
- Then, to scrape all the ads published by these pages on the Facebook Ad Librabry

You'll need a facbook account to do so, you can either use yours or create one for this task.

The main libraries used are Selenium and BeautifulSoup. This project was strongly inspired by a Medium article : https://medium.com/@mackgrenfell/fixing-the-facebook-ad-library-part-i-scraping-can-save-it-6b737d04614c

In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time

# Scrape Facebook to get all the pages related to a topic

### Set the options so that the chrome driver doesn't ask for notifications settings 

In [2]:
option = Options()

option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")

# Pass the argument 1 to allow and 2 to block
option.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2 })

### Log into Facebook

In [3]:
searchDriver = webdriver.Chrome(options = option, executable_path=ChromeDriverManager().install())
searchDriver.get("https://facebook.com/")

# get rid of the cookie banner
searchDriver.find_element_by_xpath('//*[@data-testid="cookie-policy-banner-accept"]').click()

login = '...' # put your login email address
password = '...' # put your password

usernameBox = searchDriver.find_element_by_name('email')
usernameBox.send_keys(login)
passwordBox = searchDriver.find_element_by_name('pass')
passwordBox.send_keys(password)

try:
    loginBox = searchDriver.find_element_by_id('loginbutton')
except:
    loginBox = searchDriver.find_element_by_name('login')

time.sleep(5)
loginBox.click()

# If you get a click interception error, try to rerun the cell

[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/86.0.4240.22/chromedriver_mac64.zip


 


[WDM] - Driver has been saved in cache [/Users/theolanganay/.wdm/drivers/chromedriver/mac64/86.0.4240.22]


### Choose the subject of interest and load the related pages (useless if you want to focus on a single page)

In [4]:
# if your subject has several words, replace the spaces by &20 ex: 'italian cars' would be 'italian&20cars'

subject = 'electric&20bicycle'

# optional filters that removes the non verified pages. If you don't want to do so, remove the '+ opt_filter' in the last command
opt_filter = '&filters=eyJ2ZXJpZmllZCI6IntcIm5hbWVcIjpcInBhZ2VzX3ZlcmlmaWVkXCIsXCJhcmdzXCI6XCJcIn0ifQ%3D%3D'

time.sleep(5)
searchDriver.get('https://www.facebook.com/search/pages/?q='+ subject + opt_filter)

In [5]:
def scrollDown(driver, n):
    # scrolls n times to get the number of pages you want
    for _ in range(n):
        # Scroll down to the bottom. 
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        # Wait to load the page
        time.sleep(5)

In [None]:
# scroll 5 times
scrollDown(searchDriver, 5)

# get the html source code from the page
html = searchDriver.page_source

# parse the html code
soup = BeautifulSoup(html, 'html.parser')
mydivs_url = soup.findAll("a", {"class": "nc684nl6"})

### Store all the pages in a list (useless if you want to focus on a single page)

In [7]:
# list of tuples : (pages url, page name)
pages_fb = []

for div in mydivs_url:
    pages_fb += [[div["href"], div.text]]

In [8]:
pages_fb[:10]

[['https://www.facebook.com/parismarathon/',
  'Schneider Electric Marathon de Paris'],
 ['https://www.facebook.com/SchneiderElectricFR/', 'Schneider Electric'],
 ['https://www.facebook.com/electricguest/', 'Electric Guest'],
 ['https://www.facebook.com/Skryptom/', 'Skryptöm'],
 ['https://www.facebook.com/electricsixofficial/', 'Electric 6'],
 ['https://www.facebook.com/ElectricLightOrchestra/',
  'Electric Light Orchestra'],
 ['https://www.facebook.com/electriccars/', 'Electric Cars'],
 ['https://www.facebook.com/NissanElectric/', 'Nissan Electric'],
 ['https://www.facebook.com/electricwizarddorsetdoom/', 'Electric Wizard'],
 ['https://www.facebook.com/ElectricMoonOfficial/', 'Electric Moon']]

# Scrape the Facebook Ad Library to get the ads posted by the pages

### Open the Facebook Ad Library

In [9]:
# choose the right country
country = 'FR'

#
url = 'https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=' + country + '&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped'
driver = webdriver.Chrome(options = option, executable_path=ChromeDriverManager().install())
driver.get(url)
driver.find_element_by_xpath('//*[@data-testid="cookie-policy-banner-accept"]').click()    

[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [/Users/theolanganay/.wdm/drivers/chromedriver/mac64/86.0.4240.22/chromedriver] found in cache


 


### Option 1 : Scrape all the pages in the list of previously scraped pages 

In [10]:
def scrape_multiple(pages):
    htmls = {}
    for page in pages:
        page_name = page[1]
        #Convert pageID into a string if it isn’t already
        if type(page_name) == int:
            page_name = str(page_name)

        to_fill = driver.find_element_by_xpath("//input[@placeholder='Recherchez des publicités par nom de l’annonceur']")
        to_fill.send_keys(page_name)

        to_click = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CLASS_NAME, '_9ftp')))
        #to_click = driver.find_element_by_class_name('check_res_button')
        to_click.click()

        #Call the scroll_down function from Part I, to load all ads in the ad library
        scrollDown(driver, 2)

        #Retrieve the HTML of the fully loaded ad library page
        html = driver.page_source
        htmls[page_name] = html

        # go back and keep going
        driver.back()
        driver.find_element_by_xpath("//input[@placeholder='Recherchez des publicités par nom de l’annonceur']").clear()
    
    return(htmls)

In [11]:
html_code = scrape_multiple(pages_fb)

# If you get a click interception error, try to rerun the cell

### Option 2 : Scrape the ads of the single page we want to focus on

In [12]:
def scrape_single(page_name):
    htmls = {}
    #Convert pageID into a string if it isn’t already
    if type(page_name) == int:
        page_name = str(page_name)

    to_fill = driver.find_element_by_xpath("//input[@placeholder='Recherchez des publicités par nom de l’annonceur']")
    to_fill.send_keys(page_name)

    to_click = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CLASS_NAME, '_9ftp')))
    #to_click = driver.find_element_by_class_name('check_res_button')
    to_click.click()

    #Call the scroll_down function from Part I, to load all ads in the ad library. If the page is long, increase the number in the function
    scrollDown(driver, 3)

    #Retrieve the HTML of the fully loaded ad library page
    html = driver.page_source
    htmls[page_name] = html

    # go back and keep going
    driver.back()
    driver.find_element_by_xpath("//input[@placeholder='Recherchez des publicités par nom de l’annonceur']").clear()
    
    return(htmls)

### Create a DataFrame of the previously scraped ads

In [14]:
def create_ad_dic(html):
    
    all_ads = {}

    for key in html.keys():
        html_soup = BeautifulSoup(html[key], 'html.parser')
        ads_info = html_soup.findAll("div", {"class": "_9b9p _99s6"})

        ads = {}
        i = 0

        for ad in ads_info:
            ad_info = {}
            ad_info['status'] = ad.find("div", {"class": "_9cd2"}).text
            ad_info['date'] = ad.find("div", {"class": "_9cd3"}).text
            ad_info['text'] = ad.findAll("div", {"class": "_4ik4 _4ik5"})[1].text
            if ad.find("img", {"class": "_7jys img"}) != None :
                ad_info['type'] = 'single'
                ad_info['image'] = ad.find("img", {"class": "_7jys img"})['src']
            elif ad.find("img", {"class": "_7jys _7jyt img"}) != None:
                images = ad.findAll("img", {"class": "_7jys _7jyt img"})
                ad_info['type'] = 'multiple'
                ad_info['image'] = []
                for image in images:    
                    ad_info['image'].append(image['src'])
            else:
                ad_info['type'] = 'video'
                ad_info['image'] = ad.find("div", {"class": "_8o0a _8o0b"}).find("video")['src']
            ads[i] = ad_info
            i += 1

        all_ads[key] = ads
    
    return(all_ads)

In [15]:
ad_dic = create_ad_dic(html_code)

In [16]:
def create_ad_df(ads_dic):
    dic_df = {}
    i = 0
    for page in ads_dic:
        for ad in ads_dic[page]:
            dic_df[i] = list(ads_dic[page][ad].values())
            dic_df[i].insert(0, page)
            i += 1
    df = pd.DataFrame.from_dict(dic_df, orient = 'index', columns = ['Page', 'Status', 'Date', 'Text', 'Type','Image'])
    df.Date = df.Date.apply(lambda x : x.replace('Début de diffusion le ', ''))
    
    return(df)

In [17]:
ad_df = create_ad_df(ad_dic)

In [20]:
ad_df

Unnamed: 0,Page,Status,Date,Text,Type,Image
0,Electric 6,Actif,28 oct 2020,Official Super73 dealer in The Netherlands and...,multiple,[https://scontent.fcdg1-1.fna.fbcdn.net/v/t39....
1,Electric 6,Actif,29 oct 2020,Officieel dealer van WATT. Showroom in Antwerp...,multiple,[https://scontent.fcdg1-1.fna.fbcdn.net/v/t39....
2,Electric 6,Actif,28 oct 2020,Official Onewheel dealer for The Netherlands a...,multiple,[https://scontent.fcdg1-1.fna.fbcdn.net/v/t39....
3,9ELECTRIC,Actif,28 oct 2020,Official Super73 dealer in The Netherlands and...,multiple,[https://scontent.fcdg1-1.fna.fbcdn.net/v/t39....
4,9ELECTRIC,Actif,29 oct 2020,Officieel dealer van WATT. Showroom in Antwerp...,multiple,[https://scontent.fcdg1-1.fna.fbcdn.net/v/t39....
5,9ELECTRIC,Actif,28 oct 2020,Official Onewheel dealer for The Netherlands a...,multiple,[https://scontent.fcdg1-1.fna.fbcdn.net/v/t39....
6,ShargeMe Electric Vehicles,Actif,21 aoû 2019,Imagine if Electric Vehicles could Profitably ...,single,https://scontent.fcdg1-1.fna.fbcdn.net/v/t39.1...
7,Electric Daisy Carnival - EDC Portugal,Actif,17 oct 2020,"If you've not purchased yours yet, don't sleep...",video,https://video.fcdg1-1.fna.fbcdn.net/v/t42.1790...
