## Helpers functions needed to work with Facebook endpoint

In [1]:
import requests, re, pandas as pd, numpy as np
from itertools import product
from bs4 import BeautifulSoup as bs

In [2]:
def request(method='GET', full_url='', resource='', q={}):
    """
    resource: Endpoint on FB Graph API
    q: Access Token, Parameters and Fields according to FB Graph API documentation for specified endpoint
    """
    response = None
    if method == 'GET':
        if full_url:
            response = requests.get(full_url)
        if resource:
            response = requests.get(f'https://graph.facebook.com/v13.0/{resource}', q)
        if response.status_code==400:
            print(f'Error Status Code 400 for URL: {response.url}')
            return response
    return response.json()


def read_endpoint(id_, endpoint, access_token, fields=[], params={}, n=10000):
    """
    read any endpoint for Facebook Graph API
    
    id_: identifier for any of Business, Account, Campaign, Ad Set, Ad, Creative, Ad Image, Ad Library
    endpoint: endpoint for any of the above according to Facebook Graph API
    access_token: required for usage of Facebook Graph API
    fields: fields to be read for the specified endpoint
    params: parameters to be passed for the specified endpoint
    n: number of desired results to be returned
    """
    q = dict({'access_token': access_token, 'fields': ','.join(fields)}, **params)
    if id_ == '':
        response = request(method='GET', resource=endpoint, q=q)
    else:
        response = request(method='GET', resource=f'{id_}/{endpoint}', q=q)
    if endpoint == '':
        return response
    
    data = []
    while True:
        try:
            data.extend(response['data'])
            if 'next' not in response['paging'].keys():
                break
            if len(data) >= n:
                break
            response = request(method='GET', full_url=response['paging']['next'])
        except Exception as e:
            break # not accessible
    return data

In [3]:
ACCESS_TOKEN = 'EAAEau61SMogBAKXf5ZCiYZC2WkmfCAkZCEz1cQoZBTgg4AdxRAAmG5ZAcH8hrSEsXPDZBPFodOdKr7s49i4ZBecZAZBnnV86z5HMCZAaYp1s2WoUj5P5PzYXTX6Gao2awkZBBudMZBubxwlG3UeRUZBCsBTb5GCeHbAsJFdf0Bal6Rpou4ORNt56cpVpK'

## Ad Library API endpoint documentation:
- Parameters: https://developers.facebook.com/docs/graph-api/reference/ads_archive/#parameters
- Fields: https://developers.facebook.com/docs/marketing-api/reference/archived-ad/#fields

In [43]:
# fields available to read as defined by documentation
FIELDS = ['id','ad_creation_time','ad_delivery_start_time','ad_delivery_stop_time',
          'ad_snapshot_url','ad_creative_bodies','ad_creative_link_captions','ad_creative_link_titles','ad_creative_link_descriptions',
          'languages',
          'page_id','page_name','bylines','currency','spend','impressions','estimated_audience_size','publisher_platforms',
          'demographic_distribution','delivery_by_region']

In [44]:
# parameters to use when reading the endpoint - as you can see 'search_terms' (related to what), 'media_type' (what kind of ads), 
# 'ad_reached_counties' (ads of where), etc

params = {'search_terms': '',
          'search_type': 'KEYWORD_UNORDERED',
          'media_type': 'IMAGE',
          'ad_active_status': 'ALL',
          'ad_reached_countries': ['US']}

In [45]:
def obtain_ads(search_term, country, fields, access_token):
    """
    a function to obtain ads from Ad Library based on given 'search_term' and 'country'
    """
    params = {'search_terms': search_term,
              'search_type': 'KEYWORD_UNORDERED',
              'media_type': 'IMAGE',
              'ad_active_status': 'ALL',
              'ad_reached_countries': [country]}
    
    ads = read_endpoint('', 'ads_archive', access_token, fields, params, 30000)
    
    for ad in ads: # add additional information
        ad['topic'] = search_term 
        ad['country'] = country
    return ads

### Example

In [46]:
# countries = ['US', 'GB', 'BR'] # all available countries for now
# themes = ['ad-tech','ed-tech','agri-tech','fin-tech','bio-tech','health-tech','electric-vehicles','clean-energy',
#           'cloud-computing','quantum-computing','artificial-intelligence','machine-learning','digital-assets',
#           'virtual-reality','cyber-security','software','social-trends','analytics']

# all_ads = []
# for ct in product(themes, countries):
    
#     print(f'Retrieving Ads for: {ct}')
#     ads = obtain_ads(ct[0], ct[1], FIELDS, ACCESS_TOKEN)
#     all_ads.extend(ads)
    
# print(f'Retrieved {len(all_ads)} ads')

In [47]:
from itertools import product

countries = ['US'] # country the search the ads in
search_terms = ['ad_tech'] # search term for the relevant ads

all_ads = []
for ct in product(search_terms, countries):
    
    print(f'Retrieving Ads for: {ct}')
    ads = obtain_ads(ct[0], ct[1], FIELDS, ACCESS_TOKEN)
    all_ads.extend(ads)

print(f'Retrieved {len(all_ads)} ads')

Retrieving Ads for: ('ad_tech', 'US')
Retrieved 337 ads


In [None]:
# raw_dir = f'dataset/ad_library/raw/{str(datetime.today().date()).replace("-","")}'
# if not os.path.exists(raw_dir):
#     os.makedirs(raw_dir)

# # helpers.save_pickle(all_ads, f'{raw_dir}/adlibrary_ads.pkl')

In [51]:
# all_ads = helpers.load_pickle(f'{raw_dir}/adlibrary_ads.pkl')

In [48]:
def get_image_url(source):
    """
    function to extract image_url from the retreived data from Ad Library API
    Why?: the URL obtained via the endpoint is not an image_url but instead a url for a snapshot of the entire ad
    """
    pattern = r'"resized_image_url":"(?:\\.|[^"\\])*"' # find resized_image_url : <image_url>
    img_url = re.findall(pattern, str(source))[0].split('"resized_image_url":')[1].replace('\\', '').replace('"', '').replace("'", '')
    return img_url

def process_ads(ads):
    """
    processes Ads retrieved from Ads Archive (FB Ad Library)
    """
    df = pd.DataFrame(ads)
    clean = []
    for _, row in df.iterrows():
        m = {}
        m['search_term'] = row['topic']
        m['country'] = row['country']
        m['page_id'] = row['page_id']
        m['page_name'] = row['page_name']
        
        m['ad_id'] = row['id']
        m['ad_creation_date'] = row['ad_creation_time']
        m['delivery_start'] = row['ad_delivery_start_time']
        m['delivery_stop'] = row['ad_delivery_stop_time']
        m['ad_url'] = row['ad_snapshot_url']
        
#         try:
#             source = soup(requests.get(row['ad_snapshot_url']).content, 'html.parser')
#             m['image_url'] = get_image_url(source)
#         except:
#             m['image_url'] = np.nan # some ads are wrongly classified as having images
        
        ad_body = row['ad_creative_bodies']
        m['ad_body'] = ad_body[0] if (type(ad_body) == list) else ''
        m['uses_multi_body'] = 1 if ((type(ad_body) == list) and (len(set(ad_body)) > 1)) else 0
        
        link_capt = row['ad_creative_link_captions']
        m['link_caption'] = link_capt[0] if (type(link_capt) == list) else ''
        m['uses_multi_capt'] = 1 if ((type(link_capt) == list) and (len(set(link_capt)) > 1)) else 0
        
        link_title = row['ad_creative_link_titles']
        m['link_title'] = link_title[0] if (type(link_title) == list) else ''
        m['uses_multi_title'] = 1 if ((type(link_title) == list) and (len(set(link_title)) > 1)) else 0
        
        link_desc = row['ad_creative_link_descriptions']
        m['link_description'] = link_desc[0] if (type(link_desc) == list) else ''
        m['uses_multi_desc'] = 1 if ((type(link_desc) == list) and (len(set(link_desc)) > 1)) else 0
        
        lang = row['languages']
        m['language'] = ','.join(lang) if (type(lang) == list) else ''
        m['uses_multi_lang'] = 1 if ((type(lang) == list) and (len(set(lang)) > 1)) else 0
        
        m['funded_by'] = row['bylines'] if type(row['bylines']) == str else ''
        m['currency'] = row['currency']
        
        spend = row['spend']
        m['min_spend'] = spend.get('lower_bound', np.nan)
        m['max_spend'] = spend.get('upper_bound', np.nan)
        
        impressions = row['impressions']
        m['min_impressions'] = spend.get('lower_bound', np.nan)
        m['max_impressions'] = spend.get('upper_bound', np.nan)
        
        audience_size = row['estimated_audience_size']
        m['min_audience_size'] = audience_size.get('lower_bound', np.nan) if type(audience_size)==dict else np.nan
        m['max_audience_size'] = audience_size.get('upper_bound', np.nan) if type(audience_size)==dict else np.nan
        
        platforms = row['publisher_platforms']
        m['facebook'] = 1 if ((type(platforms) == list) and ('facebook' in platforms)) else 0
        m['instagram'] = 1 if ((type(platforms) == list) and ('instagram' in platforms)) else 0
        m['messenger'] = 1 if ((type(platforms) == list) and ('messenger' in platforms)) else 0
        m['audience_network'] = 1 if ((type(platforms) == list) and ('audience_network' in platforms)) else 0
        
        m['demographic_dist'] = row['demographic_distribution']
        clean.append(m)
        
    return pd.DataFrame(clean)

In [49]:
df = process_ads(all_ads)

### Example of what the dataset would rougly look like

In [50]:
df

Unnamed: 0,search_term,country,page_id,page_name,ad_id,ad_creation_date,delivery_start,delivery_stop,ad_url,ad_body,...,max_spend,min_impressions,max_impressions,min_audience_size,max_audience_size,facebook,instagram,messenger,audience_network,demographic_dist
0,ad_tech,US,100470349235347,Save Texas,762749274711708,2022-05-03,2022-05-03,2022-05-06,https://www.facebook.com/ads/archive/render_ad...,In a technical report updated for the first ti...,...,99,0,99,50001,100000,1,0,0,0,"[{'percentage': '0.004551', 'age': '18-24', 'g..."
1,ad_tech,US,100470349235347,Save Texas,563054608471314,2022-05-03,2022-05-03,2022-05-06,https://www.facebook.com/ads/archive/render_ad...,In a technical report updated for the first ti...,...,99,0,99,100001,500000,1,0,0,0,"[{'percentage': '0.003058', 'age': '45-54', 'g..."
2,ad_tech,US,430964644347524,Campaign Ad-Cloud,930566127614072,2022-05-03,2022-05-03,2022-05-08,https://www.facebook.com/ads/archive/render_ad...,Primaries happening NOW... but it's not too la...,...,99,0,99,1001,5000,1,1,0,0,"[{'percentage': '0.001969', 'age': '25-34', 'g..."
3,ad_tech,US,143542762946146,Marcus Murphy,738864343939155,2022-04-07,2022-04-07,2022-04-22,https://www.facebook.com/ads/archive/render_ad...,Lawyer-Murphy’s FactWars.com reveals Criminal-...,...,99,0,99,10001,50000,1,0,0,0,"[{'percentage': '0.000834', 'age': '45-54', 'g..."
4,ad_tech,US,108510735095059,Dr. Jennifer Wilson for City Council District 5,512240050437223,2022-04-07,2022-04-07,2022-04-25,https://www.facebook.com/ads/archive/render_ad...,The voters of District 5 deserve a representat...,...,399,300,399,1001,5000,1,1,0,0,"[{'percentage': '0.000944', 'age': '25-34', 'g..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,ad_tech,US,462716750781078,Californians for Consumer Privacy,470872100010947,2018-05-23,2018-05-23,2018-05-30,https://www.facebook.com/ads/archive/render_ad...,"""Facebook argues that the information it gathe...",...,99,0,99,,,1,1,1,0,"[{'percentage': '7.8E-5', 'age': '13-17', 'gen..."
333,ad_tech,US,462716750781078,Californians for Consumer Privacy,2048542128550310,2018-05-23,2018-05-23,2018-05-29,https://www.facebook.com/ads/archive/render_ad...,"""Facebook argues that the information it gathe...",...,99,0,99,,,1,1,1,0,"[{'percentage': '0.000128', 'age': '13-17', 'g..."
334,ad_tech,US,462716750781078,Californians for Consumer Privacy,958799950911432,2018-05-23,2018-05-23,2018-05-30,https://www.facebook.com/ads/archive/render_ad...,"""Facebook argues that the information it gathe...",...,99,0,99,,,1,1,1,0,"[{'percentage': '0', 'age': '13-17', 'gender':..."
335,ad_tech,US,462716750781078,Californians for Consumer Privacy,406519593159633,2018-05-21,2018-05-21,2018-05-23,https://www.facebook.com/ads/archive/render_ad...,"""Facebook argues that the information it gathe...",...,199,100,199,,,1,1,0,0,"[{'percentage': '0.000356', 'age': '18-24', 'g..."
