## Data Collection
This notebook shows how to access the "PlacementSuggestionService" API.
<img src="../data/media/google_ad_portal_youtube_ad_placements.png">

In [2]:
%load_ext autoreload
%autoreload 2

In [9]:
import glob
import json
import os
import random
import string
import time

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm

from terms import category2terms

In [18]:
# Create a shared request session that retries transient failures with backoff.
s = requests.Session()
retries = Retry(total=5,
                backoff_factor=2,
                status_forcelist=[500, 501, 502, 503, 504])
# The API endpoint is served over HTTPS, so the retry adapter has to be mounted
# for 'https://' too; mounting it only for 'http://' leaves the actual API
# calls without any retry policy.
# NOTE(review): urllib3's Retry applies status_forcelist only to its default
# (idempotent) method set, which does not include POST -- confirm whether POST
# should be allowed explicitly for the installed urllib3 version.
retry_adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', retry_adapter)
s.mount('https://', retry_adapter)

In [3]:
# Output: root folder where the raw API responses are saved.
# The query loop below creates one sub-folder per keyword category inside it.
DATA_OUT = '../data/input/placements_api'

## Finding the API
The following `headers`, `params` and `data` are from "ads.google.com".

We obtained them by opening DevTools (in Chrome) and listening for network requests while filling out searches for video-based ad placements. The network request was copied as a `cURL` command and converted to a Python request using https://curl.trillworks.com.

You must repeat this step if you wish to use the API (the params here are no longer valid).

Note: We re-write the "`__ar`" data argument for each new keyword.

In [17]:
# Request headers copied from a DevTools capture of an ads.google.com session.
# The session cookie and the XSRF token are credentials, so they are read from
# the environment rather than being stored in the notebook. Before starting the
# kernel, set GOOGLE_ADS_COOKIE and GOOGLE_ADS_XSRF_TOKEN to the values from
# your own captured request; if they are unset, empty strings are sent.
headers = {
    'authority': 'ads.google.com',
    'x-same-domain': '1',
    'dnt': '1',
    'x-framework-xsrf-token': os.environ.get('GOOGLE_ADS_XSRF_TOKEN', ''),
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'build-version': 'v1611596777',
    'content-type': 'application/x-www-form-urlencoded',
    'accept': '*/*',
    'origin': 'https://ads.google.com',
    'x-client-data': 'CK21yQEIhLbJAQiltskBCMG2yQEIqZ3KAQiWrMoBCMbCygEI98fKAQikzcoBCNzVygEIv4jLAQjsmMsBCJOaywEIzZrLAQjXm8sBCKidywEI4Z3LARj5uMoBGKqbywE=',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://ads.google.com/aw/campaigns/new/video?ocid=355895228&cmpnInfo=%7B%221%22%3A4%2C%228%22%3A%22a58AAE3A1-154A-42D2-8C67-DF2356BF8AF2--6%22%2C%2219%22%3A%7B%224%22%3A1%7D%7D&euid=354893925&__u=1802067325&uscid=355895228&__c=4335588572&authuser=0&subid=us-en-et-g-aw-c-home-awhp_xin1_signin%21o2',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': os.environ.get('GOOGLE_ADS_COOKIE', ''),
}

# URL query-string parameters copied from the captured request.
# Stored as a tuple of (name, value) pairs and passed as `params=` when posting.
params = (
    ('authuser', '0'),
    ('acx-v-bv', 'awn_video_auto_20210125-0941_RC000'),
    ('acx-v-clt', '1611690522179'),
    ('rpcTrackingId', 'PlacementSuggestionService.Fetch:3'),
    ('f.sid', '-3151454939494509000'),
)

# Form fields copied from the captured request; passed as `data=` when posting.
# The '__ar' value holds the search arguments as a JSON string. The value here
# (a search for "dogs") is only a placeholder: `query_placements_api` replaces
# '__ar' with a freshly built string for every keyword.
data = {
    'hl': 'en_US',
    '__lu': '354893925',
    '__u': '1802067325',
    '__c': '4335588572',
    'f.sid': '-3151454939494509000',
    'ps': 'aw',
    '__ar': '{"1":"dogs","2":{"1":0,"2":20},"3":[1,4,6,5,2,3],"4":true,"5":false,"8":"355895228","11":["US"],"14":{"1":20}}',
    'activityContext': 'VideoCampaignConstruction.PlacementPickerPanel.ExpansionPanel.PlacementPickerComponent.Search',
    'requestPriority': 'HIGH_LATENCY_SENSITIVE',
    'activityType': 'INTERACTIVE',
    'activityId': '1678814362147311',
    'uniqueFingerprint': '-3151454939494509000_1678814362147311_1',
    'previousPlace': '/aw/campaigns/new/video',
    'activityName': 'VideoCampaignConstruction.PlacementPickerPanel.ExpansionPanel.PlacementPickerComponent.Search',
    'destinationPlace': '/aw/campaigns/new/video'
}

## Query the API

In [10]:
# The keyword lists are stored here: `category2terms` (imported from `terms`)
# maps each category name to its list of search terms. Show the categories.
category2terms.keys()

dict_keys(['social_justice', 'hate', 'policy', 'noise', 'adhoc'])

In [5]:
def query_placements_api(query, fn_out, headers, params, data, timeout=60, delay=3):
    """Fetch placement suggestions for one search term and save them as JSON.

    Parameters
    ----------
    query : str
        Search term sent to the PlacementSuggestionService API.
    fn_out : str
        Path of the file the parsed JSON response is written to.
    headers : dict
        Request headers captured from ads.google.com.
    params : sequence or dict
        URL query-string parameters captured from ads.google.com.
    data : dict
        Form fields captured from ads.google.com. A copy is sent with its
        '__ar' field replaced; the caller's dict is left untouched.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.
    delay : float, optional
        Seconds to sleep after a successful request, to pace the queries.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the response body is not valid JSON.
    """
    # Build the "__ar" argument with json.dumps so that quotes or backslashes
    # inside the term are escaped instead of corrupting the JSON string.
    # Compact separators and insertion order reproduce the captured format.
    # NOTE(review): field "8" is a fixed id copied from a captured request --
    # confirm it matches the account used for your own capture.
    request_arg = {
        "1": query,
        "2": {"1": 0, "2": 20},
        "3": [1, 4, 6, 5, 2, 3],
        "4": True,
        "5": False,
        "8": "527682421",
        "11": ["US"],
        "13": [1],
        "14": {"1": 20},
    }
    # Work on a copy so repeated calls never mutate the shared `data` dict.
    form = dict(data)
    form['__ar'] = json.dumps(request_arg, separators=(',', ':'), ensure_ascii=False)

    # make the request
    response = s.post('https://ads.google.com/aw_video/_/rpc/PlacementSuggestionService/Fetch',
                      headers=headers, params=params, data=form, timeout=timeout)
    response.raise_for_status()

    # Parse before opening the output file: if parsing fails, no empty file is
    # left behind for the callers' "already downloaded" check to skip later.
    payload = response.json()

    # save the JSON response
    with open(fn_out, 'w') as f:
        json.dump(payload, f)
    time.sleep(delay)

In [7]:
# Fetch one JSON response per keyword and cache it in a folder per category.
for cat, terms in category2terms.items():
    data_dir_ = f'{DATA_OUT}/{cat}'
    os.makedirs(data_dir_, exist_ok=True)
    for term in tqdm(terms):
        fn_out = f'{data_dir_}/{term.lower()}.json'
        # Only hit the API for keywords that have no saved response yet.
        if not os.path.exists(fn_out):
            query_placements_api(term, fn_out, headers, params, data)

## Re-run "blocked" responses without spaces
A quick heuristic for "blocked" responses is checking the file size. Blocked responses are just two characters, `{}`, so the size is 2 bytes.

In [25]:
# Collect every saved response file that is exactly 2 bytes long (just `{}`).
blocked = [
    fn
    for fn in glob.glob(DATA_OUT + '/*/*')
    if os.stat(fn).st_size == 2
]

In [26]:
# Uncomment and run the line below if you're repeating queries, so that files already saved under blocked/ and blocked_basewords/ are excluded...
# blocked = [w for w in blocked if 'blocked_basewords' not in w and '/blocked/' not in w]

In [27]:
# Inspect one example path from the blocked list.
blocked[1]

'data_1m/raw/policy/bullfighting swords in bull.json'

In [30]:
# Folder for responses to blocked terms that are re-queried with spaces removed.
data_out_2 = f'{DATA_OUT}/blocked'
os.makedirs(data_out_2, exist_ok=True)

In [31]:
# Query each blocked term again with its spaces removed.
for fn in tqdm(blocked):
    # Recover the term from the file name itself. A fixed index into
    # fn.split('/') depends on how many path components DATA_OUT has, and for
    # the current DATA_OUT it returns a directory name instead of the term.
    term = os.path.splitext(os.path.basename(fn))[0]
    new_term = term.replace(' ', '')
    if term == new_term:
        # The term has no spaces, so there is nothing new to try.
        continue
    # The result is stored under the original (spaced) term.
    fn_out = f'{data_out_2}/{term}.json'
    if os.path.exists(fn_out):
        continue
    query_placements_api(new_term, fn_out, headers, params, data)

100%|██████████| 204/204 [08:23<00:00,  2.47s/it]


## Blocked basewords
Split each blocked search term into its individual base words, then query each base word on its own.

In [34]:
# Gather the individual words of every blocked term.
base_words = []
for fn in blocked:
    # Take the term from the file name; a fixed index into fn.split('/')
    # breaks whenever the number of path components in DATA_OUT changes.
    term = os.path.splitext(os.path.basename(fn))[0]
    # split() with no argument drops the empty strings that repeated spaces
    # would otherwise produce (an empty word would become an empty query).
    base_words.extend(term.split())

100%|██████████| 204/204 [00:00<00:00, 600447.73it/s]


In [35]:
# De-duplicate the base words. Sorting makes the order (and therefore the
# order of the API queries) the same on every kernel run, whereas iterating
# a set of strings directly can differ between runs.
base_words = sorted(set(base_words))
len(base_words)

263

In [37]:
# Folder for the responses to the individual base words of blocked terms.
data_out_3 = f'{DATA_OUT}/blocked_basewords'
os.makedirs(data_out_3, exist_ok=True)

In [38]:
# Query every base word that does not have a saved response yet.
for term in tqdm(base_words):
    fn_out = f'{data_out_3}/{term}.json'
    if not os.path.exists(fn_out):
        query_placements_api(term, fn_out, headers, params, data)

100%|██████████| 263/263 [20:08<00:00,  4.60s/it]
