# Web scraping: headers, the networks tab and parsing an API URL
## Helpful links and resources
- [urllib](https://docs.python.org/3/library/urllib.parse.html#) is a Python library that will pick apart URLs
- [Sessions object - request library](https://docs.python-requests.org/en/master/user/advanced/#session-objects)

## Table of contents
1. The networks tab and adnaced scraping
    1. Static data files
    1. "Secret" APIs
        1. Target's search API
        1. Target's aggregation API
1. Using sessions to login
    1. Accessing password-protected pages

In [1]:
#import libraries
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
import json

## The networks tab and advanced scraping
### Static data files
[Covid cases in the US - New York Times](https://www.nytimes.com/interactive/2021/us/covid-cases.html)

In [2]:
# get static data file
covid_cases_r = requests.get('https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/pages/usa/data.json')

In [3]:
covid_cases = covid_cases_r.json()

In [4]:
# covid_cases

### "Secret" APIs
Shopping websites are good candidates for secret APIs, such as [Target](www.target.com)

#### Target's Search API

In [5]:
# search for an item with the networks tab open to ID which APIs you can use
# parse the URL so it's easier to read
parsed_url = urlparse('https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&keyword=paper+plates&offset=0&page=%2Fs%2Fpaper+plates&platform=desktop&pricing_store_id=2850&scheduled_delivery_store_id=2850&store_ids=2850%2C1849%2C3284%2C3229%2C3249&useragent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36&visitor_id=017A71BED83F0201BCBD154FC5FC4C74')

In [6]:
# check the parsed URL
parsed_url

ParseResult(scheme='https', netloc='redsky.target.com', path='/redsky_aggregations/v1/web/plp_search_v1', params='', query='key=ff457966e64d5e877fdbad070f276d18ecec4a01&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&keyword=paper+plates&offset=0&page=%2Fs%2Fpaper+plates&platform=desktop&pricing_store_id=2850&scheduled_delivery_store_id=2850&store_ids=2850%2C1849%2C3284%2C3229%2C3249&useragent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36&visitor_id=017A71BED83F0201BCBD154FC5FC4C74', fragment='')

In [7]:
# format the endpoint and parameters
endpoint = parsed_url[0] + '://' + parsed_url[1] + parsed_url[2]
params = {}
for parameter in parsed_url[4].split('&'):
    key_value = parameter.split('=')
    params[key_value[0]] = key_value[1]
print(endpoint), print(params)

https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1
{'key': 'ff457966e64d5e877fdbad070f276d18ecec4a01', 'channel': 'WEB', 'count': '24', 'default_purchasability_filter': 'true', 'include_sponsored': 'true', 'keyword': 'paper+plates', 'offset': '0', 'page': '%2Fs%2Fpaper+plates', 'platform': 'desktop', 'pricing_store_id': '2850', 'scheduled_delivery_store_id': '2850', 'store_ids': '2850%2C1849%2C3284%2C3229%2C3249', 'useragent': 'Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36', 'visitor_id': '017A71BED83F0201BCBD154FC5FC4C74'}


(None, None)

In [8]:
# change something in the parameters (like keyword)
params['keyword'] = 'paper+cups'

In [9]:
# get request with endpoint and params
r = requests.get(endpoint, params=params)

In [10]:
# drill down the json file
len(r.json()['data']['search']['products'])

25

In [11]:
# drill down some more
r.json()['data']['search']['products'][1]['parent']

KeyError: 'parent'

#### Target's aggregation API

In [None]:
# parse the URL so it's easier to read
target_list = urlparse('https://redsky.target.com/redsky_aggregations/v1/web/plp_fulfillment_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&tcins=81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688&store_id=2850&zip=11201&state=NY&latitude=40.690&longitude=-74.000&scheduled_delivery_store_id=2850')

In [None]:
# check the parsed URL
target_list

In [None]:
# format the endpoint and parameters
target_list_endpoint = target_list[0] + '://' + target_list[1] + target_list[2]
target_list_params = {}
for parameter in target_list[4].split('&'):
    key_value = parameter.split('=')
    target_list_params[key_value[0]] = key_value[1]

In [None]:
# change something in the parameters (like tcins)
target_list_params['tcins'] = '81107269'

In [None]:
# get request with endpoint and params
target_list_r = requests.get(target_list_endpoint, params=target_list_params)

In [None]:
# drill down the json file
target_list_r.json()['data']['product_summaries']

In [None]:
# drill down some more
target_list_r.json()['data']['product_summaries'][0]

## Using sessions to login
### Accessing password-protected pages
[Sessions object - request library](https://docs.python-requests.org/en/master/user/advanced/#session-objects)

In [None]:
# open up a session so that your login credentials are saved
session = requests.Session()

In [None]:
with open('../config/config.json') as json_file:
    config = json.load(json_file)

In [None]:
payload = {
    'username':'katiemarriner',
    'password': config['atom_password'],
}

In [None]:
# post the payload to the site to log in
s = session.post("https://atom.finance/session/signin", data=payload)

In [None]:
s.text

In [None]:
payload = {
    "variables":{"symbol":"SPY"},
    "query": "query getETFProfile($symbol: String!) {\n  etfProfile(symbol: $symbol) {\n    id\n    issuer\n    description\n    }\n}\n"
}

In [None]:
# Navigate to the next page and scrape the data
s = session.post('https://atom.finance/graphql', json=payload)

In [None]:
s.text