# Web scraping: headers, the networks tab and parsing an API URL
## Helpful links and resources
- [urllib](https://docs.python.org/3/library/urllib.parse.html#) is a Python library that will pick apart URLs
- [Sessions object - request library](https://docs.python-requests.org/en/master/user/advanced/#session-objects)

In [1]:
#import libraries
import pandas as pd
import requests
import json
from urllib.parse import urlparse, parse_qs



## The networks tab
### Static data files
[Covid cases in the US - New York Times](https://www.nytimes.com/interactive/2021/us/covid-cases.html)

In [2]:
# get static data file
covid_cases_r = requests.get('https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/pages/usa/data.json')

In [18]:
covid_cases = covid_cases_r.json()
covid_cases

{'updated': 'July 7',
 'updated_datetime': '2021-07-07T09:07:04.772Z',
 'location': {'metadata': {'geoid': 'USA',
   'hierarchy': ['NYT-World'],
   'population': 331811257,
   'slug': 'us/covid-cases',
   'href': 'https://www.nytimes.com/interactive/2021/us/covid-cases.html',
   'display_name': 'United States',
   'long_name': 'United States',
   'nyt_abbr': '',
   'country': 'United States',
   'region': '',
   'subregion': '',
   'region_type': 'country'},
  'anomalies': {'date_based': [{'id': 38,
     'date': '2020-06-25',
     'end_date': '',
     'geoid': 'USA',
     'name': 'United States',
     'type': 'deaths',
     'category': 'added_probables',
     'omit_from_rolling_average': True,
     'omit_from_rolling_average_on_subgeographies': False,
     'short_note': '',
     'long_note': 'New Jersey began reporting probable deaths, including those from earlier in the pandemic, causing a jump in the number of total deaths.'},
    {'id': 46,
     'date': '2020-06-30',
     'end_date'

updated
updated_datetime
location
counties
states
clusters
page_notes
headline_override


### "Secret" APIs
Shopping websites are good candidates for secret APIs, such as [Target](www.target.com)

#### Target's Search API

In [19]:
# search for an item with the networks tab open to ID which APIs you can use
parsed_url = urlparse('https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&keyword=paper+cups&offset=0&page=%2Fs%2Fpaper+cups&platform=desktop&pricing_store_id=1254&store_ids=1254%2C2186%2C2202%2C2099%2C1221&useragent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.101+Safari%2F537.36&visitor_id=017A7A5800430201BC2B4C168889A04C')

In [20]:
# parse the URL so it's easier to read

ParseResult(scheme='https', netloc='redsky.target.com', path='/redsky_aggregations/v1/web/plp_search_v1', params='', query='key=ff457966e64d5e877fdbad070f276d18ecec4a01&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&keyword=paper+cups&offset=0&page=%2Fs%2Fpaper+cups&platform=desktop&pricing_store_id=1254&store_ids=1254%2C2186%2C2202%2C2099%2C1221&useragent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.101+Safari%2F537.36&visitor_id=017A7A5800430201BC2B4C168889A04C', fragment='')

In [29]:
# check the parsed URL
parsed_url[4].split('&')

['key=ff457966e64d5e877fdbad070f276d18ecec4a01',
 'channel=WEB',
 'count=24',
 'default_purchasability_filter=true',
 'include_sponsored=true',
 'keyword=paper+cups',
 'offset=0',
 'page=%2Fs%2Fpaper+cups',
 'platform=desktop',
 'pricing_store_id=1254',
 'store_ids=1254%2C2186%2C2202%2C2099%2C1221',
 'useragent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.101+Safari%2F537.36',
 'visitor_id=017A7A5800430201BC2B4C168889A04C']

In [56]:
# format the endpoint and parameters
endpoint = parsed_url[0] + '://' + parsed_url[1] + parsed_url[2]
endpoint


parameters = {}
for parameter in parsed_url[4].split('&'):
    key_value = parameter.split('=')
    parameters[key_value[0]]= key_value[1]
    
print(endpoint),print(parameters)   

https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1
{'key': 'ff457966e64d5e877fdbad070f276d18ecec4a01', 'channel': 'WEB', 'count': '24', 'default_purchasability_filter': 'true', 'include_sponsored': 'true', 'keyword': 'paper+cups', 'offset': '0', 'page': '%2Fs%2Fpaper+cups', 'platform': 'desktop', 'pricing_store_id': '1254', 'store_ids': '1254%2C2186%2C2202%2C2099%2C1221', 'useragent': 'Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.101+Safari%2F537.36', 'visitor_id': '017A7A5800430201BC2B4C168889A04C'}


(None, None)

In [59]:
# change something in the parameters (like keyword)
parameters['keyword'] = 'paper+plates'
parameters

{'key': 'ff457966e64d5e877fdbad070f276d18ecec4a01',
 'channel': 'WEB',
 'count': '24',
 'default_purchasability_filter': 'true',
 'include_sponsored': 'true',
 'keyword': 'paper+plates',
 'offset': '0',
 'page': '%2Fs%2Fpaper+cups',
 'platform': 'desktop',
 'pricing_store_id': '1254',
 'store_ids': '1254%2C2186%2C2202%2C2099%2C1221',
 'useragent': 'Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.101+Safari%2F537.36',
 'visitor_id': '017A7A5800430201BC2B4C168889A04C'}

In [61]:
# get request with endpoint and params
r = requests.get(endpoint, parameters)

In [88]:
# drill down the json file
r.json()['data']['search']['products']

[{'__typename': 'ProductSummary',
  'tcin': '75666853',
  'original_tcin': '75666853',
  'item': {'relationship_type': 'Stand Alone',
   'relationship_type_code': 'SA',
   'merchandise_classification': {'class_id': 5, 'department_id': 253},
   'eligibility_rules': {'add_on': {'is_active': True},
    'scheduled_delivery': {'is_active': True}},
   'enrichment': {'buy_url': 'https://www.target.com/p/line-plaid-paper-plate-8-5-34-90ct-up-38-up-8482/-/A-75666853',
    'images': {'primary_image_url': 'https://target.scene7.com/is/image/Target/GUEST_39b91919-bb96-44a4-a419-2257cfd40fc5',
     'alternate_image_urls': ['https://target.scene7.com/is/image/Target/GUEST_8ffaba71-1687-4107-9e3d-c49036c358ed']}},
   'dpci': '253-05-0356',
   'cart_add_on_threshold': 35.0,
   'product_description': {'title': 'Line Plaid Paper Plate 8.5&#34; - 90ct - up &#38; up&#8482;',
    'bullet_descriptions': ['<B>Features:</B> Round (shape)',
     '<B>Dimensions (Overall):</B> 8.55 Inches (L), 8.55 Inches (W)',


In [98]:
# drill down some more
products = r.json()['data']['search']['products']

for product in products:
    print(product['item']['product_description']['title'])

Line Plaid Paper Plate 8.5&#34; - 90ct - up &#38; up&#8482;
Textured Dot Paper Plate 10&#34; - 54ct - up &#38; up&#8482;
Dixie Everyday Dinner Paper Plates 8.5&#34; - 55ct
Multi Vine Paper Plate 10&#34; - 54ct - up &#38; up&#8482;
Kids Printed Paper Plate 8.5&#34; - 40ct - up &#38; up&#8482;
Dixie Everyday Dinner Paper Plates 8.5&#34; - 90ct
Plate 10&#34; - 150ct - up &#38; up&#8482;
Coated Disposable Paper Plates - 9&#34;- 120ct - Smartly&#8482;
Dixie Everyday 10 1/16&#34; Paper Plates - 54ct
Dixie Everyday Paper Plates 8.5&#34; - 154ct
10&#34; Plate - 86ct - up &#38; up&#8482;
Chinet Classic White Dinner Plate - 100ct
Plate 7&#34; - White - 58ct - up &#38; up&#8482;
Chinet Classic White Dinner Plate - 32ct
20ct 6.75&#34; Snack Plate Stars Red White Blue - Sun Squad&#8482;
Chinet Lunch Plates Classic White - 120ct
Sprouts Paper Plate 7&#34; - 58ct - up &#38; up&#8482;
Disposable Plates 10&#34; - 20ct - Everspring&#8482;
20ct 8.5&#34; Dinner Plate Confetti Stars on Navy - Sun Squad&#84

#### Target's aggregation API

In [195]:
# parse the URL so it's easier to read
target_list = urlparse('https://redsky.target.com/redsky_aggregations/v1/web/plp_fulfillment_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&tcins=81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688&store_id=2850&zip=11201&state=NY&latitude=40.690&longitude=-74.000&scheduled_delivery_store_id=2850')

In [196]:
# check the parsed URL
target_list

ParseResult(scheme='https', netloc='redsky.target.com', path='/redsky_aggregations/v1/web/plp_fulfillment_v1', params='', query='key=ff457966e64d5e877fdbad070f276d18ecec4a01&tcins=81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688&store_id=2850&zip=11201&state=NY&latitude=40.690&longitude=-74.000&scheduled_delivery_store_id=2850', fragment='')

In [197]:
# format the endpoint and parameters
target_list_endpoint = target_list[0] + '://' + target_list[1] + target_list[2]
target_list_params = {}
for parameter in target_list[4].split('&'):
    key_value = parameter.split('=')
    target_list_params[key_value[0]] = key_value[1]

In [198]:
# change something in the parameters (like tcins)
target_list_params['tcins'] = '81107269'

In [199]:
# get request with endpoint and params
target_list_r = requests.get(target_list_endpoint, params=target_list_params)

In [201]:
# drill down the json file
target_list_r.json()['data']['product_summaries']

[{'__typename': 'ProductSummary',
  'tcin': '81107269',
  'fulfillment': {'product_id': '81107269',
   'is_out_of_stock_in_all_store_locations': False,
   'shipping_options': {'availability_status': 'IN_STOCK',
    'loyalty_availability_status': 'IN_STOCK',
    'available_to_promise_quantity': 399.0,
    'minimum_order_quantity': 1.0,
    'services': [{'shipping_method_id': 'STANDARD',
      'min_delivery_date': '2021-07-08',
      'max_delivery_date': '2021-07-08',
      'is_two_day_shipping': True,
      'is_base_shipping_method': True,
      'service_level_description': '2-day shipping',
      'shipping_method_short_description': 'Standard',
      'cutoff': '2021-07-05T16:00:00Z'}]},
   'store_options': [{'location_name': 'Brooklyn Fulton St',
     'location_address': '445 Albee Square West,BROOKLYN,NY,11201-3016',
     'location_id': '2850',
     'search_response_store_type': 'PRIMARY',
     'order_pickup': {'availability_status': 'UNAVAILABLE',
      'reason_code': 'IN_ELIGIBLE'},

In [203]:
# drill down some more
target_list_r.json()['data']['product_summaries'][0]

{'__typename': 'ProductSummary',
 'tcin': '81107269',
 'fulfillment': {'product_id': '81107269',
  'is_out_of_stock_in_all_store_locations': False,
  'shipping_options': {'availability_status': 'IN_STOCK',
   'loyalty_availability_status': 'IN_STOCK',
   'available_to_promise_quantity': 399.0,
   'minimum_order_quantity': 1.0,
   'services': [{'shipping_method_id': 'STANDARD',
     'min_delivery_date': '2021-07-08',
     'max_delivery_date': '2021-07-08',
     'is_two_day_shipping': True,
     'is_base_shipping_method': True,
     'service_level_description': '2-day shipping',
     'shipping_method_short_description': 'Standard',
     'cutoff': '2021-07-05T16:00:00Z'}]},
  'store_options': [{'location_name': 'Brooklyn Fulton St',
    'location_address': '445 Albee Square West,BROOKLYN,NY,11201-3016',
    'location_id': '2850',
    'search_response_store_type': 'PRIMARY',
    'order_pickup': {'availability_status': 'UNAVAILABLE',
     'reason_code': 'IN_ELIGIBLE'},
    'in_store_only': 

## Using sessions to login
### Accessing password-protected pages
[Sessions object - request library](https://docs.python-requests.org/en/master/user/advanced/#session-objects)

In [85]:
# open up a session so that your login credentials are saved
s = requests.Session()

In [99]:
# load in config file with passwords
with open('../config/config.json') as json_file:
    config = json.load(json_file)

FileNotFoundError: [Errno 2] No such file or directory: '../config/config.json'

In [172]:
# check the website for the login parameters


In [173]:
# post the payload to the site to login with the correct log in endpoint
s = session.post(endpoint, data=payload)

In [174]:
# check credentials to see if successful

In [175]:
# look at an example page to get you started with a query

In [177]:
# create a new post object from the example

In [None]:
# post request for the data

In [179]:
# check to see what is returned

'/bundles/web/images/user-image.007dad08.svg'