In [12]:
# Imports
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import random
import time

## Managing visited pages
We are going to have to track which of the paginated index pages we have been to that all start with the overall 'gin' index page defined below.  

To find all available links that are found in the pagination section of the HTML, we can use the following code. 

In [2]:
url = "https://www.masterofmalt.com/gin/"
cookies = dict(MaOMa='VisitorID=556630649&IsVATableCountry=1&CountryID=464&CurrencyID=-1&CountryCodeShort=GB&DeliveryCountrySavedToDB=1')
html = requests.get(url, headers = {"Accept-Language": "en-GB"}, cookies = cookies).text 
soup = BeautifulSoup(html, features="html.parser")
pagination = soup.find_all(class_='list-paging')
pagination_links = pagination[0].find_all('a')
[link.get('href') for link in pagination_links]

['https://www.masterofmalt.com/gin/',
 'https://www.masterofmalt.com/gin/2',
 'https://www.masterofmalt.com/gin/3',
 'https://www.masterofmalt.com/gin/4',
 'https://www.masterofmalt.com/gin/5',
 'https://www.masterofmalt.com/gin/6']

This is going to be something that we are going to do a lot of times; makes sense to create a function that does what we need it to and stops at an appropriate time. 

In [3]:
def get_pagination_links(url, html_class = 'list-paging'):
    """
    For a given url find the href attributes of anchor tags that match a certain HTML class.
    Returns a set of URLs.
    """
    cookies = dict(MaOMa='VisitorID=556630649&IsVATableCountry=1&CountryID=464&CurrencyID=-1&CountryCodeShort=GB&DeliveryCountrySavedToDB=1')
    html = requests.get(url, headers = {"Accept-Language": "en-GB"}, cookies = cookies).text 
    soup = BeautifulSoup(html, features="html.parser")
    pagination = soup.find_all(class_= html_class)
    pagination_links = pagination[0].find_all('a')
    
    # Return a set of distinct URLs 
    return set([link.get('href') for link in pagination_links])

In [4]:
get_pagination_links("https://www.masterofmalt.com/gin/")

{'https://www.masterofmalt.com/gin/',
 'https://www.masterofmalt.com/gin/2',
 'https://www.masterofmalt.com/gin/3',
 'https://www.masterofmalt.com/gin/4',
 'https://www.masterofmalt.com/gin/5',
 'https://www.masterofmalt.com/gin/6'}

----

In [5]:
starting_url = "https://www.masterofmalt.com/gin/"

# Create master array of pages that we want to visit
all_gin_pages = np.array([starting_url])
visited_pages = np.array([])

# Check for pages that haven't been visited
while len(np.setdiff1d(all_gin_pages, visited_pages)):
    
    # Grab a random choice of unvisited pages
    unvisited_url = random.choice(np.setdiff1d(all_gin_pages, visited_pages))
    
    # Call our function to get pagination links
    pages = get_pagination_links(unvisited_url)
    
    # Add the visited url to the visited pages
    visited_pages.append(unvisited_url)
    
    # Update all the gin pages
    all_gin_pages = np.array(set(all_gin_pages).union(pages))
    
    # Add wait flag to avoid smashing the server
    time.sleep(3)
    
    break

In [8]:
all_gin_pages

array({'https://www.masterofmalt.com/gin/5', 'https://www.masterofmalt.com/gin/3', 'https://www.masterofmalt.com/gin/2', 'https://www.masterofmalt.com/gin/4', 'https://www.masterofmalt.com/gin/', 'https://www.masterofmalt.com/gin/6'},
      dtype=object)

#### Overview of using sets
I thought that sets were the ideal data type for this particular instance. 

In [10]:
len(np.setdiff1d([1,2,3,4,5], [1,2]))

3

In [50]:
set(['url']).union(set(['url', 'url2']))

{'url', 'url2'}