In [1]:
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Web-scraping for data collection on Visit York information

The Visit York website has multiple categories, e.g. Things to do, What's on, Eat & Drink, and so on (see site header).

Goal: Scrape selected categories for York Chatbot RAG.

Idea 1:
* Each category has multiple sub-categories. Go to each subcategory and scrape the hyperlinks inside, which are then used to scrape content for RAG.
* Issue: these hyperlinks are paginated using Google Tag Manager, which makes it impossible to scrape individual pages because the URL stays the same on every page. This brings me to the next idea:

Idea 2:
* Save the page source of each paginated page as an HTML file on the local computer, then try to scrape the hyperlinks from these HTML files.
* This didn't work because the `div` classes in the saved HTML differed from those in the live web source. Like "??!!"

Idea 3:
* For each sub-category, manually copy-paste each hyperlink into a file.
* Use the file to scrape content.

In [2]:
# url_list = ["https://visityork.org/business-directory/category/things-to-do", 
#             "https://visityork.org/whats-on",
#            "https://visityork.org/business-directory/category/eat-drink",
#            "https://visityork.org/business-directory/category/shopping",
#            "https://visityork.org/beyond-york",
#            "https://visityork.org/explore",
#            "https://visityork.org/blog",
#            "https://visityork.org/visitor-information"]

## POC: Scrape one category

In [3]:
# url = "https://visityork.org/business-directory/category/things-to-do"
# folder = "./html/"
# files = os.listdir(path=folder)

In [4]:
# with open(folder + files[0], "rb") as f:
#     html = f.read()
# bs = BeautifulSoup(html, 'html.parser')
# print(bs.prettify())

In [5]:
# res = bs.find_all('a')
# res = bs.find_all('a', {"class":"btn btn-sm btn--outline w-100 "})

In [6]:
# len(res)

In [7]:
# res

In [8]:
# for a in res:
#     # print("\n------------------------------------\n")
#     print(a.get('href'))

In [9]:
# Fetch a single business-directory page to prototype the content scraper.
url = "https://visityork.org/business-directory/york-army-museum"
# url = "https://visityork.org/business-directory/company-of-merchant-taylors-york"
# Without a timeout, requests waits indefinitely if the server stops responding.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx so an error page is never parsed as content further down.
response.raise_for_status()
response

<Response [200]>

In [10]:
# type(response.content)

In [11]:
# Parse the raw response bytes using Python's built-in "html.parser" backend.
bs = BeautifulSoup(markup=response.content, features="html.parser")

In [12]:
# print(bs.prettify())

In [13]:
# print(bs.getText())
# Collect every <div> that carries the "business-entry" CSS class, then count them.
res = bs.find_all("div", class_="business-entry")
len(res)

1

In [14]:
# Gather all <p> tags nested inside the first matched entry, then count them.
paragraph = res[0].find_all(name="p")
len(paragraph)

11

In [15]:
# Show the raw markup of each paragraph tag, one tag per print call.
for tag in paragraph:
    print(tag)

<p class="mb-0 font-family-nunito font-weight-bold text-secondary">
Socials:</p>
<p>The York Army Museum offers visitors a unique experience. We curate the history of two current serving regiments of the British Army: The Royal Dragoon Guards and The Royal Yorkshire Regiment. One cavalry, one infantry; each of these regiments can trace its history back over three hundred years. Both have a historic connection to Yorkshire and its people. Following a recent £1 million refurbishment, York Army Museum is now open as an exciting and engaging visitor attraction, with something to offer for all ages. Take a virtual tour: <a href="http://www.yorkarmymuseum.co.uk/">click here</a></p>
<p>The museum has been a finalist in the Visit York Tourism Awards Small Attractions category every year since 2016, and won the award in 2018.</p>
<p>Monday: 10am to 5pm</p>
<p>Tuesday: 10am to 5pm</p>
<p>Wednesday: 10am to 5pm</p>
<p>Thursday: 10am to 5pm</p>
<p>Friday: 10am to 5pm</p>
<p>Saturday: 10am to 5pm</

In [16]:
# Show only the text of each paragraph, preceded by a newline for readability.
for tag in paragraph:
    print(f"\n{tag.get_text()}")



Socials:

The York Army Museum offers visitors a unique experience. We curate the history of two current serving regiments of the British Army: The Royal Dragoon Guards and The Royal Yorkshire Regiment. One cavalry, one infantry; each of these regiments can trace its history back over three hundred years. Both have a historic connection to Yorkshire and its people. Following a recent £1 million refurbishment, York Army Museum is now open as an exciting and engaging visitor attraction, with something to offer for all ages. Take a virtual tour: click here

The museum has been a finalist in the Visit York Tourism Awards Small Attractions category every year since 2016, and won the award in 2018.

Monday: 10am to 5pm

Tuesday: 10am to 5pm

Wednesday: 10am to 5pm

Thursday: 10am to 5pm

Friday: 10am to 5pm

Saturday: 10am to 5pm

Sunday: CLOSED

Last admission is 30 minutes before closing time.


In [17]:
# Join the whitespace-trimmed text of every paragraph into one newline-separated string.
info = "\n".join(tag.get_text().strip() for tag in paragraph)
info

'Socials:\nThe York Army Museum offers visitors a unique experience. We curate the history of two current serving regiments of the British Army: The Royal Dragoon Guards and The Royal Yorkshire Regiment. One cavalry, one infantry; each of these regiments can trace its history back over three hundred years. Both have a historic connection to Yorkshire and its people. Following a recent £1 million refurbishment, York Army Museum is now open as an exciting and engaging visitor attraction, with something to offer for all ages. Take a virtual tour: click here\nThe museum has been a finalist in the Visit York Tourism Awards Small Attractions category every year since 2016, and won the award in 2018.\nMonday: 10am to 5pm\nTuesday: 10am to 5pm\nWednesday: 10am to 5pm\nThursday: 10am to 5pm\nFriday: 10am to 5pm\nSaturday: 10am to 5pm\nSunday: CLOSED\nLast admission is 30 minutes before closing time.'

In [18]:
# Print `info` so the embedded "\n" separators render as actual line breaks.
print(info)

Socials:
The York Army Museum offers visitors a unique experience. We curate the history of two current serving regiments of the British Army: The Royal Dragoon Guards and The Royal Yorkshire Regiment. One cavalry, one infantry; each of these regiments can trace its history back over three hundred years. Both have a historic connection to Yorkshire and its people. Following a recent £1 million refurbishment, York Army Museum is now open as an exciting and engaging visitor attraction, with something to offer for all ages. Take a virtual tour: click here
The museum has been a finalist in the Visit York Tourism Awards Small Attractions category every year since 2016, and won the award in 2018.
Monday: 10am to 5pm
Tuesday: 10am to 5pm
Wednesday: 10am to 5pm
Thursday: 10am to 5pm
Friday: 10am to 5pm
Saturday: 10am to 5pm
Sunday: CLOSED
Last admission is 30 minutes before closing time.


In [23]:
# Persist the scraped text as JSON under ./json_files/ (relative to the working directory).
output_dir = os.path.join(os.getcwd(), "json_files")
# Create the folder on first run; open() raises FileNotFoundError if it is missing.
os.makedirs(output_dir, exist_ok=True)

# Serialize once and write that same string, rather than encoding the payload twice.
info_json = json.dumps({"info": info})
with open(os.path.join(output_dir, "test-save.json"), "w", encoding="utf-8") as f:
    f.write(info_json)

In [None]:
# Collect absolute links on the parsed page that point back into the Visit York site.
# The cells that follow read `subpages`, so this definition has to be live code for
# the notebook to survive Restart Kernel -> Run All.
subpage_filter = "https://visityork.org/"
subpages = [
    href
    for href in (a.get("href") for a in bs.find_all("a"))
    # Skip anchors without an href, and any link outside the site prefix.
    if href and href.startswith(subpage_filter)
]
subpages

In [None]:
# Number of distinct links in `subpages`.
len(set(subpages))

In [None]:
# De-duplicate while keeping first-seen order. A plain set() would reorder the URLs
# differently between kernel sessions because string hashing is randomized per process.
subpages = list(dict.fromkeys(subpages))
subpages

In [None]:
# Re-fetch `url`. The timeout prevents an indefinite hang on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx so an error page is never parsed as content.
response.raise_for_status()
response

In [None]:
# Re-parse the latest response and print the indented markup for inspection.
bs = BeautifulSoup(markup=response.content, features="html.parser")
print(bs.prettify())

In [None]:
# subsubpage_filter = "https://visityork.org/"
# subsubpages = [a.get('href') for a in bs.find_all('a') if a.get('href') and a.get('href').startswith(subsubpage_filter)]

In [None]:
# subsubpages

In [None]:
# Number of elements currently held in `res`.
len(res)

In [None]:
# Display the contents of `res` as the cell output.
res