In [7]:
#import required libraries

# full imports go first
import os # os is a built-in Python library for interacting with the operating system
import requests # requests is a library for managing requests

# partial imports go below
from bs4 import BeautifulSoup # add to requirements.txt: beautifulsoup4
from PIL import Image # add to requirements.txt: pillow

In [8]:
### GETTING ALL IMAGES FROM A PAGE

In [9]:
URL = "https://en.wikipedia.org/wiki/Narwhal"
response = requests.get(URL)
bs_html = BeautifulSoup(response.text, "html.parser")

In [10]:
# preview the html contents
print(bs_html.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Narwhal - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled

In [11]:
#finds all the
image_tags = bs_html.find_all('img')

In [12]:
accepted_image_types = ['jpg', 'jpeg', 'png', 'bmp', 'webp', 'svg']

In [13]:
file_types = []

# get the source 'src' of the image and split it to only leave the file type
for img_tag in image_tags:
  file_types.append(str(img_tag.get('src').split('.')[-1]))


present_file_types = set(file_types)
# sets do not allow duplicates meaning we will be left with one of each file type present

In [14]:
# what image types can we find on this page? Please note there may be some obsolete data in here
present_file_types

{'jpg',
 'org/wiki/Special:CentralAutoLogin/start?useformat=desktop&type=1x1&usesul3=0',
 'png',
 'svg'}

In [15]:
image_tags

[<img alt="" aria-hidden="true" class="mw-logo-icon" height="50" src="/static/images/icons/wikipedia.png" width="50"/>,
 <img alt="Wikipedia" class="mw-logo-wordmark" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"/>,
 <img alt="The Free Encyclopedia" class="mw-logo-tagline" height="13" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" style="width: 7.3125em; height: 0.8125em;" width="117"/>,
 <img alt="This is a good article. Click here for more information." class="mw-file-element" data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/29px-Symbol_support_vote.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/39px-Symbol_support_vote.svg.png 2x" width="19"/>,
 <img alt="Page sem

In [16]:
# how many tags?
len(image_tags)

26

In [17]:
# extract the image URLs from the tags
# create an empty list
img_urls = []

for img_tag in image_tags:

    # if the tag has the 'src' property
    if img_tag.get('src'):

        # extract it
        img_src = img_tag.get('src')

        # get the image type (it's the last bit of text after the '.')
        img_type = img_src.split('.')[-1]

        # skip further steps in the loop for the current image if not in the accepted types
        if img_type.lower() in accepted_image_types and img_src.startswith('//upload'):
            img_urls.append(f'https:{img_src}')

In [18]:
image_tags[5].get('src')

'//upload.wikimedia.org/wikipedia/commons/thumb/b/bc/%D0%9D%D0%B0%D1%80%D0%B2%D0%B0%D0%BB_%D0%B2_%D1%80%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE%D0%B9_%D0%90%D1%80%D0%BA%D1%82%D0%B8%D0%BA%D0%B5.jpg/220px-%D0%9D%D0%B0%D1%80%D0%B2%D0%B0%D0%BB_%D0%B2_%D1%80%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE%D0%B9_%D0%90%D1%80%D0%BA%D1%82%D0%B8%D0%BA%D0%B5.jpg'

In [19]:
# how many URLs did we get?
len(img_urls)

20

In [20]:
# if you preview, quite a lot of them are duplicated
img_urls

['https://upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png',
 'https://upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/bc/%D0%9D%D0%B0%D1%80%D0%B2%D0%B0%D0%BB_%D0%B2_%D1%80%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE%D0%B9_%D0%90%D1%80%D0%BA%D1%82%D0%B8%D0%BA%D0%B5.jpg/220px-%D0%9D%D0%B0%D1%80%D0%B2%D0%B0%D0%BB_%D0%B2_%D1%80%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE%D0%B9_%D0%90%D1%80%D0%BA%D1%82%D0%B8%D0%BA%D0%B5.jpg',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Narwhal_size.svg/220px-Narwhal_size.svg.png',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/Status_iucn3.1_LC.svg/220px-Status_iucn3.1_LC.svg.png',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/8a/OOjs_UI_icon_edit-ltr.svg/15px-OOjs_UI_icon_edit-ltr.svg.png',
 'https://upload.wikimedia.org/wikipedia/common

In [21]:
# how many unique?
unique_urls = set(img_urls)
len(unique_urls)

20

In [22]:
# name of the folder where we want to sve the images. CAPITALS suggest it's a constant
IMAGES_DIRECTORY = "scraped_images"

current_dirs = os.listdir() # this function lists all the contents of the current folder (where the notebook is)
current_dirs

['.git',
 'challengers_accolades.csv',
 'cleaned_exercise_data.csv',
 'data.csv',
 'DAZ_code_example_1_wikipedia.ipynb',
 'DAZ_code_example_2_webscraper_images.ipynb',
 'new_user_info.txt',
 'README.md',
 'Session_2_labs.ipynb',
 'team_info.csv',
 'user_info.txt',
 'venv']

In [23]:
# if the folder where we want to save the images is not already there, create it
if IMAGES_DIRECTORY not in current_dirs:
    os.mkdir(IMAGES_DIRECTORY)  # this directory will be created in the same location where your notebook is

In [24]:
errors = []

requests.adapters.DEFAULT_RETRIES = 10

# the "enumerate" function allows for iteration while also supplying an index for each item
for img_index, img_url in enumerate(unique_urls):

    # get the data from the image url
    resp = requests.get(img_url, stream=True)

    # if the request is not completed
    if resp.status_code != 200:
        # add the image url to the errors list
        errors.append(img_url)

    # otherwise, proceed
    else:
        # create a PIL.Image object
        obj_img = Image.open(resp.raw)
        # get the file extension from the url
        img_type = img_url.split('.')[-1]
        # save the image in its origial extension
        obj_img.save(f'./{IMAGES_DIRECTORY}/img_{img_index}.{img_type}')

In [25]:
# how many errors?
len(errors)

7

In [26]:
# let's see what's happened here!
errors[0]

'https://upload.wikimedia.org/wikipedia/commons/thumb/b/be/Narwhal_tail_above_surface.jpg/220px-Narwhal_tail_above_surface.jpg'

In [27]:
resp = requests.get(errors[0], stream=True)

In [28]:
resp.status_code # 403 is the status code for "Permission denied"

403