-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdownloadXkcd.py
59 lines (55 loc) · 2.2 KB
/
downloadXkcd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.
# Importing all the necessary libraries
import requests
import os
import bs4
url = 'http://xkcd.com' # starting url
directory = 'xkcd_comics'
parent_dir = ".\Web-Scraping-Projects"
path = os.path.join(parent_dir, directory)
try:
os.makedirs(path, exist_ok=True) # store comics in ./xkcd_comics
print(f"Directory {directory} created successfully.")
except OSError as error:
print(f"Directory {error} can not be created.")
while not url.endswith('#'):
# TODO: Download the page.
# save the html data for the given webpage
res = requests.get(url)
# Handling all the exceptions (If encountered):
try:
res.raise_for_status()
# good practice to ensure that the program stops if a bad download occurs.
except Exception as err:
print(f'There was a problem: {err}')
# creating the beautifulsoup4 object from the xkcd webpage stored in the requests response object
xkcdsoup = bs4.BeautifulSoup(res.text)
# TODO: Find the URL of the comic image.
# extracts: All elements named <img> within an `id` attribute of `comic`
# Can be found by using the inspect element.
comicElem = xkcdsoup.select('#comic img')
if comicElem == []:
print('Could not find the comic image.')
else:
try:
comicUrl = 'http:' + comicElem[0].get('src') # get the image url.
# Download the image.
print(f"Downloading the image {comicUrl}")
res = requests.get(comicUrl)
res.raise_for_status()
except requests.exceptions.MissingSchema:
# Skip this comic
prevlink = xkcdsoup.select('a[rel="prev"]')[0]
url = 'http://xkcd.com' + prevlink.get('href')
continue
# TODO: Download the image.
imageFile = open(os.path.join(path, os.path.basename(comicUrl)), 'wb')
for chunk in res.iter_content(100000):
imageFile.write(chunk)
imageFile.close()
# TODO: Save the image to ./xkcd_comics
# TODO: Get the Prev button's url.
prevLink = xkcdsoup.select('a[rel="prev"]')[0]
url = 'http://xkcd.com' + prevLink.get('href')
print('Done.')