In [2]:
import os
import csv
from pathlib import Path
from urllib.request import urlopen, urlretrieve
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

In [12]:
def get_absolute_url(base_url: str, source: str) -> "str | None":
    """Resolve ``source`` against ``base_url`` and normalise it for download.

    Args:
        base_url: Root URL of the site being scraped, e.g.
            ``"http://pythonscraping.com"``.
        source: Raw value of a ``src`` attribute. May be absolute, relative,
            protocol-relative (``//host/path``) or scheme-less ``www.host/path``.

    Returns:
        ``scheme://host/path`` with a leading ``www.`` removed from the host and
        any query string or fragment dropped, or ``None`` when ``source`` is
        empty, is not an http(s) resource, or lives on a different host than
        ``base_url``.
    """

    def _strip_www(netloc: str) -> str:
        # Only a leading "www." label is dropped; "www." elsewhere is real data.
        return netloc[4:] if netloc.lower().startswith("www.") else netloc

    source = source.strip()
    if not source:
        return None

    # A scheme-less "www.example.com/x" would otherwise be joined as a
    # relative path underneath base_url.
    if source.lower().startswith("www."):
        source = "http://" + source

    # Resolve first so relative and protocol-relative sources get a real host
    # before the same-site comparison is made.
    parsed = urlparse(urljoin(base_url, source))
    if parsed.scheme not in ("http", "https"):
        return None  # e.g. data:, javascript:, mailto:

    host = _strip_www(parsed.netloc)
    base_host = _strip_www(urlparse(base_url).netloc)
    # Compare hosts rather than substrings so scheme or "www." differences do
    # not make a same-site resource look external.
    if not host or host.lower() != base_host.lower():
        return None

    return f"{parsed.scheme}://{host}{parsed.path}"


def get_download_path(base_url: str, absolute_url: str, download_dir: str):
    """Map a URL to a local file path under ``download_dir``.

    Only the path component of ``absolute_url`` is used. If it starts with the
    path component of ``base_url``, that prefix is removed so the local layout
    mirrors the site layout relative to the base.

    Args:
        base_url: Root URL of the site being scraped.
        absolute_url: Absolute URL of the resource to save.
        download_dir: Local directory that receives the downloads.

    Returns:
        A ``pathlib.Path`` inside ``download_dir``. Its parent directory is
        created as a side effect. A URL with an empty path maps to
        ``index.html`` so the result is never the directory itself.
    """
    base_path = urlparse(base_url).path
    if not base_path.endswith("/"):
        base_path += "/"

    url_path = urlparse(absolute_url).path
    # Strip the base only as a prefix; it must not be removed mid-path.
    if url_path.startswith(base_path):
        url_path = url_path[len(base_path):]

    # Drop empty, "." and ".." segments so a crafted URL cannot climb out of
    # download_dir.
    segments = [part for part in url_path.split("/") if part not in ("", ".", "..")]
    if not segments:
        segments = ["index.html"]

    download_path = Path(download_dir).joinpath(*segments)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    download_path.parent.mkdir(parents=True, exist_ok=True)

    return download_path

In [None]:
# Page to scan, the site root used for same-site checks, and the local target.
url = "http://www.pythonscraping.com"
base_url = "http://pythonscraping.com"
download_dir = "downloaded"

html = urlopen(url)
bs = BeautifulSoup(html, "html.parser")
# Every tag that carries a src attribute, whatever the tag name.
download_list = bs.find_all(src=True)

for download in download_list:
    file_url = get_absolute_url(base_url, download["src"])
    if not file_url:
        # A falsy result means the helper rejected this source; skip it.
        continue
    print(file_url)
    download_path = get_download_path(base_url, file_url, download_dir)
    urlretrieve(file_url, download_path)

In [3]:
# newline="" hands line-ending control to the csv module; without it,
# platforms that translate "\n" emit "\r\r\n" and readers see blank rows.
with open("test.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(("number", "number plus 2", "number times 2"))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))

In [6]:
from urllib.request import urlopen

url = "http://en.wikipedia.org/wiki/Comparison_of_text_editors"

html = urlopen(url)
bs = BeautifulSoup(html, "html.parser")
# First table on the page that has the "wikitable" class.
table = bs.select("table.wikitable")[0]
rows = table.find_all("tr")

# The cell text is not guaranteed to be ASCII, so pin the output encoding
# instead of relying on the platform default. newline="" is required by csv.
with open("editors.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    for row in rows:
        # Header (th) and data (td) cells are written alike, in document order.
        cells = row.find_all(["td", "th"])
        writer.writerow([cell.get_text().strip() for cell in cells])

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re

# NOTE(review): credentials are hardcoded; acceptable only for a throwaway
# local server. Load them from the environment before sharing this notebook.
conn = pymysql.connect(
    host="127.0.0.1",
    unix_socket="/tmp/mysql.sock",
    user="root",
    password="root",  # "passwd" is a deprecated alias
    database="mysql",  # "db" is a deprecated alias
    charset="utf8",
)
cur = conn.cursor()
cur.execute("USE scraping")

# Seed from the OS entropy source / current time. Passing a datetime object
# raises TypeError on Python 3.11+, which only accepts None, int, float, str,
# bytes or bytearray.
random.seed()


def store(title, content):
    """Insert one row into the ``pages`` table and commit.

    Args:
        title: Value for the ``title`` column.
        content: Value for the ``content`` column.

    Uses the module-level cursor ``cur`` and commits on its connection.
    """
    # Placeholders must be bare: the driver quotes and escapes the values
    # itself, so wrapping %s in quotes stores literal quote characters.
    cur.execute(
        "INSERT INTO pages (title, content) VALUES (%s, %s)",
        (title, content),
    )
    cur.connection.commit()


def getLinks(articleUrl):
    """Fetch a Wikipedia article, store its title and first paragraph, and
    return its internal article links.

    Args:
        articleUrl: Site-relative path such as ``"/wiki/Kevin_Bacon"``; it is
            appended to ``http://en.wikipedia.org``.

    Returns:
        The ``<a>`` tags inside ``div#bodyContent`` whose ``href`` starts with
        ``/wiki/`` and contains no colon. An empty list is returned when the
        page has no ``div#bodyContent``.
    """
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bs = BeautifulSoup(html, "html.parser")

    # Any of these lookups can come back None on an unexpected page layout;
    # skip storing rather than crash the crawl with AttributeError.
    heading = bs.find("h1")
    content_div = bs.find("div", {"id": "mw-content-text"})
    first_paragraph = content_div.find("p") if content_div is not None else None
    if heading is not None and first_paragraph is not None:
        store(heading.get_text(), first_paragraph.get_text())

    body = bs.find("div", {"id": "bodyContent"})
    if body is None:
        return []
    # The negative lookahead rejects any href containing ":".
    return body.find_all("a", href=re.compile("^(/wiki/)((?!:).)*$"))


# The first fetch sits inside the try block so the cursor and connection are
# released even when the very first request or insert fails.
try:
    links = getLinks("/wiki/Kevin_Bacon")
    # Random walk: follow one randomly chosen link until a page offers none.
    while links:
        newArticle = random.choice(links).attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql
from random import shuffle

# NOTE(review): credentials are hardcoded; acceptable only for a throwaway
# local server. Load them from the environment before sharing this notebook.
conn = pymysql.connect(
    host="127.0.0.1",
    unix_socket="/tmp/mysql.sock",
    user="root",
    password="root",  # "passwd" is a deprecated alias
    database="mysql",  # "db" is a deprecated alias
    charset="utf8",
)
cur = conn.cursor()
cur.execute("USE wikipedia")


def insertPageIfNotExists(url):
    """Return an identifier for ``url`` in ``pages``, inserting a row if needed.

    Args:
        url: Value matched against (and, if absent, inserted into) the ``url``
            column.

    Returns:
        ``cur.lastrowid`` after inserting a new row, otherwise the first column
        of the existing row (presumably the primary key; verify against the
        table schema).
    """
    # Parameters are passed as a one-element tuple; "(url)" is just a
    # parenthesised string, not a sequence.
    cur.execute("SELECT * FROM pages WHERE url = %s", (url,))
    if cur.rowcount == 0:
        cur.execute("INSERT INTO pages (url) VALUES (%s)", (url,))
        conn.commit()
        return cur.lastrowid
    return cur.fetchone()[0]


def loadPages():
    """Return the second column of every row in ``pages`` as a list.

    The query selects all columns, so the result depends on the table's column
    order (presumably column 1 is the URL; verify against the schema).
    """
    cur.execute("SELECT * FROM pages")
    collected = []
    for record in cur.fetchall():
        collected.append(record[1])
    return collected


def insertLink(fromPageId, toPageId):
    """Record a link between two pages unless that exact pair already exists.

    Both ids are coerced with ``int()``. A commit is issued only when a new row
    is inserted into ``links``.
    """
    pair = (int(fromPageId), int(toPageId))
    cur.execute(
        "SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s",
        pair,
    )
    if cur.rowcount != 0:
        # The pair is already stored; nothing to add.
        return
    cur.execute(
        "INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)",
        pair,
    )
    conn.commit()


def pageHasLinks(pageId):
    """Report whether any row in ``links`` has ``pageId`` as its source page.

    Args:
        pageId: Page identifier; coerced with ``int()`` before querying.

    Returns:
        ``False`` when the query matched no rows, ``True`` otherwise.
    """
    # Parameters are passed as a one-element tuple; "(int(pageId))" is just a
    # parenthesised int, not a sequence.
    cur.execute("SELECT * FROM links WHERE fromPageId = %s", (int(pageId),))
    return cur.rowcount != 0


def getLinks(pageUrl, recursionLevel, pages):
    """Crawl ``pageUrl`` depth-first, recording pages and links in the database.

    Args:
        pageUrl: Site-relative article path appended to
            ``http://en.wikipedia.org``.
        recursionLevel: Current depth; nothing is done once it exceeds 4.
        pages: List that each newly followed link is appended to.

    Returns:
        None. Results are written through the database helper functions.
    """
    if recursionLevel > 4:
        return

    pageId = insertPageIfNotExists(pageUrl)
    soup = BeautifulSoup(
        urlopen("http://en.wikipedia.org{}".format(pageUrl)), "html.parser"
    )
    # Hrefs that start with /wiki/ and contain no colon.
    article_href = re.compile("^(/wiki/)((?!:).)*$")
    targets = [anchor.attrs["href"] for anchor in soup.findAll("a", href=article_href)]

    for target in targets:
        linkId = insertPageIfNotExists(target)
        insertLink(pageId, linkId)
        if pageHasLinks(linkId):
            continue
        # No outgoing links recorded for this page yet, so descend into it.
        print("PAGE HAS NO LINKS: {}".format(target))
        pages.append(target)
        getLinks(target, recursionLevel + 1, pages)


# Release the cursor and connection even when the crawl raises (network error,
# SQL error, interrupt), matching the try/finally used by the other crawler.
try:
    getLinks("/wiki/Kevin_Bacon", 0, loadPages())
finally:
    cur.close()
    conn.close()

In [None]:
import smtplib
from email.mime.text import MIMEText

msg = MIMEText("The body of the email is here")

msg["Subject"] = "An Email Alert"
msg["From"] = "ryan@pythonscraping.com"
msg["To"] = "webmaster@pythonscraping.com"

# The context manager issues QUIT and closes the socket even if sending fails.
with smtplib.SMTP("localhost") as s:
    s.send_message(msg)

In [None]:
import smtplib
from email.mime.text import MIMEText
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time


def sendMail(subject, body):
    """Send a plain-text email through the SMTP server on ``localhost``.

    Args:
        subject: Value for the ``Subject`` header.
        body: Message text.

    The sender and recipient addresses are fixed inside the function.
    """
    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = "christmas_alerts@pythonscraping.com"
    msg["To"] = "ryan@pythonscraping.com"

    # The context manager issues QUIT and closes the socket even if
    # send_message raises, so a failed send does not leak the connection.
    with smtplib.SMTP("localhost") as s:
        s.send_message(msg)


# Poll once an hour until the page's answer element stops saying "NO".
while True:
    bs = BeautifulSoup(urlopen("https://isitchristmas.com/"), "html.parser")
    # NOTE(review): raises AttributeError if the page has no <a id="answer">.
    if bs.find("a", {"id": "answer"}).attrs["title"] != "NO":
        break
    print("It is not Christmas yet.")
    time.sleep(3600)

# The message names the site that was actually polled (the original text
# pointed at a different domain).
sendMail(
    "It's Christmas!",
    "According to https://isitchristmas.com, it is Christmas!",
)