In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
import os
import re
import time
import requests

In [2]:
def read_links(file_path):
    """Load the article links stored one per line in a text file.

    Args:
        file_path: Path of the text file that holds one link per line.

    Returns:
        A list of links with surrounding whitespace (including the
        trailing newline) removed. Blank lines are skipped so that an
        empty string is never handed on as a URL.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # Drop empty entries (e.g. a trailing blank line at the end of the file)
        return [link for link in stripped_lines if link]

In [3]:
def get_title(soup):
    """Return the page title without the site-name suffix.

    Args:
        soup: Parsed page exposing a ``title`` element whose ``string``
            attribute holds the title text (a BeautifulSoup document in
            this notebook).

    Returns:
        The title text, stripped of surrounding whitespace, with a
        trailing " - MachineLearningMastery.com" removed when present.
        An empty string is returned when the page has no usable title.
    """
    site_suffix = " - MachineLearningMastery.com"

    title_tag = soup.title
    if title_tag is None:
        return ""

    title = title_tag.string
    if title is None:
        # No single text node inside <title>; nothing reliable to return
        return ""

    title = str(title).strip()
    # Only cut the suffix when it is really there, instead of blindly
    # dropping a fixed number of trailing characters.
    if title.endswith(site_suffix):
        title = title[:-len(site_suffix)]
    return title

In [4]:
def get_published_date(soup):
    """Return the publication date recorded in the page's meta tags.

    Args:
        soup: Parsed page that supports
            ``find("meta", property="article:published_time")``.

    Returns:
        The first 10 characters of the meta tag's ``content`` attribute
        (presumably the ``YYYY-MM-DD`` part of an ISO timestamp; confirm
        against real pages). An empty string is returned when the tag or
        its ``content`` attribute is missing.
    """
    meta_tag = soup.find("meta", property="article:published_time")
    if meta_tag is None:
        # Page carries no publication metadata; do not crash the batch
        return ""

    timestamp = meta_tag.get("content") or ""
    # Keep only the leading date portion of the timestamp
    return timestamp[:10]

In [5]:
def find_advertisement(soup):
    """Collect the elements that wrap advertisements on a page.

    Two kinds of advertisement are looked up:

    * ``<button>`` elements whose ``data-leadbox-domain`` attribute is
      ``machinelearningmastery.lpages.co``; the enclosing ``<center>``
      ancestor is reported for each.
    * ``<a>`` elements linking to ``/deep-learning-for-computer-vision/``;
      the ancestor with class ``widget_text awac-wrapper`` is reported
      for each.

    Args:
        soup: Parsed page to search.

    Returns:
        A list with one entry per matching button followed by one entry
        per matching link. Each entry is whatever ``find_parent``
        returned for that element, so callers should be prepared for
        ``None`` entries.
    """
    # Lead-box buttons -> their surrounding <center> block
    leadbox_buttons = soup.find_all(
        'button', attrs={'data-leadbox-domain': 'machinelearningmastery.lpages.co'})
    ad_blocks = [btn.find_parent('center') for btn in leadbox_buttons]

    # Book promotion links -> their surrounding widget wrapper
    book_links = soup.find_all(
        'a', attrs={'href': "/deep-learning-for-computer-vision/"})
    ad_blocks.extend(
        link.find_parent(class_='widget_text awac-wrapper') for link in book_links)

    return ad_blocks

In [6]:
def custom_selection(tag):
    """Filter predicate that keeps paragraphs, list items and h2 headings.

    Args:
        tag: Element exposing a ``name`` attribute.

    Returns:
        True when ``tag.name`` is "p", "li" or "h2", otherwise False.
    """
    return tag.name in ("p", "li", "h2")

In [7]:
def format_content(content):
    """Turn a sequence of selected elements into one plain-text string.

    Each element contributes its text on a separate line. List items
    ("li") are prefixed with "- " and second-level headings ("h2") with
    "# ". Processing stops at the first element whose text is exactly
    "Comment * "; that element and everything after it are left out.

    Args:
        content: Sequence of elements exposing ``name`` and ``get_text()``.

    Returns:
        The formatted lines joined with newline characters.
    """
    prefix_by_tag = {"li": "- ", "h2": "# "}

    lines = []
    for element in content:
        text = element.get_text()
        if text == "Comment * ":
            # Reached the comment-form marker: ignore the rest
            break
        lines.append(prefix_by_tag.get(element.name, "") + text)

    return "\n".join(lines)

In [34]:
def get_content(soup):
    """Extract the readable article text from a parsed page.

    The element with id "content" and class "col-full" is taken as the
    article container. Comment sections, advertisement blocks and
    "crp_related" related-article blocks are removed, then the remaining
    paragraphs, list items and h2 headings are formatted as text.

    Note:
        The unwanted elements are removed with ``decompose()``, so
        ``soup`` itself is modified in place.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        The formatted article text, or an empty string when the page has
        no content container.
    """
    base_content = soup.find(class_="col-full", id="content")
    if base_content is None:
        # Not an article page (or the layout changed): nothing to extract
        return ""

    # Drop the reader comments section
    for div in base_content.find_all("div", id='comments'):
        div.decompose()

    # Drop advertisement blocks; entries may be None when an
    # advertisement has no matching wrapper element
    for advertisement in find_advertisement(soup):
        if advertisement is not None:
            advertisement.decompose()

    # Drop the related-articles widgets
    for related_article in base_content.find_all("div", class_="crp_related"):
        related_article.decompose()

    # Keep only paragraphs, list items and h2 headings
    content = base_content.find_all(custom_selection)
    return format_content(content)

In [9]:
def get_code_block(soup):
    """Gather the plain-text code listings of a page into one string.

    Every ``<textarea>`` with class "urvanov-syntax-highlighter-plain"
    contributes its text. A leading "..." is cut from a listing when
    present (a trailing "..." is kept as is).

    Args:
        soup: Parsed page to search.

    Returns:
        The listings joined by newlines and wrapped between a line of
        three single quotes at the start and at the end.
    """
    snippets = []
    for area in soup.find_all('textarea', class_="urvanov-syntax-highlighter-plain"):
        snippet = area.get_text()
        # Remove the leading "..." marker if the listing starts with one
        snippets.append(snippet[3:] if snippet.startswith("...") else snippet)

    return "'''\n" + "\n".join(snippets) + "\n'''"

In [10]:
def get_data_from_url(urls, delay=0):
    """Open every URL in an Edge browser and extract its article data.

    Args:
        urls: Sequence of page URLs to visit, in order.
        delay: Seconds to sleep after each page has been processed.

    Returns:
        A list with one ``(title, url, date, content, code_block)`` tuple
        per URL, in the same order as ``urls``.
    """
    dr = webdriver.Edge()
    meta_data = []
    try:
        for i, url in enumerate(urls):
            print(f"Running url {i}")
            dr.get(url)
            soup = BeautifulSoup(dr.page_source, 'html.parser')

            title = get_title(soup)
            date = get_published_date(soup)
            # get_content strips elements from `soup`; keep this call order
            content = get_content(soup)
            code_block = get_code_block(soup)

            meta_data.append((title, url, date, content, code_block))
            time.sleep(delay)
    finally:
        # Always shut the browser down, even when a page fails midway,
        # so no orphaned driver/browser process is left behind.
        dr.quit()

    return meta_data

In [11]:
def write_data_to_file(meta_data, base_path):
    """Write every scraped record to its own numbered text file.

    Record number ``i`` is stored in ``<base_path>/<i>.txt`` with the
    title, URL, date, content and code block each followed by a newline.

    Args:
        meta_data: Sequence of ``(title, url, date, content, code_block)``
            tuples of strings.
        base_path: Existing directory that receives the files.
    """
    for i, (title, url, date, content, code_block) in enumerate(meta_data):
        file_path = os.path.join(base_path, f"{i}.txt")
        # Scraped text contains non-ASCII characters; write UTF-8
        # explicitly instead of relying on the platform default encoding.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write("\n".join((title, url, date, content, code_block)) + "\n")

In [43]:
# Show how many links were read. `links` is assigned in the batch-scraping
# cell further down, so that cell has to be executed before this one.
len(links)

1396

In [55]:
# Scrape one batch of links and store the results under data/data<batch>/.
# Change `batch` to process another slice of links.txt.
batch = 10
BATCH_SIZE = 100  # number of links handled per batch
output_dir = f"data/data{batch}/"

os.makedirs(output_dir, exist_ok=True)
links = read_links("links.txt")

# Slice out the links that belong to this batch
batch_start = batch * BATCH_SIZE
meta_data = get_data_from_url(links[batch_start:batch_start + BATCH_SIZE])
write_data_to_file(meta_data, output_dir)

Running url 0
Running url 1
Running url 2
Running url 3
Running url 4
Running url 5
Running url 6
Running url 7
Running url 8
Running url 9
Running url 10
Running url 11
Running url 12
Running url 13
Running url 14
Running url 15
Running url 16
Running url 17
Running url 18
Running url 19
Running url 20
Running url 21
Running url 22
Running url 23
Running url 24
Running url 25
Running url 26
Running url 27
Running url 28
Running url 29
Running url 30
Running url 31
Running url 32
Running url 33
Running url 34
Running url 35
Running url 36
Running url 37
Running url 38
Running url 39
Running url 40
Running url 41
Running url 42
Running url 43
Running url 44
Running url 45
Running url 46
Running url 47
Running url 48
Running url 49
Running url 50
Running url 51
Running url 52
Running url 53
Running url 54
Running url 55
Running url 56
Running url 57
Running url 58
Running url 59
Running url 60
Running url 61
Running url 62
Running url 63
Running url 64
Running url 65
Running url 66
Runni

In [27]:
# Debug helper: fetch a single article and dump its prettified HTML to
# test.html so the page structure can be inspected by hand.
dr = webdriver.Edge()
try:
    dr.get("https://machinelearningmastery.com/introduction-to-1x1-convolutions-to-reduce-the-complexity-of-convolutional-neural-networks/")
    soup = BeautifulSoup(dr.page_source, 'html.parser')
finally:
    # Close the browser even if loading or parsing the page fails
    dr.quit()

# The page contains non-ASCII characters; write UTF-8 explicitly rather
# than relying on the platform default encoding.
with open("test.html", 'w', encoding='utf-8') as f:
    f.write(soup.prettify())

In [33]:
# Manual check: run the extractor on the `soup` fetched in the debug cell
# above and print the resulting text. get_content() removes elements from
# `soup` in place, so re-fetch the page before re-running this on a clean tree.
content = get_content(soup)
print(content)

Pooling can be used to down sample the content of feature maps, reducing their width and height whilst maintaining their salient features.
A problem with deep convolutional neural networks is that the number of feature maps often increases with the depth of the network. This problem can result in a dramatic increase in the number of parameters and computation required when larger filter sizes are used, such as 5×5 and 7×7.
To address this problem, a 1×1 convolutional layer can be used that offers a channel-wise pooling, often called feature map pooling or a projection layer. This simple technique can be used for dimensionality reduction, decreasing the number of feature maps whilst retaining their salient features. It can also be used directly to create a one-to-one projection of the feature maps to pool features across channels or to increase the number of feature maps, such as after traditional pooling layers.
In this tutorial, you will discover how to use 1×1 filters to control the 