In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to scrape CBR/Anime
def scrape_cbr_anime():
    print("CBR/Anime:")
    base_url = 'https://www.cbr.com/'

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://www.cbr.com/category/anime/'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the titles, links, and dates of the articles
        article_blocks = soup.find_all('div', class_='w-display-card-content')  # Replace with the actual HTML element and class name
        for block in article_blocks:
            title_element = block.find('h5', class_='display-card-title').find('a')
            title = title_element.text.strip()
            relative_link = title_element['href']  # Get the relative link
            full_link = base_url + relative_link  # Add the base URL to the relative link
            date_element = block.find('time', class_='display-card-date')['datetime']
            date = date_element.split('T')[0]  # Extract date part from the datetime attribute
            print(f"Title: {title}")
            print(f"Link: {full_link}")  # Print the full link
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

# Function to scrape Hashnode/Data Science
def scrape_hashnode_data_science():
    print("\nHashnode/Data Science:")
    base_url = 'https://hashnode.com/n/data-science'

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://hashnode.com/n/data-science'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the post titles, URLs, and dates
        post_sections = soup.find_all('section', class_='flex flex-col gap-2 sm:gap-4')  # Replace with the actual HTML element and class name
        for section in post_sections:
            title_element = section.find('h1', class_='font-heading text-base sm:text-xl font-semibold sm:font-bold text-slate-700 dark:text-slate-200 hn-break-words cursor-pointer')
            title = title_element.text.strip()
            link_element = title_element.find_parent('a', href=True)
            link = link_element['href']
            date_element = section.find('p', class_='text-sm text-slate-500 dark:text-slate-400 font-normal')
            date = date_element.text.strip()
            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)


# Function to scrape Interesting Engineering
def scrape_interesting_engineering():
    print("\nInteresting Engineering:")

    # Base URL to prepend to relative links
    base_url = 'https://interestingengineering.com/'

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://interestingengineering.com/news/page/1'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the article titles, URLs, and date information
        article_blocks = soup.find_all('div', class_='Category_result__description__iz_rw')  # Replace with the actual HTML element and class name
        for block in article_blocks:
            title_link_element = block.find('a', href=True)
            title = title_link_element.find('h2', class_='Category_result__header__HQgVv').text.strip()
            link = urljoin(base_url, title_link_element['href'])  # Prepend base URL to relative links
            date_element = block.find('span', class_='Category_result__author__publishTime__nwLBU')
            date = date_element.text.strip()
            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)


# Function to scrape Wired/Science
def scrape_wired_science():
    print("\nWired/Science:")
    base_url = 'https://www.wired.com/category/science/'

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://www.wired.com/category/science/'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the article titles, URLs, and categories
        article_blocks = soup.find_all('div', class_='SummaryItemContent-eiDYMl')  # Replace with the actual HTML element and class name
        for block in article_blocks:
            title_element = block.find('h3', class_='SummaryItemHedBase-hiFYpQ')
            title = title_element.text.strip()
            link_element = block.find('a', class_='SummaryItemHedLink-civMjp')
            relative_link = link_element['href']  # Get the relative link
            full_link = urljoin(base_url, relative_link)  # Add the base URL to the relative link
            category_element = block.find('span', class_='RubricName-fVtemz')
            category = category_element.text.strip() if category_element else "N/A"  # Handle missing category
            print(f"Title: {title}")
            print(f"Link: {full_link}")  # Print the full link
            print(f"Category: {category}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

# Function to scrape TechCrunch
def scrape_techcrunch_startups():
    print("\nTechCrunch/Startups:")
    base_url = 'https://techcrunch.com/category/startups/'

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://techcrunch.com/category/startups/'

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the titles, links, and dates of the blog posts
        post_blocks = soup.find_all('div', class_='post-block')  # Replace with the actual HTML element and class name
        for block in post_blocks:
            title = block.find('h2', class_='post-block__title').text.strip()
            link = block.find('a', class_='post-block__title__link')['href']
            date_element = block.find('time')
            date = date_element.text  # Extract date part from the datetime attribute
            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)


# Call the scraping functions for each site
scrape_cbr_anime()
scrape_hashnode_data_science()
scrape_interesting_engineering()
scrape_wired_science()
scrape_techcrunch_startups()


CBR/Anime:
Title: The 10 Best My Hero Academia Villains & Their Birthdays, Height, and Zodiac Signs
Link: https://www.cbr.com//mha-villains-birthday-height-zodiac-signs/
Date: 2023-09-21

Title: 10 Anime Characters Who Are More Famous Than Their Actual Series
Link: https://www.cbr.com//anime-characters-more-iconic-than-their-series/
Date: 2023-09-20

Title: 10 Anime Villains Who Don't Deserve Their Bad Reputation
Link: https://www.cbr.com//anime-villains-who-deserve-better-reputations/
Date: 2023-09-20

Title: Crunchyroll Unveils Its Autumn 2023 Anime Lineup
Link: https://www.cbr.com//crunchyroll-autumn-2023-anime-lineup/
Date: 2023-09-20

Title: Dragon Ball Super Chapter 97 Recap & Spoilers: Rampaging Cell Max
Link: https://www.cbr.com//dbs-97-spoilers/
Date: 2023-09-20

Title: 10 Sci-Fi Anime That Are Just as Good as Star Wars (If Not Better)
Link: https://www.cbr.com//sci-fi-anime-like-star-wars/
Date: 2023-09-20

Title: 10 Most Controversial Anime Of All Time, Ranked
Link: https://

Title: Here are the 6 finalists of Startup Battlefield at Disrupt 2023
Link: https://techcrunch.com/2023/09/20/here-are-the-6-finalists-of-startup-battlefield-at-disrupt-2023/
Date: 
		Sep 20, 2023	

Title: Auctoria uses generative AI to create video game models
Link: https://techcrunch.com/2023/09/20/auctoria-uses-generative-ai-to-create-video-game-models/
Date: 
		Sep 20, 2023	

Title: PureSpace prevents spoiled produce by removing ripening gas
Link: https://techcrunch.com/2023/09/20/purespace-prevents-spoiled-produce-by-removing-ripening-gas/
Date: 
		Sep 20, 2023	

Title: MakersHub deciphers accounts payable data so construction companies don’t have to
Link: https://techcrunch.com/2023/09/20/makershub-accounts-payable-data-construction-battlefield/
Date: 
		Sep 20, 2023	

Title: Agtech leaders dish on untapped and overlooked opportunities for founders
Link: https://techcrunch.com/2023/09/20/agtech-leaders-dish-on-untapped-and-overlooked-opportunities-for-founders/
Date: 
		Sep 20, 

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # Import urljoin

def scrape_cbr_anime():
    print("--------------------------------")
    print("CBR/Anime")
    print("--------------------------------")

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://www.cbr.com/category/anime/'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the titles, links, and dates of the articles
        article_blocks = soup.find_all('div', class_='w-display-card-content')  # Replace with the actual HTML element and class name
        for block in article_blocks:
            title_element = block.find('h5', class_='display-card-title').find('a')
            title = title_element.text.strip()
            relative_link = title_element['href']  # Get the relative link
            full_link = urljoin(url, relative_link)  # Prepend base URL to relative links
            date_element = block.find('time', class_='display-card-date')['datetime']
            date = date_element.split('T')[0]  # Extract date part from the datetime attribute
            print(f"Title: {title}")
            print(f"Link: {full_link}")  # Use the full link with the base URL
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

def scrape_hashnode_data_science():
    print("--------------------------------")
    print("Hashnode/Data Science")
    print("--------------------------------")

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://hashnode.com/n/data-science'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the post titles, URLs, and dates
        post_sections = soup.find_all('section', class_='flex flex-col gap-2 sm:gap-4')  # Replace with the actual HTML element and class name
        for section in post_sections:
            title_element = section.find('h1', class_='font-heading text-base sm:text-xl font-semibold sm:font-bold text-slate-700 dark:text-slate-200 hn-break-words cursor-pointer')
            title = title_element.text.strip()
            link_element = title_element.find_parent('a', href=True)
            link = link_element['href']
            date_element = section.find('p', class_='text-sm text-slate-500 dark:text-slate-400 font-normal')
            date = date_element.text.strip()
            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

# Define similar functions for the other sites (TechCrunch, Interesting Engineering, Wired/Science)

def scrape_techcrunch_startups():
    print("--------------------------------")
    print("TechCrunch/Startups")
    print("--------------------------------")

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://techcrunch.com/category/startups/'

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the titles, links, and dates of the blog posts
        post_blocks = soup.find_all('div', class_='post-block')  # Replace with the actual HTML element and class name
        for block in post_blocks:
            title = block.find('h2', class_='post-block__title').text.strip()
            link = block.find('a', class_='post-block__title__link')['href']
            date_element = block.find('time')
            date = date_element.text  # Extract date part from the datetime attribute
            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

def scrape_interesting_engineering():
    print("--------------------------------")
    print("Interesting Engineering")
    print("--------------------------------")

    # Base URL to prepend to relative links
    base_url = 'https://interestingengineering.com/'

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://interestingengineering.com/news/page/1'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the article titles, URLs, and date information
        article_blocks = soup.find_all('div', class_='Category_result__description__iz_rw')  # Replace with the actual HTML element and class name
        for block in article_blocks:
            title_link_element = block.find('a', href=True)
            title = title_link_element.find('h2', class_='Category_result__header__HQgVv').text.strip()
            link = urljoin(base_url, title_link_element['href'])  # Prepend base URL to relative links
            author_element = block.find('a', class_='Category_result__author__name__In7jd')
            author = author_element.text.strip()
            date_element = block.find('span', class_='Category_result__author__publishTime__nwLBU')
            date = date_element.text.strip()
            print(f"Title: {title}")
            print(f"Link: {link}")
            print(f"Author: {author}")
            print(f"Date: {date}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

def scrape_wired_science():
    print("--------------------------------")
    print("Wired/Science")
    print("--------------------------------")

    # Replace 'your_url_here' with the actual URL of the website you want to scrape
    url = 'https://www.wired.com/category/science/'  # Replace with the actual URL

    # Send an HTTP GET request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find and print the article titles, URLs, and categories
        article_blocks = soup.find_all('div', class_='SummaryItemContent-eiDYMl')  # Replace with the actual HTML element and class name
        for block in article_blocks:
            title_element = block.find('h3', class_='SummaryItemHedBase-hiFYpQ')
            title = title_element.text.strip()
            relative_link = block.find('a', class_='SummaryItemHedLink-civMjp')['href']  # Get the relative link
            full_link = urljoin(url, relative_link)  # Prepend base URL to relative links
            category_element = block.find('span', class_='RubricName-fVtemz')
            category = category_element.text.strip() if category_element else "N/A"  # Handle missing category
            print(f"Title: {title}")
            print(f"Link: {full_link}")  # Use the full link with the base URL
            print(f"Category: {category}")
            print()  # Add an empty line for better readability
    else:
        print('Failed to retrieve the webpage. Status code:', response.status_code)

# Call the scraping functions for each site
scrape_cbr_anime()
scrape_hashnode_data_science()
scrape_interesting_engineering()
scrape_wired_science()
scrape_techcrunch_startups()


--------------------------------
CBR/Anime
--------------------------------
Title: 10 Best Anime You Can Watch Right Now & Where to Stream Them
Link: https://www.cbr.com/best-anime-available-right-now-where-to-watch/
Date: 2023-09-23

Title: 10 Magical Girl Anime With the Best Fight Scenes, Ranked
Link: https://www.cbr.com/magical-girl-anime-best-fight-scenes-ranked/
Date: 2023-09-23

Title: Megumi Fushiguro and His Adorable Shikigami Train Together in New Jujutsu Kaisen Collectible
Link: https://www.cbr.com/new-jjk-collectible-megumi-fushiguro-cute-shikigami-training/
Date: 2023-09-23

Title: The Eminence in Shadow Season 2 Simulcast of the English Dub Confirmed
Link: https://www.cbr.com/the-eminence-in-shadow-s2-simulcast-english-dub-confirmed/
Date: 2023-09-23

Title: 10 Lessons Dragon Ball Super Should Learn From DBGT
Link: https://www.cbr.com/lessons-dbs-should-learn-from-dbgt/
Date: 2023-09-23

Title: Butareba Anime Introduces Main Character in New Promotional Video
Link: https:/

Title: Disability tech startups kill the cynic in me
Link: https://techcrunch.com/2023/09/23/disability-tech-startup-battlefield/
Date: 
		Sep 23, 2023	

Title: Bay Area baby belly beholding Battlefield bounty
Link: https://techcrunch.com/2023/09/22/bay-area-baby-belly-beholding-battlefield-bounty/
Date: 
		Sep 22, 2023	

Title: Pitch Deck Teardown: Transcend’s $20M Series B deck
Link: https://techcrunch.com/2023/09/22/sample-series-b-pitch-deck-transcend/
Date: 
		Sep 22, 2023	

Title: TechCrunch+ Roundup: Prompt engineering, web3 gaming survey, how to spend $10K on paid ads
Link: https://techcrunch.com/2023/09/22/techcrunch-roundup-prompt-engineering-web3-gaming-survey-how-to-spend-10k-on-paid-ads/
Date: 
		Sep 22, 2023	

Title: South Africa’s FinanceGPT simplifies financial analysis, set to interface in local languages
Link: https://techcrunch.com/2023/09/22/financegpt-generative-ai-tools-for-financial-analysis/
Date: 
		Sep 22, 2023	

Title: HME Square aims to measure glucose painl