Run this first: pip install requests beautifulsoup4

In [None]:
import csv
import time
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [None]:
# Function to save webpage content to a CSV file
def save_to_csv(title, content, url, csv_file, org_id=None):
    """Append one scraped page to ``csv_file`` as a single CSV row.

    The row is written as ``[organization id, title, content, url]``.

    Args:
        title: Page title.
        content: Text content of the page.
        url: URL the page was fetched from.
        csv_file: Path of the CSV file; it is opened in append mode, so
            existing rows are kept.
        org_id: Value for the first column. When omitted, the module-level
            ``orgId`` variable is used, which keeps existing four-argument
            calls working.

    Raises:
        NameError: If ``org_id`` is omitted and no module-level ``orgId``
            has been defined.
        OSError: If the file cannot be opened or written.
    """
    if org_id is None:
        # Backward-compatible fallback to the global the script sets up.
        org_id = orgId

    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow([org_id, title, content, url])

# Function to scrape and save webpage content
def scrape_and_save(url, csv_file, timeout=30):
    """Fetch ``url``, extract its title and text, and append them to ``csv_file``.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    reported with ``print`` and nothing is written for that URL.

    Args:
        url: Address of the page to download.
        csv_file: Path of the CSV file the row is appended to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the crawl indefinitely.
    """
    headers = {
        'User-Agent': 'Chrome/90.0.4430.93'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures so error pages are not stored
        # in the CSV as if they were real content.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to scrape {url}: {e}")
        return

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # ``soup.title.string`` is None when <title> is empty or contains nested
    # markup, so guard against that as well as against a missing <title>.
    raw_title = soup.title.string if soup.title else None
    title = raw_title.strip() if raw_title else ""
    if not title:
        title = "No Title"

    # Extract the text content, trim it, and replace newlines with spaces so
    # the page occupies a single logical line in the CSV.
    content = soup.get_text()
    content = ' '.join(content.strip().split('\n'))

    # Save data to CSV file
    save_to_csv(title, content, url, csv_file)
# Function to crawl a website
def crawl_website(original_url, start_url, depth, csv_file, visited=None):
    """Scrape ``start_url`` and, while ``depth`` allows, the pages it links to.

    Only URLs on the same domain as ``original_url`` are scraped; others are
    reported with ``print`` and skipped. Each URL is processed at most once
    per ``visited`` set.

    Args:
        original_url: URL whose domain bounds the crawl.
        start_url: URL to scrape in this call.
        depth: Number of link levels still to follow; with 0 only
            ``start_url`` itself is scraped.
        csv_file: Path of the CSV file rows are appended to.
        visited: Set of URLs already handled; a new set is created when
            omitted. It is shared with the recursive calls and mutated.
    """
    if visited is None:
        visited = set()

    if start_url in visited:
        return

    visited.add(start_url)

    # Check the domain before sleeping so that off-site links do not each
    # cost a one-second delay.
    if not is_same_domain(original_url, start_url):
        print(f"Skipping {start_url}")
        return

    try:
        # Pause between requests.
        time.sleep(1)

        print("Crawling URL:", start_url)
        scrape_and_save(start_url, csv_file)

        if depth <= 0:
            return

        # NOTE: the page is requested a second time here to collect its
        # links; scrape_and_save is called only for its CSV side effect.
        headers = {
            'User-Agent': 'Chrome/90.0.4430.93'
        }
        response = requests.get(start_url, headers=headers, timeout=30)
        # Do not harvest links from HTTP error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a', href=True):
            try:
                # hrefs are frequently relative ("/about"); resolve them
                # against the page URL, otherwise they have no domain and
                # would always be skipped. Dropping the #fragment keeps
                # anchors within one page from counting as separate pages.
                absolute_url = urldefrag(urljoin(response.url, link['href'])).url
            except ValueError:
                # Malformed href (e.g. unbalanced IPv6 brackets): ignore it.
                continue
            # Recursively crawl the links
            crawl_website(original_url, absolute_url, depth - 1, csv_file, visited)
    except requests.exceptions.RequestException as e:
        print(f"Failed to crawl {start_url}: {e}")

def is_same_domain(original_url, new_url):
    """Return True when both URLs point at the same host.

    Hosts are compared via ``urlparse(...).hostname``, which is lower-cased
    and excludes any port or ``user:password@`` prefix, so the comparison is
    case-insensitive and ignores an explicit port.

    Args:
        original_url: Reference URL.
        new_url: URL to compare against the reference.

    Returns:
        True if the hosts match, False if they differ or if either URL
        cannot be parsed (the parse error is printed). A URL without a host
        (for example a relative path) is treated as having an empty host.
    """
    try:
        original_host = urlparse(original_url).hostname or ""
        new_host = urlparse(new_url).hostname or ""
    except (ValueError, TypeError, AttributeError) as e:
        # urlparse raises ValueError for malformed netlocs; the other two
        # cover non-string arguments.
        print(f"Error parsing URLs: {e}")
        return False
    return original_host == new_host

In [None]:
# Crawl settings. A depth of 0 scrapes only the listed pages themselves and
# follows none of their links.
orgId = 'mira'
csv_file = "industry_fertility.csv"
depth = 0

# Seed pages; each one also defines the domain its own crawl is limited to.
start_urls = [
    "https://www.healthline.com/health/fertility/fertility-monitor",
    "https://kegg.tech/blogs/learn/mira-vs-kegg-what-s-the-best-fertility-tracker",
    "https://www.mmnfp.com/mira",
    "https://www.innerbody.com/mira-review",
    "https://casadesante.com/blogs/wellness/ava-vs-mira-vs-ovusense",
    "https://finvsfin.com/ava-vs-mira-vs-ovusense/",
    "https://www.healthline.com/health/birth-control/the-daysy-birth-control-and-fertility-tracker-review-2022",
    "https://www.health.com/best-fertility-monitors-6951641",
    "https://payitforwardfertility.org/best-fertility-monitor/",
]

# Start from a fresh CSV that contains only the header row; the scraped
# rows are appended to it afterwards.
with open(csv_file, 'w', newline='', encoding='utf-8') as header_file:
    csv.writer(header_file).writerow(
        ['organization_id', 'title', 'content', 'url']
    )

# Every seed is both the domain reference and the first page of its crawl.
for start_url in start_urls:
    crawl_website(start_url, start_url, depth, csv_file)