In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import concurrent.futures
import time

def extract_email_addresses_from_page(url, timeout=10):
    """Fetch a single page and return the unique email addresses found on it.

    Addresses are collected from the page's text and from ``mailto:`` links,
    because ``get_text()`` does not include ``href`` attribute values.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        A list of distinct email addresses in order of first appearance, or
        an empty list when the request fails.
    """
    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept '|'.
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data from {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    candidates = re.findall(email_pattern, soup.get_text())

    # Addresses that only appear as <a href="mailto:..."> are not in the text.
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        if href.lower().startswith('mailto:'):
            candidates.extend(re.findall(email_pattern, href))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(candidates))

def scrape_email_addresses(url, max_depth=2, current_depth=0, timeout=10, visited=None):
    """Crawl from ``url`` and return the email addresses of the first page that has any.

    The page at ``url`` is searched first (text and ``mailto:`` links). If it
    has no address, its links are followed depth-first until a page with at
    least one address is found or ``max_depth`` is exceeded.

    Args:
        url: Page to start from.
        max_depth: Deepest link level that is still fetched (the start page
            is level 0).
        current_depth: Level of ``url``; used by the recursion.
        timeout: Seconds to wait for each HTTP response.
        visited: Mapping of URL to the shallowest depth at which it has
            already been fetched in this crawl. Callers normally leave this
            as ``None``; the recursion shares one mapping per crawl.

    Returns:
        A list of distinct email addresses from the first page that contains
        any, or an empty list when none is found or the fetch fails.
    """
    if current_depth > max_depth:
        return []

    if visited is None:
        visited = {}
    # A page already fetched at the same or a shallower depth had no address
    # (otherwise the crawl would have stopped) and its links were, or are
    # being, explored at least as deeply, so fetching it again is wasted work.
    seen_depth = visited.get(url)
    if seen_depth is not None and seen_depth <= current_depth:
        return []
    visited[url] = current_depth

    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept '|'.
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data from {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    found = re.findall(email_pattern, soup.get_text())
    # get_text() omits href values, so read mailto: links explicitly.
    for link in links:
        href = link['href'].strip()
        if href.lower().startswith('mailto:'):
            found.extend(re.findall(email_pattern, href))
    email_addresses = list(dict.fromkeys(found))

    if email_addresses:
        print(f"Scraped {url}, Email Addresses Found: {email_addresses}")
        return email_addresses

    print(f"No email address found on {url}")
    time.sleep(1)  # wait one second so the crawl does not overload the site

    # Children of a page at max_depth would be rejected by the depth check.
    if current_depth >= max_depth:
        return []

    for link in links:
        try:
            absolute_link = urljoin(url, link['href'])
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip this link.
            continue
        # Fragments point into the same document; drop them before comparing.
        absolute_link = absolute_link.split('#', 1)[0]
        # Only http(s) targets can be fetched; skip mailto:, javascript:, tel:, ...
        if not absolute_link.lower().startswith(('http://', 'https://')):
            continue
        extracted_emails = scrape_email_addresses(
            absolute_link, max_depth, current_depth + 1, timeout, visited
        )
        if extracted_emails:
            return extracted_emails  # stop as soon as any address is found

    return []

def main():
    """Crawl every URL listed in ``test.csv`` and write the emails found back to it.

    The CSV must have a ``URL`` column. Each distinct URL is crawled once on
    a thread pool; the comma-separated addresses found for it are stored in
    the ``メールアドレス`` column of every row with that URL, and the file is
    overwritten in place.
    """
    # Path of the CSV file that holds the URLs (read and rewritten in place).
    csv_file_path = 'test.csv'
    url_column = 'URL'  # replace with the actual column name in the CSV file
    email_column = 'メールアドレス'

    df = pd.read_csv(csv_file_path)

    # Make sure the output column exists and can hold strings: a column that
    # was read back as all-empty is float-typed, and assigning text into it
    # is deprecated in recent pandas.
    if email_column in df.columns:
        df[email_column] = df[email_column].astype(object)
    else:
        df[email_column] = None

    # Crawl each distinct, non-empty URL only once.
    unique_urls = df[url_column].dropna().unique()

    # Crawl on multiple threads and collect the email addresses.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(scrape_email_addresses, url): url for url in unique_urls}

        for future in concurrent.futures.as_completed(futures):
            url = futures[future]
            try:
                email_addresses = future.result()
            except Exception as e:
                # One failing crawl must not discard the results of the others.
                print(f"Error occurred while fetching data from {url}: {e}")
                email_addresses = []
            df.loc[df[url_column] == url, email_column] = ', '.join(email_addresses)

    # Save the DataFrame back to the CSV file.
    df.to_csv(csv_file_path, index=False)

# Start the crawl only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import concurrent.futures
import time

def extract_email_addresses_from_page(url, timeout=10):
    """Fetch a single page and return the unique email addresses found on it.

    Addresses are collected from the page's text and from ``mailto:`` links,
    because ``get_text()`` does not include ``href`` attribute values.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        A list of distinct email addresses in order of first appearance, or
        an empty list when the request fails.
    """
    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept '|'.
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data from {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    candidates = re.findall(email_pattern, soup.get_text())

    # Addresses that only appear as <a href="mailto:..."> are not in the text.
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        if href.lower().startswith('mailto:'):
            candidates.extend(re.findall(email_pattern, href))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(candidates))

def scrape_email_addresses(url, max_depth=2, current_depth=0, timeout=10, visited=None):
    """Crawl from ``url`` and return the email addresses of the first page that has any.

    The page at ``url`` is searched first (text and ``mailto:`` links). If it
    has no address, its links are followed depth-first until a page with at
    least one address is found or ``max_depth`` is exceeded.

    Args:
        url: Page to start from.
        max_depth: Deepest link level that is still fetched (the start page
            is level 0).
        current_depth: Level of ``url``; used by the recursion.
        timeout: Seconds to wait for each HTTP response.
        visited: Mapping of URL to the shallowest depth at which it has
            already been fetched in this crawl. Callers normally leave this
            as ``None``; the recursion shares one mapping per crawl.

    Returns:
        A list of distinct email addresses from the first page that contains
        any, or an empty list when none is found or the fetch fails.
    """
    if current_depth > max_depth:
        return []

    if visited is None:
        visited = {}
    # A page already fetched at the same or a shallower depth had no address
    # (otherwise the crawl would have stopped) and its links were, or are
    # being, explored at least as deeply, so fetching it again is wasted work.
    seen_depth = visited.get(url)
    if seen_depth is not None and seen_depth <= current_depth:
        return []
    visited[url] = current_depth

    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept '|'.
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data from {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    found = re.findall(email_pattern, soup.get_text())
    # get_text() omits href values, so read mailto: links explicitly.
    for link in links:
        href = link['href'].strip()
        if href.lower().startswith('mailto:'):
            found.extend(re.findall(email_pattern, href))
    email_addresses = list(dict.fromkeys(found))

    if email_addresses:
        print(f"Scraped {url}, Email Addresses Found: {email_addresses}")
        return email_addresses

    print(f"No email address found on {url}")
    time.sleep(1)  # wait one second so the crawl does not overload the site

    # Children of a page at max_depth would be rejected by the depth check.
    if current_depth >= max_depth:
        return []

    for link in links:
        try:
            absolute_link = urljoin(url, link['href'])
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip this link.
            continue
        # Fragments point into the same document; drop them before comparing.
        absolute_link = absolute_link.split('#', 1)[0]
        # Only http(s) targets can be fetched; skip mailto:, javascript:, tel:, ...
        if not absolute_link.lower().startswith(('http://', 'https://')):
            continue
        extracted_emails = scrape_email_addresses(
            absolute_link, max_depth, current_depth + 1, timeout, visited
        )
        if extracted_emails:
            return extracted_emails  # stop as soon as any address is found

    return []

def main():
    """Crawl every URL listed in ``test.csv`` and write the emails found back to it.

    The CSV must have a ``URL`` column. Each distinct URL is crawled once on
    a thread pool; the comma-separated addresses found for it are stored in
    the ``メールアドレス`` column of every row with that URL, and the file is
    overwritten in place.
    """
    # Path of the CSV file that holds the URLs (read and rewritten in place).
    csv_file_path = 'test.csv'
    url_column = 'URL'  # replace with the actual column name in the CSV file
    email_column = 'メールアドレス'

    df = pd.read_csv(csv_file_path)

    # Make sure the output column exists and can hold strings: a column that
    # was read back as all-empty is float-typed, and assigning text into it
    # is deprecated in recent pandas.
    if email_column in df.columns:
        df[email_column] = df[email_column].astype(object)
    else:
        df[email_column] = None

    # Crawl each distinct, non-empty URL only once.
    unique_urls = df[url_column].dropna().unique()

    # Crawl on multiple threads and collect the email addresses.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(scrape_email_addresses, url): url for url in unique_urls}

        for future in concurrent.futures.as_completed(futures):
            url = futures[future]
            try:
                email_addresses = future.result()
            except Exception as e:
                # One failing crawl must not discard the results of the others.
                print(f"Error occurred while fetching data from {url}: {e}")
                email_addresses = []
            df.loc[df[url_column] == url, email_column] = ', '.join(email_addresses)

    # Save the DataFrame back to the CSV file.
    df.to_csv(csv_file_path, index=False)

# Start the crawl only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
