In [11]:
# Root URL of the site to crawl: the crawl starts here, and only links that
# begin with this prefix are treated as internal.
base_url = 'https://www.lfmgcpas.com/'

In [15]:
import requests
from bs4 import BeautifulSoup
import csv
import re
from urllib.parse import urljoin
import datetime

# Download a single web page and parse it into a BeautifulSoup tree
def scrape_page(url, timeout=10):
    """Fetch *url* and return its parsed HTML, or ``None`` on failure.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend when
        the server answers with HTTP 200; ``None`` for any other status code
        or when the request itself fails (connection error, timeout, ...).
    """
    # Browser-like User-Agent header sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
    }
    try:
        # Without a timeout a single unresponsive server stalls the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # One unreachable page must not abort the whole crawl; the caller
        # already treats None as "nothing to scrape here".
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')
    
def find_internal_links(soup, base_url):
    """Collect the crawlable same-site links found in a parsed page.

    Every ``<a href=...>`` is resolved against *base_url*. A link is kept only
    if it starts with *base_url*, does not point at an image/PDF file, and
    does not contain any of the excluded keywords.

    Args:
        soup: Parsed page exposing ``find_all('a', href=True)``.
        base_url: Site root used both to resolve relative hrefs and to decide
            whether a link is internal.

    Returns:
        A list of absolute URLs in document order, each listed once.
    """
    # File types that never contain scrapeable HTML.
    skipped_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.pdf')
    # Substrings marking pages that should not be crawled; '#' and '?' drop
    # fragment and query-string variants of a URL.
    excluded_keywords = ['subscribe', 'portal', 'reports', 'google', '#', '?']

    links = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        link = urljoin(base_url, anchor['href'])
        if link in seen:
            # Navigation menus repeat the same link many times per page.
            continue
        # Compare case-insensitively so '.PNG' / '.PDF' are skipped as well.
        if link.lower().endswith(skipped_extensions):
            continue
        if any(keyword in link for keyword in excluded_keywords):
            continue
        if link.startswith(base_url):
            seen.add(link)
            links.append(link)
    return links

def extract_emails_and_first_names(soup, scraped_emails):
    """Return the not-yet-scraped email addresses found in a parsed page.

    ``mailto:`` links are inspected first. Only when they yield nothing new
    is the text of the page's ``<p>`` elements searched instead. Addresses
    whose local part starts with ``hr@`` or ``sales@`` are ignored.

    Args:
        soup: Parsed page exposing ``find_all('a', href=True)`` and
            ``find_all('p')``.
        scraped_emails: Collection of addresses already recorded; it is only
            read, never modified.

    Returns:
        A list of ``(first_name, email)`` tuples in document order, each
        address listed once. ``first_name`` is the part of the address before
        the ``@``, cut at its first dot.
    """
    # Single definition of the pattern, shared by both lookup strategies.
    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal.
    email_pattern = re.compile(
        r'\b(?!hr@|sales@)[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )
    found = []
    seen = set()

    def remember(email):
        # Record an address unless it is already known globally or on this page.
        if email in scraped_emails or email in seen:
            return
        seen.add(email)
        found.append((email.split('@')[0].split('.')[0], email))

    scheme = 'mailto:'
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        # URI schemes are case-insensitive, so 'MAILTO:' must be accepted too.
        if not href or href[:len(scheme)].lower() != scheme:
            continue
        # Drop '?subject=...' style parameters so they never end up in the CSV.
        recipients = href[len(scheme):].split('?', 1)[0]
        # A mailto link may list several comma-separated recipients.
        for candidate in recipients.split(','):
            candidate = candidate.strip()
            # fullmatch: the whole candidate must be an address, not just a prefix.
            if email_pattern.fullmatch(candidate):
                remember(candidate)

    if not found:
        # Fallback: look for addresses written out in paragraph text.
        paragraph_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        for email in email_pattern.findall(paragraph_text):
            remember(email)

    return found

def main():
    """Crawl the site rooted at ``base_url`` and save found emails to a CSV.

    Starting from ``base_url``, pages are fetched breadth-first. Every
    internal link discovered is queued once, and each new email address is
    written as a ``First Name, Email`` row. The output file is named
    ``<company><YYYY-MM-DD>_emails.csv`` and is created in the current
    working directory.
    """
    # Local imports keep this entry point self-contained.
    from collections import deque
    from urllib.parse import urlparse

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    match = re.search(r"www\.(.*?)\.com", base_url)
    if match:
        company_name = match.group(1)
    else:
        # base_url is not of the form www.<name>.com: derive a filename-safe
        # label from the host instead of failing on an unbound variable.
        host = urlparse(base_url).netloc
        company_name = re.sub(r'[^A-Za-z0-9-]+', '_', host).strip('_') or 'site'
    csv_filename = f'{company_name}{today}_emails.csv'

    # URLs that have been queued at some point. Adding a URL here before it
    # is fetched means every page is requested at most once, even if the
    # request fails.
    visited_links = {base_url}
    links_to_visit = deque([base_url])  # FIFO queue with O(1) pops
    scraped_emails = set()  # Keep track of unique emails

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['First Name', 'Email'])

        while links_to_visit:
            link = links_to_visit.popleft()
            print(f"Scraping {link}")
            soup = scrape_page(link)
            if not soup:
                continue

            for found_link in find_internal_links(soup, base_url):
                if found_link not in visited_links:
                    visited_links.add(found_link)
                    links_to_visit.append(found_link)

            for first_name, email in extract_emails_and_first_names(soup, scraped_emails):
                if email not in scraped_emails:  # Guard against repeats within one page
                    writer.writerow([first_name, email])
                    scraped_emails.add(email)

                        
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Scraping https://www.lfmgcpas.com/
Scraping https://www.lfmgcpas.com/firmprofile.php
Scraping https://www.lfmgcpas.com/custom.php
Scraping https://www.lfmgcpas.com/custom2.php
Scraping https://www.lfmgcpas.com/custom3.php
Scraping https://www.lfmgcpas.com/client-reviews.php
Scraping https://www.lfmgcpas.com/services.php
Scraping https://www.lfmgcpas.com/custom5.php
Scraping https://www.lfmgcpas.com/taxprepplan.php
Scraping https://www.lfmgcpas.com/estate.php
Scraping https://www.lfmgcpas.com/accountingservices.php
Scraping https://www.lfmgcpas.com/lfmgservices.php
Scraping https://www.lfmgcpas.com/traditional.php
Scraping https://www.lfmgcpas.com/nontraditional.php
Scraping https://www.lfmgcpas.com/newsletter.php
Scraping https://www.lfmgcpas.com/archive.php
Scraping https://www.lfmgcpas.com/dailynews.php
Scraping https://www.lfmgcpas.com/life-events.php
Scraping https://www.lfmgcpas.com/business-strategies.php
Scraping https://www.lfmgcpas.com/taxstrategies-businessowners.php
Scraping



Scraping https://www.lfmgcpas.com/about.php
Scraping https://www.lfmgcpas.com/margaret.php
Scraping https://www.lfmgcpas.com/cynthia.php
Scraping https://www.lfmgcpas.com/Sherri.php
Scraping https://www.lfmgcpas.com/personalfinplan.php
Scraping https://www.lfmgcpas.com/estateplan.php
Scraping https://www.lfmgcpas.com/eldercare.php
Scraping https://www.lfmgcpas.com/smallbiz.php
Scraping https://www.lfmgcpas.com/qbmain.php
Scraping https://www.lfmgcpas.com/whyquickbooks.php
Scraping https://www.lfmgcpas.com/quickbookssetup.php
Scraping https://www.lfmgcpas.com/qbtraining.php
Scraping https://www.lfmgcpas.com/quickanswers.php
Scraping https://www.lfmgcpas.com/quicktuneup.php
Scraping https://www.lfmgcpas.com/quickbookstips.php
Scraping https://www.lfmgcpas.com/buyquickbooks.php
Scraping https://www.lfmgcpas.com/payrollservice.php
Scraping https://www.lfmgcpas.com/cfoservices.php
Scraping https://www.lfmgcpas.com/auditing.php
Scraping https://www.lfmgcpas.com/cashmanagement.php
Scraping ht