<a href="https://colab.research.google.com/github/vivekab5/Email-Scraper-Pro/blob/master/Email_Extractor_Vivek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import csv
from html import unescape
from urllib.parse import urljoin, urlparse
import time
import random
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.colab import files
import io

# HTTP headers sent with every page request made by scrape_emails_from_url.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
# Pattern for plain-text e-mail addresses. Only addresses whose final label is
# one of the listed TLDs are matched; the match is case-sensitive.
# NOTE(review): common TLDs such as org/edu/io are not in the list — confirm
# whether that omission is intentional.
EMAIL_REGEX = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.(?:com|net|at|us|co|info|biz|me|tech|in|uk|ca|de|fr|au|nl|jp|ru|cn|br|es|it|ch|se|no|mx|studio)\b"
# Site-relative paths tried, in order, when the originally requested page
# yields no addresses.
FALLBACK_PATHS = [
    "/contact", "/privacy", "/privacy-policy", "/contact-us",
    "/pages/contact", "/pages/contact-us", "/pages/privacy-policy"
]
# Proxy URLs that get_random_proxy picks from; while this list is empty,
# get_random_proxy returns None.
PROXIES = []

def get_random_proxy():
    """Pick one entry of ``PROXIES`` at random.

    Returns:
        A dict carrying the chosen proxy under both the ``"http"`` and
        ``"https"`` keys, or ``None`` when ``PROXIES`` is empty.
    """
    if not PROXIES:
        return None
    chosen = random.choice(PROXIES)
    return dict.fromkeys(("http", "https"), chosen)

def normalize_url(url):
    """Return *url* with an explicit scheme, defaulting to ``http://``.

    A URL is considered to already carry a scheme only when it starts with
    ``<scheme>://``. Relying on ``urlparse(url).scheme`` is not enough here:
    for a bare ``host:port`` such as ``example.com:8080`` the parser can
    report the host name as the scheme, which would leave the entry without
    ``http://``.

    Args:
        url: A website address, with or without a scheme.

    Returns:
        ``url`` unchanged when it starts with ``<scheme>://``; otherwise the
        same address prefixed so that it begins with ``http://``.
    """
    if re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url):
        return url
    # Protocol-relative form ("//host/path"): only the scheme name is missing.
    if url.startswith("//"):
        return "http:" + url
    return "http://" + url

def filter_valid_emails(emails):
    """Drop candidates that are image file names or belong to ignored domains.

    Args:
        emails: Iterable of candidate address strings.

    Returns:
        A new list, in the input order, of the candidates that do not end in
        an image file extension and whose part after the last ``@`` is not an
        excluded domain. Both checks ignore letter case.
    """
    blocked_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg")
    blocked_domains = {"sentry.wixpress.com"}

    def _is_acceptable(address):
        lowered = address.lower()
        # Asset names such as "logo@2x.png" look like addresses to the regex.
        if lowered.endswith(blocked_suffixes):
            return False
        return lowered.split("@")[-1] not in blocked_domains

    return [address for address in emails if _is_acceptable(address)]

def extract_emails_from_soup(soup):
    """Collect e-mail addresses found in a parsed HTML document.

    Both the visible text and the pretty-printed markup are searched, so
    addresses that appear only in attributes (for example ``mailto:`` links)
    are found as well. Two forms are recognised: plain addresses matching
    ``EMAIL_REGEX`` and ``name [at] domain [dot] tld`` obfuscations, which are
    rewritten to the plain form.

    Args:
        soup: A ``BeautifulSoup`` document.

    Returns:
        A sorted list of unique addresses that pass ``filter_valid_emails``.
    """
    # A separator keeps text from adjacent tags apart; without it
    # "<p>Smith</p><p>john@x.com</p>" reads as "Smithjohn@x.com".
    combined_text = soup.get_text(separator=" ") + "\n" + soup.prettify()
    decoded_text = unescape(combined_text)

    normal_emails = re.findall(EMAIL_REGEX, decoded_text)
    obfuscated = re.findall(r"[a-zA-Z0-9_.+-]+\s?\[at\]\s?[a-zA-Z0-9-]+\s?\[dot\]\s?[a-zA-Z0-9-.]+", decoded_text)

    obfuscated_fixed = []
    for raw in obfuscated:
        address = raw.replace("[at]", "@").replace("[dot]", ".")
        # The pattern's \s? also matches tabs and newlines, so remove every
        # kind of whitespace, then trim a trailing sentence period or hyphen
        # that the greedy final character class picked up.
        address = "".join(address.split()).rstrip(".-")
        obfuscated_fixed.append(address)

    # Sorting makes the output order reproducible between runs.
    all_emails = sorted(set(normal_emails + obfuscated_fixed))
    return filter_valid_emails(all_emails)

def scrape_emails_from_url(url):
    """Fetch *url* and, if needed, its fallback pages, looking for e-mails.

    The given page is requested first. If it yields no addresses, each path
    in ``FALLBACK_PATHS`` is resolved against *url* and tried in order until
    one yields addresses. A random pause precedes every request.

    Args:
        url: Absolute URL of the site to scrape.

    Returns:
        A ``(url, emails)`` tuple, where ``emails`` is the list from the first
        page that produced any addresses, or an empty list when none did or
        when a request or parse step raised. Errors are printed, not raised.
    """
    proxy = get_random_proxy()

    # The context manager closes the session's pooled connections on every
    # exit path, so worker threads do not leak sockets across many URLs.
    with requests.Session() as session:

        def _emails_at(page_url, min_delay, max_delay):
            # Pause, download one page and return the addresses found in it.
            time.sleep(random.uniform(min_delay, max_delay))
            res = session.get(page_url, headers=HEADERS, proxies=proxy, timeout=10)
            soup = BeautifulSoup(res.text, "html.parser")
            return extract_emails_from_soup(soup)

        try:
            emails = _emails_at(url, 1, 3)
            if emails:
                return (url, emails)

            for path in FALLBACK_PATHS:
                emails = _emails_at(urljoin(url, path), 1, 2)
                if emails:
                    return (url, emails)

        except Exception as e:
            # Best-effort scraping: report the failure and treat the site as
            # having no addresses rather than aborting the whole run.
            print(f"Error with {url}: {e}")
    return (url, [])


# --- Driver: upload the URL list, scrape concurrently, download the CSV ---

print("Upload a  file (websites.csv) containing website URLs (one per line):")
uploaded = files.upload()
if not uploaded:
    # A cancelled upload would otherwise surface as a bare StopIteration.
    raise SystemExit("No file was uploaded.")
input_file = next(iter(uploaded))

# "utf-8-sig" drops a leading byte-order mark (written by Excel/Notepad) that
# plain "utf-8" would leave glued to the first URL.
websites = [
    normalize_url(line.strip())
    for line in uploaded[input_file].decode("utf-8-sig").splitlines()
    if line.strip()
]


output_file = "emails_found.csv"
max_threads = 10


print(f"\nScraping {len(websites)} websites with {max_threads} threads...\n")

# Keep one handle open for the whole run instead of reopening the file per
# result; flushing after each row still persists progress incrementally.
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Website", "Emails Found"])
    csvfile.flush()

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        future_to_url = {executor.submit(scrape_emails_from_url, url): url for url in websites}

        # Rows are written in completion order, not input order.
        for idx, future in enumerate(as_completed(future_to_url), 1):
            url, emails = future.result()
            print(f"[{idx}/{len(websites)}] {url} — {'Found' if emails else 'No emails'}")

            writer.writerow([url, ", ".join(emails) if emails else "No emails found"])
            csvfile.flush()


print("\nScraping complete!")

files.download(output_file)




Upload a  file (websites.csv) containing website URLs (one per line):


KeyboardInterrupt: 