# Automated CAPTCHA-Based Web Scraper

This notebook demonstrates a fully automated web scraping pipeline that:

* Reads student credentials (Exam Registration Number and Password) from an Excel file.
* Launches a Chrome browser using Selenium and optionally applies proxy authentication.
* Navigates to a login-protected website that uses CAPTCHA.
* Captures and solves the CAPTCHA using OCR (`pytesseract`) with multiple image preprocessing techniques.
* Logs into the website with student credentials and the solved CAPTCHA.
* Extracts result data such as Candidate Name, Rank, and Score.
* Stores all scraped data into a structured CSV file (`results.csv`).
* Marks failed attempts (due to CAPTCHA errors or incorrect credentials) clearly in the output by writing `Failed` as the candidate name and `N/A` for the rank and score.

## Features:

* Uses `webdriver-manager` for automatic ChromeDriver installation (no manual setup needed).
* Supports optional proxy authentication via a dynamically created Chrome extension.
* CAPTCHA solving using enhanced image processing and Tesseract OCR.
* Input: Excel file (`Data.xlsx`) containing student login information in two columns, `Exam Registration Number` and `Password`.
* Output: CSV file with extracted results.
* Includes retry logic for CAPTCHA failures and handles login errors gracefully.

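> Note: All website-specific XPaths and the login URL are placeholders and must be updated to match the actual target site. The Tesseract executable path (`pytesseract.pytesseract.tesseract_cmd`) must also be set to your local Tesseract installation.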


In [None]:
import io
import json
import time
import zipfile

import pandas as pd
import pytesseract
from PIL import Image, ImageFilter, ImageOps, ImageEnhance
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ===================== CONFIGURATION =====================
# Path to the installed Tesseract executable; adjust for your installation.
pytesseract.pytesseract.tesseract_cmd = (
    r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

# Proxy toggle: when True, the browser setup builds and loads a proxy-auth
# extension from `proxy_details`; when False, those values are not used.
use_proxy = False

# Placeholder proxy endpoint and credentials -- replace all four with real values.
proxy_details = {
    "host": "123.123.123.123",
    "port": "8080",
    "user": "proxy_user",
    "pass": "proxy_password",
}

# ===================== PROXY HANDLING =====================
def create_proxy_extension(proxy_host, proxy_port, proxy_user, proxy_pass):
    """Build a Chrome extension archive that configures an authenticated HTTP proxy.

    The archive contains a manifest and a background script. The script sets
    a fixed HTTP proxy for the browser and answers proxy authentication
    challenges with the supplied credentials.

    Args:
        proxy_host: Proxy host name or IP address.
        proxy_port: Proxy port, as a string or an integer.
        proxy_user: Username returned when the proxy asks for authentication.
        proxy_pass: Password returned when the proxy asks for authentication.

    Returns:
        str: Relative path of the zip archive ('proxy_auth_plugin.zip'),
        written to the current working directory. Any existing file with
        that name is overwritten.

    Note:
        The credentials are stored in plain text inside the archive.
    """
    # NOTE(review): this is a Manifest V2 extension; recent Chrome releases
    # have been phasing out MV2 -- confirm it still loads on the target Chrome.
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        }
    }
    """

    # json.dumps produces a quoted, fully escaped string literal that is also
    # valid JavaScript, so values containing quotes or backslashes can no
    # longer break (or inject code into) the generated background script.
    host_js = json.dumps(str(proxy_host))
    port_js = json.dumps(str(proxy_port))
    user_js = json.dumps(str(proxy_user))
    pass_js = json.dumps(str(proxy_pass))

    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "http",
                host: {host_js},
                port: parseInt({port_js}, 10)
            }},
            bypassList: ["localhost"]
        }}
    }};

    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

    chrome.webRequest.onAuthRequired.addListener(
        function(details) {{
            return {{
                authCredentials: {{
                    username: {user_js},
                    password: {pass_js}
                }}
            }};
        }},
        {{urls: ["<all_urls>"]}},
        ["blocking"]
    );
    """

    plugin_file = 'proxy_auth_plugin.zip'
    with zipfile.ZipFile(plugin_file, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_file

# ===================== CAPTCHA SOLVER =====================
def solve_captcha(captcha_element):
    """Read the text of a CAPTCHA image element with Tesseract OCR.

    The element's PNG screenshot is converted to grayscale, inverted,
    contrast-boosted (factor 2.0), thresholded at 140 into a 1-bit image,
    median-filtered and enlarged to twice its size before OCR runs with
    page segmentation mode 8.

    Args:
        captcha_element: Object exposing `screenshot_as_png` (PNG bytes);
            in this file it is the CAPTCHA image element found by Selenium.

    Returns:
        str: The recognised text with every non-alphanumeric character
        removed. May be empty if nothing was recognised.
    """
    png_bytes = captcha_element.screenshot_as_png
    grayscale = Image.open(io.BytesIO(png_bytes)).convert("L")
    boosted = ImageEnhance.Contrast(ImageOps.invert(grayscale)).enhance(2.0)

    def to_black_or_white(level):
        # Hard threshold: anything darker than 140 becomes black, the rest white.
        return 0 if level < 140 else 255

    binary = boosted.point(to_black_or_white, '1')
    denoised = binary.filter(ImageFilter.MedianFilter())
    doubled_size = (denoised.width * 2, denoised.height * 2)
    enlarged = denoised.resize(doubled_size)

    raw_text = pytesseract.image_to_string(enlarged, config='--psm 8').strip()
    return ''.join(ch for ch in raw_text if ch.isalnum())

# ===================== BROWSER SETUP =====================
# Chrome launch options shared by the whole run.
options = Options()
# options.add_argument("--headless")  # Optional: Uncomment to run without UI

# The proxy-auth extension is only generated and attached when the proxy is enabled.
if use_proxy:
    proxy_fields = ("host", "port", "user", "pass")
    proxy_plugin = create_proxy_extension(*(proxy_details[field] for field in proxy_fields))
    options.add_extension(proxy_plugin)

# webdriver-manager supplies the ChromeDriver path handed to the Service.
driver_path = ChromeDriverManager().install()
service = Service(driver_path)
browser = webdriver.Chrome(options=options, service=service)

# ===================== LOGIN + SCRAPE FUNCTION =====================
def login_and_fetch_results(student, max_attempts=10):
    """Log in as one student and scrape their result, retrying on CAPTCHA failures.

    Uses the module-level `browser`. Each attempt reloads the login page,
    fills in the credentials, solves the CAPTCHA via `solve_captcha` and
    submits the form. An attempt is repeated when the OCR output is not
    exactly 6 characters long or when the page shows an "Invalid CAPTCHA"
    message.

    Args:
        student: Mapping-like row (e.g. a pandas Series) providing the keys
            'Exam Registration Number' and 'Password'.
        max_attempts: Upper bound on login attempts for this student, so a
            CAPTCHA that OCR can never read does not loop forever.

    Returns:
        dict | None: A dict with the keys 'Exam Registration Number',
        'Candidate Name', 'Rank' and 'Score' on success. None when the
        registration number is not 12 characters long, when all attempts
        are used up, or when any other error occurs (the error is printed).
    """
    reg_no = student['Exam Registration Number']
    # Values read from Excel may be numeric; work with the text form so the
    # length check and send_keys both see the same string.
    reg_no_text = str(reg_no)
    if len(reg_no_text) != 12:
        print(f"Skipping Application No: {reg_no} (not 12 digits)")
        return None

    try:
        for _attempt in range(max_attempts):
            # Replace with actual website URL
            browser.get("https://example.com/login")

            # ================= Replace the following XPaths with actual ones =================
            reg_no_input = WebDriverWait(browser, 10).until(
                EC.visibility_of_element_located((By.XPATH, '//*[@id="reg_no"]'))  # XPath for Reg No
            )
            reg_no_input.send_keys(reg_no_text)

            password_input = browser.find_element(By.XPATH, '//*[@id="password"]')  # XPath for password
            password_input.send_keys(str(student['Password']))

            captcha_element = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="captcha_img"]'))  # XPath for captcha image
            )
            captcha_text = solve_captcha(captcha_element)
            print(f"CAPTCHA solved: {captcha_text}")

            if len(captcha_text) != 6:
                # No explicit refresh needed: the next attempt reloads the
                # login page, which also fetches a new CAPTCHA.
                print(f"Bad CAPTCHA ({captcha_text}). Refreshing page...")
                continue

            captcha_input = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="Captcha1"]'))  # XPath for captcha input
            )
            captcha_input.clear()
            captcha_input.send_keys(captcha_text)
            browser.find_element(By.XPATH, '//*[@id="login_button"]').click()  # XPath for login button

            # If the site reports a wrong CAPTCHA within 2 seconds, retry.
            try:
                WebDriverWait(browser, 2).until(
                    EC.presence_of_element_located((By.XPATH, '//span[contains(text(), "Invalid CAPTCHA")]'))
                )
            except TimeoutException:
                # No error message appeared, so the CAPTCHA was accepted.
                pass
            else:
                print("Invalid CAPTCHA detected. Retrying...")
                continue

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="result_table"]'))  # XPath for result table
            )

            return {
                "Exam Registration Number": reg_no,
                "Candidate Name": browser.find_element(By.XPATH, '//*[@id="candidate_name"]').text.strip(),  # XPath for name
                "Rank": browser.find_element(By.XPATH, '//*[@id="rank"]').text.strip(),                      # XPath for rank
                "Score": browser.find_element(By.XPATH, '//*[@id="score"]').text.strip()                     # XPath for score
            }

        print(f"Giving up on {reg_no} after {max_attempts} CAPTCHA attempts.")
        return None

    except Exception as e:
        # Per-student boundary: report the failure and let the caller carry
        # on with the next student.
        print(f"Error for {reg_no}: {e}")
        return None
# ===================== MAIN EXECUTION =====================
# Column order of results.csv; also guarantees a header row when no student
# was processed.
result_columns = ["Exam Registration Number", "Candidate Name", "Rank", "Score"]

try:
    # Input Excel file with 'Exam Registration Number' and 'Password'.
    # dtype=str keeps identifiers and passwords as text instead of letting
    # pandas turn them into numbers.
    students_df = pd.read_excel("Data.xlsx", dtype=str)
except FileNotFoundError:
    print("Error: Data.xlsx not found. Please place the file in the same directory.")
    browser.quit()
    raise SystemExit(1)

results = []
total_students = len(students_df)

try:
    # enumerate gives a correct 1-based counter even if the DataFrame index
    # is not the default 0..n-1 range.
    for position, (_, student) in enumerate(students_df.iterrows(), start=1):
        print(f"Processing student {position}/{total_students}...")
        result = login_and_fetch_results(student)
        if result:
            results.append(result)
        else:
            # Placeholder row so failed students still appear in the output.
            results.append({
                "Exam Registration Number": student["Exam Registration Number"],
                "Candidate Name": "Failed",
                "Rank": "N/A",
                "Score": "N/A"
            })
finally:
    # Runs on an interruption or unexpected error too, so whatever was
    # collected is saved and the browser process is always released.
    try:
        results_df = pd.DataFrame(results, columns=result_columns)
        results_df.to_csv("results.csv", index=False)
    finally:
        browser.quit()

print("Scraping completed. Results saved to results.csv.")