In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, ElementClickInterceptedException
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Optional dependency: selenium-stealth (install with `pip install selenium-stealth`).
# `stealth` stays None when the package is missing; setup_driver() checks it
# before applying any stealth settings.
stealth = None
try:
    from selenium_stealth import stealth
except ImportError:
    print("Warning: selenium-stealth not installed. Some anti-bot measures may still block the script.")


In [3]:
# Load the spreadsheet of pages to crawl. process_row() reads the
# "Category" and "Resume_link" columns from each row of this frame.
df = pd.read_excel(
    "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/resumebuilderITlink.xlsx"
)

In [4]:
# Accumulator for the crawled records; the crawling cell extends it with
# one {"Category": ..., "Resume": ...} dict per resume tab.
all_results = list()

In [5]:
# Function to clean text: collapse every run of whitespace into a single space
def clean_text(text):
    """Normalize whitespace in a piece of scraped text.

    Newlines, carriage returns, tabs, non-breaking spaces and any other
    whitespace are collapsed into single ASCII spaces, and leading/trailing
    whitespace is removed.

    Args:
        text: The string to clean. ``None`` is accepted and treated as empty.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or only whitespace.
    """
    if text is None:
        return ""
    # For str patterns `\s` already matches \n, \r, \t and \xa0, so a single
    # pass does the work of separate newline / nbsp / multi-space substitutions.
    return re.sub(r'\s+', ' ', text).strip()

In [6]:
# Function to set up WebDriver with anti-bot measures
def setup_driver():
    """Create a Chrome WebDriver configured to look like a regular browser.

    No headless flag is added, so the browser window is visible. The
    chromedriver path is obtained from ``ChromeDriverManager().install()``.
    When the optional ``stealth`` helper is available (not ``None``) it is
    applied to the new driver before returning.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()

    # Command-line switches, applied in this order.
    launch_flags = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "--disable-blink-features=AutomationControlled",
        "--window-size=1920,1080",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-extensions",
        "--disable-infobars",
    )
    for flag in launch_flags:
        options.add_argument(flag)

    # Drop the "enable-automation" switch Chrome would otherwise be started with.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])

    chromedriver_path = ChromeDriverManager().install()
    browser = webdriver.Chrome(service=Service(chromedriver_path), options=options)

    # Stealth is optional; without it the plain driver is returned as-is.
    if not stealth:
        return browser

    stealth(
        browser,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    return browser

# Function to handle pop-ups (e.g., cookie banners)
def handle_popups(driver, url):
    """Dismiss a cookie/consent banner if one shows up within 5 seconds.

    Looks for a clickable ``<button>`` whose text contains 'Accept', 'Agree'
    or 'OK' and clicks it. This is best-effort: every failure is reported
    with ``print`` and nothing is raised to the caller.

    Args:
        driver: The WebDriver showing the page.
        url: The page URL; used only in the log messages.
    """
    consent_xpath = "//button[contains(text(), 'Accept') or contains(text(), 'Agree') or contains(text(), 'OK')]"
    try:
        wait = WebDriverWait(driver, 5)
        consent_button = wait.until(EC.element_to_be_clickable((By.XPATH, consent_xpath)))
        consent_button.click()
        print(f"Info: Closed cookie banner for {url}")
        # Give the banner a moment to disappear before the caller continues.
        time.sleep(1)
    except (NoSuchElementException, TimeoutException):
        print(f"Info: No cookie banner found for {url}")
    except Exception as e:
        print(f"Warning: Error closing pop-up for {url}: {str(e)}")

# Function to crawl resume content from a single tab
def crawl_resume_tab(tab, driver):
    """Extract the text of one resume tab as a single space-joined string.

    Only direct children of ``tab`` that are ``<p>``,
    ``<span class="tab-bold">`` or ``<ul>`` are read. For ``<ul>`` each
    ``<li>`` contributes its own text. Every fragment goes through
    ``clean_text`` and empty fragments are dropped.

    Args:
        tab: The WebElement of the tab panel.
        driver: Accepted for signature compatibility; not used here.

    Returns:
        The collected fragments joined with single spaces ("" if none).
    """
    pieces = []

    def keep(raw_text):
        # Store the cleaned fragment unless it is empty.
        cleaned = clean_text(raw_text)
        if cleaned:
            pieces.append(cleaned)

    children = tab.find_elements(By.XPATH, "./*[self::p or self::span[@class='tab-bold'] or self::ul]")

    for node in children:
        if node.tag_name == "span" and "tab-bold" in node.get_attribute("class"):
            # Bold heading span; .text includes text of nested tags.
            keep(node.text)
        elif node.tag_name == "p":
            # Paragraph; .text includes text of nested tags.
            keep(node.text)
        elif node.tag_name == "ul":
            # Bullet list: one fragment per <li>.
            try:
                for item in node.find_elements(By.TAG_NAME, "li"):
                    keep(item.text)
            except NoSuchElementException:
                pass

    return " ".join(pieces)

# Function to crawl all resume tabs from a single URL
def crawl_resume_content(url, category, max_retries=1):
    """Open ``url`` in a fresh browser and collect the text of every resume tab.

    Each attempt starts its own WebDriver via ``setup_driver()`` and always
    closes it exactly once (in ``finally``), whatever happens. An attempt is
    retried when the main wrapper never appears or when an exception escapes;
    a page whose expected layout is missing is given up on immediately.

    Args:
        url: Page to crawl.
        category: Label stored in the "Category" field of every record.
        max_retries: Number of extra attempts after the first one.

    Returns:
        A list of ``{"Category": category, "Resume": text}`` dicts, one per
        tab that produced non-empty text. Always a list: ``[]`` on failure.
    """
    for attempt in range(max_retries + 1):
        is_last_attempt = attempt == max_retries
        driver = None
        try:
            driver = setup_driver()
            driver.get(url)
            print(f"Info: Successfully loaded {url} on attempt {attempt + 1}")

            resumes_tabbing_wrap = _wait_for_tabbing_wrap(driver, url, attempt)
            if resumes_tabbing_wrap is None:
                if is_last_attempt:
                    return []
                continue

            # Handle pop-ups (e.g., cookie banners)
            handle_popups(driver, url)

            tab_buttons = _find_tab_buttons(resumes_tabbing_wrap, url)
            if tab_buttons is None:
                # Layout not recognised; retrying would not help.
                return []

            # Results are built per attempt, so a retry can never see (or
            # duplicate) records from an earlier, failed attempt.
            return _crawl_tabs(driver, tab_buttons, url, category)

        except Exception as e:
            label = "Error" if isinstance(e, WebDriverException) else "Unexpected error"
            print(f"{label} on attempt {attempt + 1} for {url}: {str(e)}")
            if is_last_attempt:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                return []
        finally:
            # Single shutdown point for every exit path above.
            _quit_driver(driver, url)

    # Reached only when the loop body never ran (max_retries < 0).
    return []


def _quit_driver(driver, url):
    """Close the browser if one was started; report, never raise, on failure."""
    if driver is None:
        return
    try:
        driver.quit()
    except Exception as e:
        print(f"Warning: Could not close browser cleanly for {url}: {str(e)}")


def _wait_for_tabbing_wrap(driver, url, attempt):
    """Return the main 'resumes-tabbing-wrap tabbed' element, or None if it never appears."""
    # Two classes on one element: expressed as a CSS selector rather than a
    # compound value handed to By.CLASS_NAME.
    wrap_locator = (By.CSS_SELECTOR, ".resumes-tabbing-wrap.tabbed")

    # Wait for the main wrapper to load (timeout set to 20 seconds)
    try:
        wrap = WebDriverWait(driver, 20).until(EC.presence_of_element_located(wrap_locator))
        print(f"Info: Resumes tabbing wrap detected for {url}")
        return wrap
    except TimeoutException:
        pass

    # Check for CAPTCHA or anti-bot page
    try:
        driver.find_element(By.XPATH, "//*[contains(text(), 'CAPTCHA') or contains(text(), 'verify') or contains(text(), 'robot')]")
        print(f"Warning: CAPTCHA detected for {url}. Please solve the CAPTCHA manually in the browser window.")
        time.sleep(30)  # Pause to allow manual CAPTCHA solving
        # Re-check for content after CAPTCHA solving
        wrap = WebDriverWait(driver, 10).until(EC.presence_of_element_located(wrap_locator))
        print(f"Info: Resumes tabbing wrap detected after CAPTCHA solving for {url}")
        return wrap
    except (NoSuchElementException, TimeoutException):
        print(f"Error: Timeout waiting for resumes tabbing wrap for {url} after {attempt + 1} attempts.")
        return None


def _find_tab_buttons(resumes_tabbing_wrap, url):
    """Walk the expected page layout and return the tab <li> buttons, or None if a container is missing."""
    # Try the slider variant of the content block first, then the plain one.
    try:
        resumes_content_tabs = resumes_tabbing_wrap.find_element(
            By.CSS_SELECTOR, ".resumes-content-tabs.tabbing-block.tabbing-slider"
        )
        print(f"Info: Found 'resumes-content-tabs tabbing-block tabbing-slider' for {url}")
    except NoSuchElementException:
        try:
            resumes_content_tabs = resumes_tabbing_wrap.find_element(
                By.CSS_SELECTOR, ".resumes-content-tabs.tabbing-block"
            )
            print(f"Info: Found 'resumes-content-tabs tabbing-block' for {url}")
        except NoSuchElementException:
            print(f"Error: Could not find 'resumes-content-tabs tabbing-block tabbing-slider' or 'resumes-content-tabs tabbing-block' for {url}.")
            return None

    # <div class="resume-tabs-wrap"> must exist; the element itself is not needed.
    try:
        resumes_content_tabs.find_element(By.CLASS_NAME, "resume-tabs-wrap")
    except NoSuchElementException:
        print(f"Error: Could not find 'resume-tabs-wrap' for {url}.")
        return None

    # <div class="slider-wrapper"> holds the tab buttons.
    try:
        slider_wrapper = resumes_content_tabs.find_element(By.CLASS_NAME, "slider-wrapper")
    except NoSuchElementException:
        print(f"Error: Could not find 'slider-wrapper' for {url}.")
        return None

    # find_elements returns an empty list when nothing matches, so no
    # NoSuchElementException handling is needed here.
    tab_buttons = slider_wrapper.find_elements(By.XPATH, ".//ul[contains(@class, 'tabs swiper-wrapper')]/li")
    print(f"Info: Found {len(tab_buttons)} tab buttons for {url}")
    return tab_buttons


def _crawl_tabs(driver, tab_buttons, url, category):
    """Click each tab button in turn and return one record per tab that yields text."""
    resume_contents = []
    # IDs of tabs already stored, to avoid duplicates
    crawled_tabs = set()
    max_click_attempts = 2

    for idx, tab_button in enumerate(tab_buttons):
        for click_attempt in range(max_click_attempts):
            is_last_click = click_attempt == max_click_attempts - 1
            try:
                # The button's data-tab attribute is the id of the panel it opens.
                tab_id = tab_button.get_attribute("data-tab")
                if not tab_id:
                    print(f"Warning: Tab button {idx + 1} has no data-tab attribute for {url}, skipping.")
                    break

                # Skip if this tab has already been crawled
                if tab_id in crawled_tabs:
                    break

                # Scroll to the tab button and click
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab_button)
                time.sleep(0.5)  # Small delay to ensure the element is in view
                try:
                    tab_button.click()
                    print(f"Info: Clicked tab {idx + 1} with ID {tab_id} for {url}")
                except ElementClickInterceptedException:
                    print(f"Warning: Could not click tab {idx + 1} with ID {tab_id} for {url}, trying JavaScript click.")
                    driver.execute_script("arguments[0].click();", tab_button)

                # Wait for the panel to exist and to be displayed as a block.
                try:
                    tab = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, tab_id))
                    )
                    WebDriverWait(driver, 10).until(
                        lambda d: tab.value_of_css_property("display") == "block"
                    )
                    print(f"Info: Tab {tab_id} is now visible for {url}")
                except TimeoutException:
                    print(f"Error: Timeout waiting for tab {tab_id} to become visible for {url} on click attempt {click_attempt + 1}.")
                    if is_last_click:
                        print(f"Error: Skipping tab {tab_id} after {max_click_attempts} click attempts for {url}.")
                        break
                    continue

                # Crawl the content of the tab
                content = crawl_resume_tab(tab, driver)
                if content:
                    resume_contents.append({"Category": category, "Resume": content})
                    crawled_tabs.add(tab_id)
                    print(f"Info: Successfully crawled tab {tab_id} for {url}")

                # Small delay to allow the page to stabilize
                time.sleep(1)
                break  # Exit click retry loop on success

            except Exception as e:
                # Any failure on this tab is logged and the click is retried;
                # after the last attempt the loop simply ends for this tab.
                print(f"Error processing tab {idx + 1} for {url} on click attempt {click_attempt + 1}: {str(e)}")
                if is_last_click:
                    print(f"Error: Skipping tab {idx + 1} after {max_click_attempts} click attempts for {url}.")
                continue

    return resume_contents

# Function to process a single row (for parallel execution)
def process_row(row):
    """Crawl the resume page referenced by one input row.

    Args:
        row: A mapping-like row providing "Category" and "Resume_link".

    Returns:
        Whatever ``crawl_resume_content`` returns for that link, called
        with ``max_retries=1``.
    """
    category, url = row["Category"], row["Resume_link"]
    print(f"Crawling resume for {category}...")
    return crawl_resume_content(url, category, max_retries=1)

In [7]:
# Use ThreadPoolExecutor for parallel crawling (2 workers for stability)
# Start from an empty list so re-running this cell does not duplicate rows.
all_results.clear()

with ThreadPoolExecutor(max_workers=2) as executor:
    # Submit all tasks, remembering which input row each future belongs to
    future_to_row = {executor.submit(process_row, row): row for _, row in df.iterrows()}

    # Collect results as they complete. No sleep here: every task is already
    # queued, so pausing this loop would not slow the workers' requests.
    for future in as_completed(future_to_row):
        row = future_to_row[future]
        try:
            results = future.result()
        except Exception as e:
            # One failing row must not discard the results of all the others.
            print(f"Error: Crawling failed for {row.get('Resume_link')}: {str(e)}")
            continue
        # Tolerate a worker that returned nothing.
        all_results.extend(results or [])

Crawling resume for Amazon Web Services Engineer...
Crawling resume for Android Developer...
Info: Successfully loaded https://www.resumebuilder.com/resume-examples/amazon-web-services/ on attempt 1
Info: Resumes tabbing wrap detected for https://www.resumebuilder.com/resume-examples/amazon-web-services/
Info: Successfully loaded https://www.resumebuilder.com/resume-examples/android-developer/ on attempt 1
Info: Resumes tabbing wrap detected for https://www.resumebuilder.com/resume-examples/android-developer/
Info: No cookie banner found for https://www.resumebuilder.com/resume-examples/amazon-web-services/
Info: Found 'resumes-content-tabs tabbing-block tabbing-slider' for https://www.resumebuilder.com/resume-examples/amazon-web-services/
Info: Found 7 tab buttons for https://www.resumebuilder.com/resume-examples/amazon-web-services/
Info: No cookie banner found for https://www.resumebuilder.com/resume-examples/android-developer/
Info: Found 'resumes-content-tabs tabbing-block' for ht

In [8]:
# Create a DataFrame with the results.
# Passing `columns` fixes the column order and also keeps both columns when
# nothing was crawled (an empty list would otherwise give a frame with no
# columns, and selecting ["Category", "Resume"] would raise KeyError).
result_df = pd.DataFrame(all_results, columns=["Category", "Resume"])

In [9]:
# Preview up to the first 50 crawled records
result_df.head(n=50)

Unnamed: 0,Category,Resume
0,Android Developer,"Skyler Thompson 123 W 15th Street, Minneapolis..."
1,Android Developer,"Mina Sayed 123 Bedford Avenue, New York, NY 12..."
2,Android Developer,"Sarah Johnson 123 Carpenter Street, Philadelph..."
3,Amazon Web Services Engineer,James Mitchell (123) 456-7890 | jamesmitchell@...
4,Amazon Web Services Engineer,Aliya Jackson (123) 456-7890 | aliyajackson@ex...
5,Amazon Web Services Engineer,Michael Johnson (123) 456-7890 | michaeljohnso...
6,Amazon Web Services Engineer,Emily Roberts (123) 456-7890 | emilyroberts@ex...
7,Amazon Web Services Engineer,Cameron Malfara (123) 456-7890 cameronmalfara@...
8,Amazon Web Services Engineer,Raheem Richardson (123) 456-7890 raheemrichard...
9,Amazon Web Services Engineer,Sarah Williams (123) 456-7890 sarahwilliams@ex...


In [10]:
# Save to Excel
output_path = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/NewITData/finalresumebuilderITlink.xlsx"
result_df.to_excel(output_path, index=False)
# Report the path that was actually written, not a hard-coded file name.
print(f"Crawling complete. Results saved to {output_path}")

Crawling complete. Results saved to Crawled_Resumes.xlsx
