In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Optional: Install selenium-stealth if needed (uncomment to use)
# pip install selenium-stealth
try:
    from selenium_stealth import stealth
except ImportError:
    stealth = None
    print("Warning: selenium-stealth not installed. Some anti-bot measures may still block the script.")

In [3]:
# Read the input Excel file
df = pd.read_excel("D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/ResumelabITlink.xlsx")

In [4]:
# Initialize lists to store results
results = []

In [5]:
# Function to clean text: remove newlines, extra spaces, and special characters
def clean_text(text):
    # Remove newlines, tabs, and special characters
    text = re.sub(r'[\n\r\t]+', ' ', text)
    # Remove non-breaking spaces and other special characters
    text = re.sub(r'\xa0', ' ', text)
    # Collapse multiple spaces into a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading/trailing spaces
    return text.strip()

In [6]:
# Function to set up WebDriver with anti-bot measures
def setup_driver():
    chrome_options = Options()
    # Use real browser (non-headless mode) with visibility
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    
    # Set up the driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    # Apply stealth settings if available
    if stealth:
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
        )
    
    return driver

# Function to crawl resume content from the specified structure
def crawl_resume_content(url, category, max_retries=1):
    for attempt in range(max_retries + 1):
        driver = None
        try:
            # Set up WebDriver
            driver = setup_driver()
            
            # Navigate to the URL
            driver.get(url)
            print(f"Info: Successfully loaded {url} on attempt {attempt + 1}")
            
            # Wait for the main wrapper to load (timeout set to 20 seconds)
            try:
                blog_main_wrapper = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "blogMainWrapper"))
                )
                print(f"Info: Blog main wrapper detected for {url}")
            except TimeoutException:
                # Check for CAPTCHA or anti-bot page
                try:
                    captcha = driver.find_element(By.XPATH, "//*[contains(text(), 'CAPTCHA') or contains(text(), 'verify') or contains(text(), 'robot')]")
                    print(f"Warning: CAPTCHA detected for {url}. Please solve the CAPTCHA manually in the browser window.")
                    time.sleep(30)  # Pause to allow manual CAPTCHA solving
                    # Re-check for content after CAPTCHA solving
                    blog_main_wrapper = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "blogMainWrapper"))
                    )
                    print(f"Info: Blog main wrapper detected after CAPTCHA solving for {url}")
                except (NoSuchElementException, TimeoutException):
                    print(f"Error: Timeout waiting for blog main wrapper for {url} after {attempt + 1} attempts.")
                    if attempt == max_retries:
                        driver.quit()
                        return ""
                    continue
            
            # Navigate to <div class="blogMain">
            try:
                blog_main = blog_main_wrapper.find_element(By.CLASS_NAME, "blogMain")
            except NoSuchElementException:
                print(f"Error: Could not find 'blogMain' for {url}.")
                driver.quit()
                return ""
            
            # Navigate to <div class="b-section b-section--fixed-width">
            try:
                b_section = blog_main.find_element(By.CLASS_NAME, "b-section.b-section--fixed-width")
            except NoSuchElementException:
                print(f"Error: Could not find 'b-section b-section--fixed-width' for {url}.")
                driver.quit()
                return ""
            
            # Initialize list to store resume content
            resume_content = []
            
            # Find all direct children that are <p> or <ul> tags
            elements = b_section.find_elements(By.XPATH, "./*[self::p or self::ul]")
            
            for element in elements:
                tag_name = element.tag_name
                
                # Extract <p> (includes content from child tags like <strong>, <span>)
                if tag_name == "p":
                    text = clean_text(element.text)
                    if text:
                        resume_content.append(text)
                
                # Extract from <ul> (includes <li> and their child content)
                elif tag_name == "ul":
                    try:
                        li_items = element.find_elements(By.TAG_NAME, "li")
                        for li in li_items:
                            text = clean_text(li.text)
                            if text:
                                resume_content.append(text)
                    except NoSuchElementException:
                        pass
            
            # Join all content with a single space
            full_content = " ".join(resume_content)
            driver.quit()
            return full_content
        
        except WebDriverException as e:
            print(f"Error on attempt {attempt + 1} for {url}: {str(e)}")
            if attempt == max_retries:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                if driver:
                    driver.quit()
                return ""
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
            if attempt == max_retries:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                if driver:
                    driver.quit()
                return ""
        finally:
            if driver:
                try:
                    driver.quit()
                except:
                    pass

# Function to process a single row (for parallel execution)
def process_row(row):
    category = row["Category"]
    url = row["Resume_link"]
    print(f"Crawling resume for {category}...")
    resume_content = crawl_resume_content(url, category, max_retries=1)
    return {"Category": category, "Resume": resume_content}

In [7]:
# Use ThreadPoolExecutor for parallel crawling (2 workers for stability)
with ThreadPoolExecutor(max_workers=2) as executor:
    # Submit all tasks
    future_to_row = {executor.submit(process_row, row): row for _, row in df.iterrows()}
    
    # Collect results as they complete
    for future in as_completed(future_to_row):
        result = future.result()
        results.append(result)
        # Small delay to avoid overwhelming the server
        time.sleep(0.5)

Crawling resume for Data Scientist...Crawling resume for IT...

Info: Successfully loaded https://resumelab.com/resume-examples/it on attempt 1
Info: Successfully loaded https://resumelab.com/resume-examples/data-scientist on attempt 1
Info: Blog main wrapper detected for https://resumelab.com/resume-examples/data-scientist
Info: Blog main wrapper detected for https://resumelab.com/resume-examples/it
Crawling resume for UX Designer...
Crawling resume for .NET Developer...
Info: Successfully loaded https://resumelab.com/resume-examples/ui-ux on attempt 1
Info: Blog main wrapper detected for https://resumelab.com/resume-examples/ui-ux
Info: Successfully loaded https://resumelab.com/resume-examples/net-developer on attempt 1
Info: Blog main wrapper detected for https://resumelab.com/resume-examples/net-developer
Crawling resume for Android Developer...
Crawling resume for Animator...
Info: Successfully loaded https://resumelab.com/resume-examples/animator on attempt 1
Info: Blog main wrap

In [9]:
# Create a DataFrame with the results
result_df = pd.DataFrame(results)
# Ensure the DataFrame has the correct columns in the right order
result_df = result_df[["Category", "Resume"]]

In [10]:
result_df.head(5)

Unnamed: 0,Category,Resume
0,Data Scientist,"Anne Lounsberry Data Scientist, Microsoft Cert..."
1,IT,Kevin Park IT Specialist (218) 544-3609 kevin....
2,UX Designer,Pete Rybarski UX Designer & UI Developer Perso...
3,.NET Developer,Doreen J. Terry .NET Developer Personal Info 2...
4,Animator,Scott Phillips Animator Personal Info Phone: 6...


In [11]:
# Save to Excel
result_df.to_excel("D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/NewITData/finalresumelabITlink.xlsx", index=False)
print("Crawling complete. Results saved to Crawled_Resumes.xlsx")

Crawling complete. Results saved to Crawled_Resumes.xlsx
