In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, StaleElementReferenceException
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
# Optional: Install selenium-stealth if needed (uncomment to use)
# pip install selenium-stealth
# selenium-stealth is optional: start from "unavailable" and let a successful
# import replace the placeholder with the real helper.
stealth = None
try:
    from selenium_stealth import stealth
except ImportError:
    print("Warning: selenium-stealth not installed. Some anti-bot measures may still block the script.")

In [3]:
# Read the input Excel file
input_file = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/resumekraftITlink.xlsx"
df = pd.read_excel(input_file)

# Fail fast, before any browser is started, if the sheet lacks the columns
# that process_row() reads from every row.
missing_columns = {"Category", "Resume_link"} - set(df.columns)
if missing_columns:
    raise ValueError(f"{input_file} is missing required column(s): {sorted(missing_columns)}")

In [4]:
# Function to clean text: remove newlines, extra spaces, and special characters
def clean_text(text):
    """Flatten ``text`` onto a single line separated by single spaces.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with line breaks, tabs, non-breaking spaces and any other
        whitespace runs replaced by one space, and with leading/trailing
        whitespace stripped.
    """
    # Applied in order: newline/tab runs, non-breaking spaces, then whatever
    # whitespace runs are left.
    for pattern in (r'[\n\r\t]+', r'\xa0', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.strip()

In [5]:
# Function to set up WebDriver with enhanced anti-bot measures
def setup_driver():
    """Create the Chrome WebDriver used for one crawl attempt.

    The browser is started with a fixed desktop user agent, a 1920x1080
    window and several switches intended to make automation less visible.
    No ``--headless`` argument is passed, so the browser window is shown.
    When the optional ``stealth`` helper is available it is applied to the
    new driver before it is returned.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = Options()

    # Command-line switches, passed to Chrome in this order.
    launch_arguments = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "--disable-blink-features=AutomationControlled",
        "--window-size=1920,1080",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-extensions",
        "--disable-infobars",
    )
    for argument in launch_arguments:
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])

    # The chromedriver binary path comes from ChromeDriverManager().install().
    chromedriver_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chromedriver_service, options=chrome_options)

    # Without the optional stealth helper the plain driver is returned as is.
    if not stealth:
        return driver

    stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    return driver


In [6]:
# Function to crawl Dạng 1 content with improved stale element handling
# Lookups on individual nodes are best-effort: a node that is absent or has
# gone stale is skipped instead of aborting the crawl.
_MISSING_OR_STALE = (NoSuchElementException, StaleElementReferenceException)


def _append_clean_text(element, collected):
    """Append ``clean_text(element.text)`` to ``collected`` unless it is empty."""
    text = clean_text(element.text)
    if text:
        collected.append(text)


def _collect_article(article, collected):
    """Collect the heading, eduCompany details and bullet items of one <article>."""
    # <h4> heading of the entry
    try:
        _append_clean_text(article.find_element(By.TAG_NAME, "h4"), collected)
    except _MISSING_OR_STALE:
        pass

    # <div class="eduCompany">: <strong>, <p>, then the <span> in "subDetails"
    try:
        edu_company = article.find_element(By.CLASS_NAME, "eduCompany")
    except _MISSING_OR_STALE:
        edu_company = None
    if edu_company is not None:
        for tag in ("strong", "p"):
            try:
                _append_clean_text(edu_company.find_element(By.TAG_NAME, tag), collected)
            except _MISSING_OR_STALE:
                pass
        try:
            sub_details = edu_company.find_element(By.CLASS_NAME, "subDetails")
            _append_clean_text(sub_details.find_element(By.TAG_NAME, "span"), collected)
        except _MISSING_OR_STALE:
            pass

    # Every descendant <div> may hold a <ul>; take the <li> items of the first
    # <ul> found under each one.
    try:
        div_count = len(article.find_elements(By.TAG_NAME, "div"))
        for div_idx in range(div_count):
            # Re-fetch by position so the reference is fresh.
            div = article.find_elements(By.TAG_NAME, "div")[div_idx]
            try:
                ul = div.find_element(By.TAG_NAME, "ul")
                for li in ul.find_elements(By.TAG_NAME, "li"):
                    _append_clean_text(li, collected)
            except _MISSING_OR_STALE:
                continue
    except _MISSING_OR_STALE:
        pass


def _collect_section(section, collected):
    """Collect the text of one <div class="r-section"> into ``collected``.

    A section without a readable <h3> or without an "r-sec-data" child
    contributes nothing beyond what was appended before the lookup failed.
    """
    try:
        _append_clean_text(section.find_element(By.TAG_NAME, "h3"), collected)
    except _MISSING_OR_STALE:
        return

    try:
        sec_data = section.find_element(By.CLASS_NAME, "r-sec-data")
    except _MISSING_OR_STALE:
        return

    # All <p> descendants of the section data
    try:
        for p in sec_data.find_elements(By.TAG_NAME, "p"):
            _append_clean_text(p, collected)
    except _MISSING_OR_STALE:
        pass

    # <ul class="keySkills list-group"> (Skills section)
    try:
        skills_ul = sec_data.find_element(By.CLASS_NAME, "keySkills.list-group")
        for li in skills_ul.find_elements(By.CLASS_NAME, "list-group-item"):
            _append_clean_text(li, collected)
    except _MISSING_OR_STALE:
        pass

    # <ul class="keySkills"> (Languages section)
    # NOTE(review): "keySkills" also matches the Skills list above; if its
    # "list-group-item" nodes are <li> elements they are collected twice.
    # Confirm against the live markup before changing the selector.
    try:
        languages_ul = sec_data.find_element(By.CLASS_NAME, "keySkills")
        for li in languages_ul.find_elements(By.TAG_NAME, "li"):
            _append_clean_text(li, collected)
    except _MISSING_OR_STALE:
        pass

    # <article> entries (Work Experience and Education sections)
    try:
        article_count = len(sec_data.find_elements(By.TAG_NAME, "article"))
        for article_idx in range(article_count):
            # Re-fetch by position so the reference is fresh.
            article = sec_data.find_elements(By.TAG_NAME, "article")[article_idx]
            _collect_article(article, collected)
    except _MISSING_OR_STALE:
        pass


def crawl_dang_1(resume_format, resume_content, driver):
    """Collect the text of a "Dạng 1" (type 1) resume into ``resume_content``.

    The header block ("r-section-head": <h3>, <h4>, <p>) is read first, then
    every "r-section" block in order.

    The section pass is retried up to two more times when a
    ``StaleElementReferenceException`` escapes it. Each attempt writes into
    its own buffer, so a retry never duplicates text that an earlier, failed
    attempt had already produced. A successful attempt is appended to
    ``resume_content``; if every attempt fails, the most complete partial
    attempt is appended instead.

    Args:
        resume_format: Element that is searched for the header and section
            blocks.
        resume_content: List of strings, extended in place.
        driver: Not used here; kept so existing callers keep working.

    Raises:
        TimeoutException: If no "r-section" element is present within the
            5 second wait. Text buffered by the interrupted attempt is not
            added in that case.
    """
    # Header block. Only a missing header or missing child tag is tolerated;
    # a stale header propagates to the caller.
    try:
        section_head = WebDriverWait(resume_format, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "r-section-head"))
        )
        for tag in ("h3", "h4", "p"):
            try:
                _append_clean_text(section_head.find_element(By.TAG_NAME, tag), resume_content)
            except NoSuchElementException:
                pass
    except (NoSuchElementException, TimeoutException):
        print("Warning: 'r-section-head' not found.")

    # Sections are always looked up again right before use to avoid stale
    # references.
    def get_sections():
        return WebDriverWait(resume_format, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "r-section"))
        )

    max_retries = 2
    best_partial = []
    for attempt in range(max_retries + 1):
        attempt_content = []
        try:
            for section_idx in range(len(get_sections())):
                section = get_sections()[section_idx]
                _collect_section(section, attempt_content)
            resume_content.extend(attempt_content)
            return
        except StaleElementReferenceException as e:
            # Remember the fullest failed attempt as a best-effort fallback.
            if len(attempt_content) > len(best_partial):
                best_partial = attempt_content
            if attempt == max_retries:
                print(f"Error: Failed to process sections after {max_retries + 1} attempts due to stale elements: {str(e)}")
            else:
                print(f"Stale element encountered on attempt {attempt + 1}, retrying...")

    # Every attempt failed: keep what the best one managed to read.
    resume_content.extend(best_partial)

In [7]:
# Function to crawl Dạng 2 content with improved stale element handling
def crawl_dang_2(sample_resume_data, resume_content, driver):
    """Collect the text of a "Dạng 2" (type 2) resume into ``resume_content``.

    Only the direct <p>, <h3>, <ul> and <ol> children of
    ``sample_resume_data`` are read, in document order; other children (such
    as the leading <h2>) are not selected by the XPath.

    The pass is retried up to two more times when a
    ``StaleElementReferenceException`` escapes it. Each attempt writes into
    its own buffer, so a retry never duplicates text that an earlier, failed
    attempt had already produced. A successful attempt is appended to
    ``resume_content``; if every attempt fails, the most complete partial
    attempt is appended instead.

    Args:
        sample_resume_data: Element whose direct children are read.
        resume_content: List of strings, extended in place.
        driver: Not used here; kept so existing callers keep working.

    Raises:
        TimeoutException: If no matching child is present within the
            5 second wait. Text buffered by the interrupted attempt is not
            added in that case.
    """
    # Children are always looked up again right before use to avoid stale
    # references.
    def get_elements():
        return WebDriverWait(sample_resume_data, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, "./*[self::p or self::h3 or self::ul or self::ol]"))
        )

    def add_text(node, collected):
        # Keep only nodes that still have text after cleaning.
        text = clean_text(node.text)
        if text:
            collected.append(text)

    max_retries = 2
    best_partial = []
    for attempt in range(max_retries + 1):
        attempt_content = []
        try:
            for element_idx in range(len(get_elements())):
                element = get_elements()[element_idx]
                tag_name = element.tag_name

                # <p> and <h3>: .text already includes nested <strong>/<a> text
                if tag_name in ("p", "h3"):
                    add_text(element, attempt_content)

                # <ul>: "list-group" lists are read via their
                # "list-group-item" nodes, any other list via its <li> nodes
                elif tag_name == "ul":
                    try:
                        # get_attribute may return None; treat that as "no class".
                        css_classes = element.get_attribute("class") or ""
                        if "list-group" in css_classes:
                            li_items = element.find_elements(By.CLASS_NAME, "list-group-item")
                        else:
                            li_items = element.find_elements(By.TAG_NAME, "li")
                        for li in li_items:
                            add_text(li, attempt_content)
                    except (NoSuchElementException, StaleElementReferenceException):
                        pass

                # <ol> (Languages)
                elif tag_name == "ol":
                    for li in element.find_elements(By.TAG_NAME, "li"):
                        add_text(li, attempt_content)

            resume_content.extend(attempt_content)
            return
        except StaleElementReferenceException as e:
            # Remember the fullest failed attempt as a best-effort fallback.
            if len(attempt_content) > len(best_partial):
                best_partial = attempt_content
            if attempt == max_retries:
                print(f"Error: Failed to process elements after {max_retries + 1} attempts due to stale elements: {str(e)}")
            else:
                print(f"Stale element encountered on attempt {attempt + 1}, retrying...")

    # Every attempt failed: keep what the best one managed to read.
    resume_content.extend(best_partial)


In [8]:
# Function to crawl resume content for a single URL
def crawl_resume_content(url, category, max_retries=1):
    """Load ``url`` in a fresh browser and return the resume text found there.

    Every attempt starts its own driver via ``setup_driver()`` and always
    shuts it down again in the ``finally`` clause, whichever path leaves the
    attempt.

    Args:
        url: Page to crawl.
        category: Not used here; kept so existing callers keep working.
        max_retries: Number of extra attempts after the first one.

    Returns:
        The collected text fragments joined by single spaces, or ``""`` when
        the content never appears, neither page layout is recognised, or all
        attempts fail.
    """
    for attempt in range(max_retries + 1):
        driver = None
        try:
            driver = setup_driver()

            driver.get(url)
            print(f"Info: Successfully loaded {url} on attempt {attempt + 1}")

            # Wait up to 20 seconds for the resume container to appear.
            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "sample-resume-data"))
                )
                print(f"Info: Resume content detected for {url}")
            except TimeoutException:
                # No content: look for text that suggests a CAPTCHA / anti-bot page.
                try:
                    driver.find_element(By.XPATH, "//*[contains(text(), 'CAPTCHA') or contains(text(), 'verify') or contains(text(), 'robot')]")
                    print(f"Warning: CAPTCHA detected for {url}. Please solve the CAPTCHA manually in the browser window.")
                    time.sleep(30)  # Pause to allow manual CAPTCHA solving
                    # Re-check for content after CAPTCHA solving
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "sample-resume-data"))
                    )
                    print(f"Info: Resume content detected after CAPTCHA solving for {url}")
                except (NoSuchElementException, TimeoutException):
                    print(f"Error: Timeout waiting for resume content for {url} after {attempt + 1} attempts.")
                    if attempt == max_retries:
                        return ""
                    continue

            resume_content = []

            # Dạng 1: a "sample-resume-data resume-format" element inside the
            # "sample-resume-data" container.
            # NOTE(review): a NoSuchElementException/TimeoutException raised
            # inside crawl_dang_1 itself is also caught here and sends the page
            # down the Dạng 2 path. Confirm whether that fallback is intended.
            try:
                main_container = driver.find_element(By.CLASS_NAME, "sample-resume-data")
                resume_format = WebDriverWait(main_container, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "sample-resume-data.resume-format"))
                )
                print(f"Info: Detected Dạng 1 for {url}")
                crawl_dang_1(resume_format, resume_content, driver)
            except (NoSuchElementException, TimeoutException):
                # Dạng 2: "sample-resume-data" inside "site-content container".
                try:
                    site_content = driver.find_element(By.CLASS_NAME, "site-content.container")
                    sample_resume_data = WebDriverWait(site_content, 5).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "sample-resume-data"))
                    )
                    print(f"Info: Detected Dạng 2 for {url}")
                    crawl_dang_2(sample_resume_data, resume_content, driver)
                except (NoSuchElementException, TimeoutException):
                    print(f"Error: Could not find suitable container for {url} (neither Dạng 1 nor Dạng 2).")
                    return ""

            # Join all content with a single space
            return " ".join(resume_content)

        except WebDriverException as e:
            print(f"Error on attempt {attempt + 1} for {url}: {str(e)}")
            if attempt == max_retries:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                return ""
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
            if attempt == max_retries:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                return ""
        finally:
            # Single cleanup point for every exit path (return, continue,
            # exception). A failing quit must not mask the attempt's outcome.
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    pass

    # Only reached when the loop body never ran (negative max_retries).
    return ""

In [9]:
# Initialize lists to store results
# Result accumulators; each name is bound to its own fresh, empty list.
categories, resumes, results = [], [], []

In [10]:
# Function to process a single row (for parallel execution)
def process_row(row):
    """Crawl the resume referenced by one input row.

    Args:
        row: Mapping-like row providing "Category" and "Resume_link".

    Returns:
        A dict with the row's "Category" and the crawled text under "Resume".
    """
    category, url = row["Category"], row["Resume_link"]
    print(f"Crawling resume for {category}...")
    return {
        "Category": category,
        "Resume": crawl_resume_content(url, category, max_retries=1),
    }

In [11]:
# Use ThreadPoolExecutor for parallel crawling (reduced to 2 workers)
with ThreadPoolExecutor(max_workers=2) as executor:
    # Submit all tasks
    future_to_row = {executor.submit(process_row, row): row for _, row in df.iterrows()}

    # Collect results as they complete (completion order, not sheet order).
    # The request rate is bounded by max_workers; pausing in this loop would
    # only delay collection, because every task has already been submitted.
    for future in as_completed(future_to_row):
        try:
            result = future.result()
        except Exception as e:
            # A worker that raised must not abort the run and discard what the
            # other rows produced; record the row with an empty resume instead.
            failed_row = future_to_row[future]
            print(f"Error: worker failed for {failed_row.get('Resume_link')}: {e}")
            result = {"Category": failed_row.get("Category"), "Resume": ""}
        results.append(result)

Crawling resume for DevOps Engineer...Crawling resume for Back-End Developer...

Info: Successfully loaded https://resumekraft.com/back-end-developer-resume-sample-2/ on attempt 1
Info: Resume content detected for https://resumekraft.com/back-end-developer-resume-sample-2/
Info: Detected Dạng 1 for https://resumekraft.com/back-end-developer-resume-sample-2/
Info: Successfully loaded https://resumekraft.com/devops-engineer-resume-sample-2/ on attempt 1
Info: Resume content detected for https://resumekraft.com/devops-engineer-resume-sample-2/
Info: Detected Dạng 1 for https://resumekraft.com/devops-engineer-resume-sample-2/
Crawling resume for Front-End Developer...
Crawling resume for Full-Stack Developer...
Info: Successfully loaded https://resumekraft.com/full-stack-developer-resume-sample/ on attempt 1
Info: Resume content detected for https://resumekraft.com/full-stack-developer-resume-sample/
Info: Successfully loaded https://resumekraft.com/front-end-developer-resume-sample/ on at

In [16]:
# Pin the columns so the frame has them even when nothing was crawled;
# otherwise selecting "Category"/"Resume" from an empty frame raises KeyError.
result_df = pd.DataFrame(results, columns=["Category", "Resume"])

In [17]:
# Preview the first rows of the crawl result (head() defaults to five rows).
result_df.head()

Unnamed: 0,Category,Resume
0,Back-End Developer,Victor Brandon Back-End Developer Summary High...
1,DevOps Engineer,Christa Nathan DevOps Engineer Summary Results...
2,Full-Stack Developer,James Lauren Full-Stack Developer Summary High...
3,Front-End Developer,James Marsh Front-End Developer Summary Highly...
4,IT Support Specialist,Eric Michael IT Support Specialist Summary Exp...


In [18]:
# Ensure the DataFrame has the correct columns in the right order
# Keep exactly these two columns, in this order, for the export.
result_df = result_df.loc[:, ["Category", "Resume"]]

In [19]:
# Save to Excel
output_file = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/NewITData/finalresumekraftITlink.xlsx"
result_df.to_excel(output_file, index=False)
# Report the path that was actually written.
print(f"Crawling complete. Results saved to {output_file}")

Crawling complete. Results saved to Crawled_Resumes.xlsx
