In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# selenium-stealth is an optional dependency (install with: pip install selenium-stealth).
# Start from None so later code can check `if stealth:` before using it;
# the name is only rebound when the import succeeds.
stealth = None
try:
    from selenium_stealth import stealth
except ImportError:
    print("Warning: selenium-stealth not installed. Some anti-bot measures may still block the script.")

In [3]:
# Location of the input workbook; later cells read its "Category" and
# "Resume_link" columns row by row.
INPUT_XLSX_PATH = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/resumetrickITlink.xlsx"

# Load the list of pages to crawl
df = pd.read_excel(INPUT_XLSX_PATH)

In [4]:
# Accumulator for crawled records; the crawl cell extends it with
# {"Category": ..., "Resume": ...} dicts as workers finish
all_results = list()

In [5]:
# Normalize scraped text into a single line with single-space separators
def clean_text(text):
    """Flatten `text` to one line.

    Each pattern below is replaced by a single space, in order, and the
    result is stripped of leading/trailing whitespace.
    """
    whitespace_patterns = (
        r'[\n\r\t]+',  # runs of newlines, carriage returns and tabs
        r'\xa0',       # non-breaking spaces
        r'\s+',        # whatever whitespace runs are left
    )
    flattened = text
    for pattern in whitespace_patterns:
        flattened = re.sub(pattern, ' ', flattened)
    return flattened.strip()

In [6]:
# Function to set up WebDriver with anti-bot measures
def setup_driver(max_retries=2):
    """Start a Chrome WebDriver configured to look less like automation.

    No headless flag is added, so a real browser window is opened. If
    ``selenium_stealth`` was imported successfully, its ``stealth`` patch is
    applied to the new driver.

    Args:
        max_retries: Number of extra attempts made after the first one fails
            with a ``WebDriverException`` (``max_retries + 1`` attempts total).

    Returns:
        The initialized Chrome WebDriver.

    Raises:
        Exception: If every attempt fails with ``WebDriverException``; the
            last such error is attached as the cause.
    """
    for attempt in range(max_retries + 1):
        try:
            chrome_options = Options()
            # No "--headless" argument is added: the browser runs visibly.
            chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-infobars")

            # ChromeDriverManager resolves a chromedriver binary for the Service
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )

            # From here on a browser process exists. If the stealth patch
            # fails, close that browser before propagating so a retry (or the
            # caller) does not leave an orphaned Chrome window behind.
            try:
                if stealth:
                    stealth(driver,
                            languages=["en-US", "en"],
                            vendor="Google Inc.",
                            platform="Win32",
                            webgl_vendor="Intel Inc.",
                            renderer="Intel Iris OpenGL Engine",
                            fix_hairline=True,
                    )
            except Exception:
                try:
                    driver.quit()
                except Exception as quit_error:
                    # Best-effort cleanup: report, but keep the original error
                    print(f"Warning: Error closing browser after failed setup: {quit_error}")
                raise

            print(f"Info: WebDriver initialized successfully on attempt {attempt + 1}")
            return driver

        except WebDriverException as e:
            print(f"Error initializing WebDriver on attempt {attempt + 1}: {str(e)}")
            if attempt == max_retries:
                raise Exception(f"Failed to initialize WebDriver after {max_retries + 1} attempts: {str(e)}") from e
            time.sleep(2)  # Wait before retrying

# Function to handle pop-ups (e.g., cookie banners)
def handle_popups(driver, url):
    """Click a cookie/consent button on the current page if one shows up.

    Waits up to 5 seconds for a clickable <button> whose text contains
    'Accept', 'Agree' or 'OK'. Nothing is raised: every outcome is reported
    with a printed message that includes `url`.
    """
    consent_button_locator = (
        By.XPATH,
        "//button[contains(text(), 'Accept') or contains(text(), 'Agree') or contains(text(), 'OK')]",
    )
    try:
        # Wait for the button and click it in one step; the click stays inside
        # the try so a failed click is reported as a warning below.
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable(consent_button_locator)
        ).click()
        print(f"Info: Closed cookie banner for {url}")
        time.sleep(1)  # Give the banner a moment to disappear
    except (NoSuchElementException, TimeoutException):
        print(f"Info: No cookie banner found for {url}")
    except Exception as popup_error:
        print(f"Warning: Error closing pop-up for {url}: {popup_error}")

# Function to crawl resume content from the specified structure
def crawl_resume_content(url, category, max_retries=1):
    """Crawl one resume page and return its text as a single record.

    The page is expected to contain an element with both the ``post-grid``
    and ``flex`` classes, holding a ``post-content`` element, holding a
    ``blockquote`` element whose children carry the resume text.

    A fresh browser is started for every attempt and is always closed by the
    ``finally`` clause, whichever way the attempt ends.

    Args:
        url: Address of the page to crawl.
        category: Label stored in the returned record's "Category" field.
        max_retries: Extra attempts after the first one fails
            (``max_retries + 1`` attempts total).

    Returns:
        ``[{"Category": category, "Resume": text}]`` when text was extracted,
        otherwise ``[]`` (missing elements, timeout, or errors on every
        attempt). Errors are printed, not raised.
    """
    # Explicit CSS selector for "has both classes"; a compound name is not a
    # single class name, so it should not be passed through By.CLASS_NAME.
    post_grid_locator = (By.CSS_SELECTOR, ".post-grid.flex")

    for attempt in range(max_retries + 1):
        driver = None
        try:
            # Set up WebDriver
            driver = setup_driver(max_retries=2)

            # Navigate to the URL
            driver.get(url)
            print(f"Info: Successfully loaded {url} on attempt {attempt + 1}")

            # Wait for the main wrapper to load (timeout set to 20 seconds)
            try:
                post_grid = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located(post_grid_locator)
                )
                print(f"Info: Post grid detected for {url}")
            except TimeoutException:
                # Check for CAPTCHA or anti-bot page
                try:
                    driver.find_element(By.XPATH, "//*[contains(text(), 'CAPTCHA') or contains(text(), 'verify') or contains(text(), 'robot')]")
                    print(f"Warning: CAPTCHA detected for {url}. Please solve the CAPTCHA manually in the browser window.")
                    time.sleep(30)  # Pause to allow manual CAPTCHA solving
                    # Re-check for content after CAPTCHA solving
                    post_grid = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(post_grid_locator)
                    )
                    print(f"Info: Post grid detected after CAPTCHA solving for {url}")
                except (NoSuchElementException, TimeoutException):
                    print(f"Error: Timeout waiting for post grid for {url} after {attempt + 1} attempts.")
                    if attempt == max_retries:
                        return []
                    continue

            # Handle pop-ups (e.g., cookie banners)
            handle_popups(driver, url)

            # Navigate to <article class="post-content">
            try:
                post_content = post_grid.find_element(By.CLASS_NAME, "post-content")
            except NoSuchElementException:
                print(f"Error: Could not find 'post-content' for {url}.")
                return []

            # Navigate to <blockquote class="blockquote">
            try:
                blockquote = post_content.find_element(By.CLASS_NAME, "blockquote")
            except NoSuchElementException:
                print(f"Error: Could not find 'blockquote' for {url}.")
                return []

            # Text must be read while the browser is still open; the finally
            # clause closes it after the return value has been computed.
            full_content = _extract_blockquote_text(blockquote)

            # Since ResumeTrick seems to have only one CV per page, return a single entry
            if full_content:
                return [{"Category": category, "Resume": full_content}]
            return []

        except WebDriverException as e:
            print(f"Error on attempt {attempt + 1} for {url}: {str(e)}")
            if attempt == max_retries:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                return []
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
            if attempt == max_retries:
                print(f"Error: Failed to crawl {url} after {max_retries + 1} attempts.")
                return []
        finally:
            # Single place where the browser is closed. Quitting here only
            # (instead of also before each return) avoids sending a second
            # quit to a driver whose session is already gone.
            if driver:
                try:
                    driver.quit()
                except Exception as quit_error:
                    print(f"Warning: Error closing browser for {url}: {quit_error}")

    # Only reached when the loop body never ran (max_retries < 0)
    return []


def _extract_blockquote_text(blockquote):
    """Join the cleaned text of the blockquote's direct div/p/h3/ul children with single spaces."""
    pieces = []
    children = blockquote.find_elements(By.XPATH, "./*[self::div or self::p or self::h3 or self::ul]")
    for child in children:
        if child.tag_name == "ul":
            # Direct <li> children only: an <li>'s text already contains any
            # nested list, so selecting every descendant <li> would repeat
            # the nested items.
            sources = child.find_elements(By.XPATH, "./li")
        else:
            # <div>, <p> and <h3>: element text includes child tags' content
            sources = [child]
        for source in sources:
            text = clean_text(source.text)
            if text:
                pieces.append(text)
    return " ".join(pieces)

# Function to process a single row (for parallel execution)
def process_row(row):
    """Crawl the page referenced by one input row.

    Reads the row's "Category" and "Resume_link" entries, announces the crawl,
    and returns whatever crawl_resume_content returns for that link.
    """
    category, url = row["Category"], row["Resume_link"]
    print(f"Crawling resume for {category}...")
    return crawl_resume_content(url, category, max_retries=1)

In [7]:
# Use ThreadPoolExecutor for parallel crawling (2 workers for stability)

# Start from an empty list so re-running this cell does not append a second
# copy of every record.
all_results.clear()

with ThreadPoolExecutor(max_workers=2) as executor:
    # Submit all tasks up front; the two workers pull them from the queue
    future_to_row = {executor.submit(process_row, row): row for _, row in df.iterrows()}

    # Collect results as they complete (completion order, not input order).
    # No sleep here: every task is already queued, so pausing the collector
    # would not slow the workers' requests, only delay the end of the cell.
    for future in as_completed(future_to_row):
        source_row = future_to_row[future]
        try:
            all_results.extend(future.result())
        except Exception as e:
            # future.result() re-raises whatever the worker raised; report it
            # and keep collecting so one bad row does not abort the rest.
            print(f"Error: Crawling failed for {source_row.get('Resume_link')}: {e}")

Crawling resume for Admin assistant...Crawling resume for Azure Data Engineer...

Info: WebDriver initialized successfully on attempt 1Info: WebDriver initialized successfully on attempt 1

Info: Successfully loaded https://resumetrick.com/blog/azure-data-engineer-resume-examples.html on attempt 1
Info: Successfully loaded https://resumetrick.com/blog/admin-assistant-resume-examples.html on attempt 1
Info: Post grid detected for https://resumetrick.com/blog/azure-data-engineer-resume-examples.html
Info: Post grid detected for https://resumetrick.com/blog/admin-assistant-resume-examples.html
Info: No cookie banner found for https://resumetrick.com/blog/admin-assistant-resume-examples.html
Info: No cookie banner found for https://resumetrick.com/blog/azure-data-engineer-resume-examples.html
Crawling resume for Cloud Engineer...
Crawling resume for Computer Scientist...
Info: WebDriver initialized successfully on attempt 1
Info: Successfully loaded https://resumetrick.com/blog/computer-sc

In [8]:
# Create a DataFrame with the results.
# Passing `columns=` fixes the column order and keeps both columns present
# even when nothing was crawled; building from an empty list without it gives
# a frame with no columns, and selecting ["Category", "Resume"] from that
# raises KeyError.
result_df = pd.DataFrame(all_results, columns=["Category", "Resume"])

In [9]:
# Preview the first five crawled records (head() defaults to n=5)
result_df.head()

Unnamed: 0,Category,Resume
0,Admin assistant,"Linda Spencer New York, NY Email: linda.spence..."
1,Azure Data Engineer,"Sarah Thompson Chicago, IL Email: sarah.thomps..."
2,Computer Scientist,Objective Aspiring student with a strong acade...
3,Cloud Engineer,"John Doel San Francisco, CA Email: john.doel@g..."
4,Cyber Security,Objective: Motivated and detail-oriented recen...


In [10]:
# Save to Excel
OUTPUT_XLSX_PATH = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/NewITData/finalresumetrickITlink.xlsx"
result_df.to_excel(OUTPUT_XLSX_PATH, index=False)
# Report the path that was actually written so the log matches the file on disk
print(f"Crawling complete. Results saved to {OUTPUT_XLSX_PATH}")

Crawling complete. Results saved to Crawled_Resumes.xlsx
