In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [2]:
# selenium-stealth is an optional dependency (install with: pip install selenium-stealth).
# Default to "not available" and let a successful import replace the placeholder;
# code that uses `stealth` checks its truthiness before calling it.
stealth = None
try:
    from selenium_stealth import stealth
except ImportError:
    print("Warning: selenium-stealth not installed. Some anti-bot measures may still block the script.")

In [3]:
# Read the input Excel file that drives the crawl.
# Later cells read the "Category" and "Resume_link" columns from every row of `df`.
# NOTE(review): this is an absolute path on a D: drive — presumably specific to the
# author's machine; confirm/adjust it before running the notebook elsewhere.
input_file = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/zetyITLink.xlsx"
df = pd.read_excel(input_file)

In [4]:
# Result accumulators: the crawl loop appends one category label and one
# resume text to these two parallel lists for every input row.
categories, resumes = [], []

In [5]:
def clean_text(text):
    """Normalise whitespace in ``text`` and return the trimmed result.

    Three substitutions run in order, each replacing its match with a single
    space: runs of newlines/carriage returns/tabs, non-breaking spaces, and
    finally any remaining run of whitespace. Leading and trailing whitespace
    is then stripped.
    """
    whitespace_patterns = (
        r'[\n\r\t]+',  # line breaks and tabs
        r'\xa0',       # non-breaking space
        r'\s+',        # collapse whatever whitespace runs are left
    )
    cleaned = text
    for pattern in whitespace_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

In [6]:
# Crawl the resume text from a single URL: private helpers first, entry point last.

def _safe_filename_part(name):
    """Return ``name`` as text that can be embedded in a file name.

    Path separators and the other characters Windows forbids in file names
    are replaced with ``_`` so that a category such as ``CI/CD`` cannot break
    the debug-artifact paths. Other characters (spaces, dots) are kept.
    """
    return re.sub(r'[\\/:*?"<>|]', '_', str(name))


def _save_debug_artifacts(driver, category):
    """Write a screenshot, the page source and the browser console log to the working directory."""
    tag = _safe_filename_part(category)
    driver.save_screenshot(f"screenshot_{tag}.png")
    with open(f"page_source_{tag}.html", "w", encoding="utf-8") as f:
        f.write(driver.page_source)
    # JavaScript console entries reported by the browser
    console_logs = driver.get_log("browser")
    with open(f"console_logs_{tag}.txt", "w", encoding="utf-8") as f:
        for log in console_logs:
            f.write(str(log) + "\n")


def _wait_for_resume_content(driver, url, category):
    """Wait until a <p>, <h3> or <ul> exists; return True on success, False after saving debug artifacts."""
    content_locator = (By.XPATH, "//p | //h3 | //ul")
    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located(content_locator))
        print(f"Info: Resume content detected for {url}")
        return True
    except TimeoutException:
        pass  # fall through to the CAPTCHA / anti-bot check

    try:
        # Raises NoSuchElementException when the page shows no CAPTCHA-like text.
        driver.find_element(By.XPATH, "//*[contains(text(), 'CAPTCHA') or contains(text(), 'verify') or contains(text(), 'robot')]")
        print(f"Warning: CAPTCHA detected for {url}. Please solve the CAPTCHA manually in the browser window.")
        time.sleep(30)  # Pause to allow manual CAPTCHA solving
        # Re-check for content after CAPTCHA solving
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(content_locator))
        print(f"Info: Resume content detected after CAPTCHA solving for {url}")
        return True
    except (NoSuchElementException, TimeoutException):
        _save_debug_artifacts(driver, category)
        print(f"Error: Timeout waiting for resume content for {url}. Screenshot, page source, and console logs saved.")
        return False


def _find_resume_section(driver, url):
    """Return the element that holds the resume, trying progressively broader locators; None if all fail."""
    # Primary path: wrapper -> English blog body -> fixed-width section.
    try:
        blog_main_wrapper = driver.find_element(By.CLASS_NAME, "blog-main__wrapper")
        blog_main = blog_main_wrapper.find_element(By.CLASS_NAME, "blog-main--en")
        section = blog_main.find_element(By.CLASS_NAME, "b-section--fixed-width")
        print(f"Info: Used primary navigation path for {url}")
        return section
    except NoSuchElementException:
        pass

    # Fallbacks, tried in order from most to least specific.
    fallbacks = (
        (By.CLASS_NAME, "b-section--fixed-width",
         f"Warning: Used fallback to b-section--fixed-width for {url}"),
        (By.XPATH, "//article | //section[contains(@class, 'blog') or contains(@class, 'content')]",
         f"Warning: Used generic article/section fallback for {url}"),
        (By.XPATH, "//div[contains(@class, 'post') or contains(@class, 'content') or contains(@class, 'resume') or contains(@class, 'main')]",
         f"Warning: Used div fallback (post/content/resume/main) for {url}"),
        (By.XPATH, "//div[.//p or .//h3 or .//ul][contains(@class, 'section') or contains(@class, 'resume') or contains(@class, 'content')]",
         f"Warning: Used generic resume-like container fallback for {url}"),
        (By.XPATH, "//*[.//p | .//h3 | .//ul]",
         f"Warning: Used broad resume-like content fallback for {url}"),
    )
    for by, value, message in fallbacks:
        try:
            section = driver.find_element(by, value)
        except NoSuchElementException:
            continue
        print(message)
        return section
    return None


def _emphasised_or_own_text(element, preferred_tags):
    """Return the text of the first descendant found among ``preferred_tags``, else the element's own text."""
    for tag in preferred_tags:
        try:
            return element.find_element(By.TAG_NAME, tag).text
        except NoSuchElementException:
            continue
    return element.text


def _collect_section_text(section):
    """Return the cleaned, non-empty text fragments of the section's direct <p>, <h3> and <ul> children."""
    # Which nested tags take priority over the element's own text:
    # <p> prefers <strong>, then <em> (page format 2); <h3> prefers <strong> (page format 1).
    preferred_tags = {"p": ("strong", "em"), "h3": ("strong",)}

    fragments = []
    for element in section.find_elements(By.XPATH, "./*[self::p or self::h3 or self::ul]"):
        tag_name = element.tag_name
        if tag_name in preferred_tags:
            raw_texts = [_emphasised_or_own_text(element, preferred_tags[tag_name])]
        elif tag_name == "ul":
            # Always take the full text of each <li>, including <span> and text outside it
            raw_texts = [li.text for li in element.find_elements(By.TAG_NAME, "li")]
        else:
            continue
        for raw in raw_texts:
            text = clean_text(raw)
            if text:
                fragments.append(text)
    return fragments


def crawl_resume_content(url, category):
    """Open ``url`` in a fresh Chrome session and return the resume text found on the page.

    Args:
        url: Address of the page to crawl.
        category: Label of the row being crawled; used only to name the debug
            files written when the crawl fails.

    Returns:
        The cleaned text of the direct-child <p>, <h3> and <ul> elements of the
        resume container, joined with single spaces. Returns "" when no content
        appears in time, no container can be located, or any exception occurs.

    Side effects:
        Prints progress messages. On a content timeout or a missing container,
        writes ``screenshot_<category>.png``, ``page_source_<category>.html`` and
        ``console_logs_<category>.txt`` to the current working directory.
        The browser is always closed before returning, including on errors.
    """
    driver = None
    try:
        # Set up Selenium WebDriver
        chrome_options = Options()
        # Headless mode is intentionally left off so a CAPTCHA can be solved by hand.
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_argument("--disable-gpu")  # Sometimes helps with rendering

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        # Apply stealth settings if selenium-stealth is available
        if stealth:
            stealth(driver,
                    languages=["en-US", "en"],
                    vendor="Google Inc.",
                    platform="Win32",
                    webgl_vendor="Intel Inc.",
                    renderer="Intel Iris OpenGL Engine",
                    fix_hairline=True,
            )

        driver.get(url)

        # Scroll to trigger dynamic content loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        if not _wait_for_resume_content(driver, url, category):
            return ""

        section = _find_resume_section(driver, url)
        if section is None:
            _save_debug_artifacts(driver, category)
            print(f"Error: Could not find any suitable container for {url}. Screenshot, page source, and console logs saved.")
            return ""

        # Join all content with a single space
        return " ".join(_collect_section_text(section))
    except Exception as e:
        print(f"Error crawling {url}: {str(e)}")
        return ""
    finally:
        # Close the browser on every exit path; previously an exception after
        # start-up left the Chrome process running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Warning: Could not close the browser for {url}: {quit_error}")

In [7]:
# Crawl every row of the input sheet and collect one resume text per category.
# The accumulators are reset here so that re-running this cell does not append
# a second copy of every row to the results of an earlier run.
categories = []
resumes = []

for category, url in zip(df["Category"], df["Resume_link"]):
    # A blank link cannot be opened; record an empty resume without launching a browser
    # so the output keeps one row per input row.
    if pd.isna(url) or not str(url).strip():
        print(f"Warning: No resume link for {category}; skipping crawl.")
        categories.append(category)
        resumes.append("")
        continue

    print(f"Crawling resume for {category}...")

    # Crawl resume content ("" is returned when the crawl fails)
    resume_content = crawl_resume_content(url, category)

    # Append to results
    categories.append(category)
    resumes.append(resume_content)

    # Small delay to avoid overwhelming the server
    time.sleep(1)

# Create a new DataFrame with the results
result_df = pd.DataFrame({
    "Category": categories,
    "Resume": resumes
})

Crawling resume for .NET Developer...
Info: Resume content detected for https://zety.com/blog/net-developer-resume-example
Info: Used primary navigation path for https://zety.com/blog/net-developer-resume-example
Crawling resume for 3D Artist...
Info: Resume content detected for https://zety.com/blog/3d-artist-resume-example
Info: Used primary navigation path for https://zety.com/blog/3d-artist-resume-example
Crawling resume for Active Directory...
Info: Resume content detected for https://zety.com/blog/active-directory-resume-example
Info: Used primary navigation path for https://zety.com/blog/active-directory-resume-example
Crawling resume for Agile Coach...
Info: Resume content detected for https://zety.com/blog/agile-coach-resume-example
Info: Used primary navigation path for https://zety.com/blog/agile-coach-resume-example
Crawling resume for Agile Project Manager...
Info: Resume content detected for https://zety.com/blog/agile-project-manager-resume-example
Info: Used primary nav

In [8]:
# Wrap the crawl results in a separate DataFrame object used for inspection.
df_out = pd.DataFrame(data=result_df)

In [9]:
# Preview the first five crawled rows (head() defaults to 5).
df_out.head()

Unnamed: 0,Category,Resume
0,.NET Developer,Jane Fisher jane.q.fisher@gmail.com 757-646-16...
1,3D Artist,Jimmie D. Brooke 3D Artist jimmiedebrooke@mail...
2,Active Directory,Wesley S. Leasure Active Directory Administrat...
3,Agile Coach,Alexander J. Stinson Agile Coach 325-365-6272 ...
4,Agile Project Manager,Derrick Murrieta Agile Project Manager 330-542...


In [10]:
# Save the crawled resumes to Excel (without the DataFrame index column).
output_file = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/NewITData/finalzetyitlinklink.xlsx"
result_df.to_excel(output_file, index=False)
# Report the file that was actually written; the old message named
# "Crawled_Resumes.xlsx", which is not the output path.
print(f"Crawling complete. Results saved to {output_file}")

Crawling complete. Results saved to Crawled_Resumes.xlsx
