In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException

In [2]:
# Read the input Excel file (one row per resume page to crawl)
input_file = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/ResumeIOLinkIT.xlsx"
df = pd.read_excel(input_file)

# Fail fast with a clear message if the sheet lacks the columns the crawl loop
# reads, instead of hitting a KeyError part-way through the crawl.
required_columns = {"Category", "Resume_link"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"{input_file} is missing required column(s): {sorted(missing_columns)}")

In [3]:
# Accumulators for the crawl results, filled in parallel (one entry per input row)
categories, resumes = [], []

In [4]:
# Function to clean text: remove newlines, extra spaces, and special characters
def clean_text(text):
    """Normalise the whitespace in ``text``.

    Every run of whitespace (newlines, carriage returns, tabs, non-breaking
    spaces, repeated spaces) becomes a single space, and leading/trailing
    whitespace is removed.
    """
    # split() with no separator cuts on any whitespace run and drops the empty
    # pieces at both ends, so joining with one space collapses and trims at once.
    return " ".join(text.split())

In [5]:
# Function to crawl resume content from a single URL (plus its private helpers)

# Class names walked in order, outermost first, to reach the resume container.
_RESUME_CONTAINER_PATH = (
    "blog-post--example",
    "blog-post__content",
    "blog-post__content-inner",
    "blog-post__content-main",
    "blog-post__content-main-wrapper",
    "text-format_example__container",
)
_ENTIRE_CV_ID = "text-format_example_entire_cv"
_SECTION_CLASS = "text-format_example__section_container"
_TITLE_CLASS = "text-format_example__section_title"
_DESCRIPTION_CLASS = "text-format_example__section_description"
_DATES_CLASS = "text-format_example__dates"


def _list_item_texts(ul_element):
    """Return the cleaned text of every <li> found under ``ul_element``."""
    return [clean_text(li.text) for li in ul_element.find_elements(By.TAG_NAME, "li")]


def _append_header(section, out, url):
    """Append the section titles (name, job title) and then the contact info."""
    for title in section.find_elements(By.CLASS_NAME, _TITLE_CLASS):
        out.append(clean_text(title.text))
    description = section.find_element(By.CLASS_NAME, _DESCRIPTION_CLASS)
    out.append(clean_text(description.text))


def _append_profile(section, out, url):
    """Append the profile title and its body text, read from <p> or else <div>."""
    out.append(clean_text(section.find_element(By.CLASS_NAME, _TITLE_CLASS).text))
    description = section.find_element(By.CLASS_NAME, _DESCRIPTION_CLASS)
    for tag in ("p", "div"):
        try:
            body = clean_text(description.find_element(By.TAG_NAME, tag).text)
        except NoSuchElementException:
            continue
        # The first tag that exists wins, even when its text is empty.
        if body:
            out.append(body)
        return
    print(f"Warning: No profile content (p or div) found for {url}")


def _append_work_experience(section, out, url):
    """Append the title, then each date line followed by the bullets of the <ul> at the same position."""
    out.append(clean_text(section.find_element(By.CLASS_NAME, _TITLE_CLASS).text))
    description = section.find_element(By.CLASS_NAME, _DESCRIPTION_CLASS)
    date_paragraphs = description.find_elements(By.CLASS_NAME, _DATES_CLASS)
    ul_elements = description.find_elements(By.TAG_NAME, "ul")
    for i, date_p in enumerate(date_paragraphs):
        out.append(clean_text(date_p.text))
        # Dates and lists are paired by position; a date without a list is kept alone.
        if i < len(ul_elements):
            out.extend(_list_item_texts(ul_elements[i]))


def _append_education(section, out, url):
    """Append the education title and every date line below it."""
    out.append(clean_text(section.find_element(By.CLASS_NAME, _TITLE_CLASS).text))
    description = section.find_element(By.CLASS_NAME, _DESCRIPTION_CLASS)
    for date_p in description.find_elements(By.CLASS_NAME, _DATES_CLASS):
        out.append(clean_text(date_p.text))


def _append_languages_and_skills(section, out, url):
    """Append up to two titles (languages, skills), each followed by the <ul> at the same position."""
    titles = section.find_elements(By.CLASS_NAME, _TITLE_CLASS)
    ul_elements = section.find_elements(By.TAG_NAME, "ul")
    for i, title in enumerate(titles[:2]):
        out.append(clean_text(title.text))
        if i < len(ul_elements):
            out.extend(_list_item_texts(ul_elements[i]))


# Handler and warning label for each section, by position on the page.
# Sections beyond the fifth are ignored.
_SECTION_HANDLERS = (
    (_append_header, "name/title/contact info"),
    (_append_profile, "profile section"),
    (_append_work_experience, "work experience section"),
    (_append_education, "education section"),
    (_append_languages_and_skills, "languages/skills section"),
)


def crawl_resume_content(url, category, page_load_wait=3):
    """Crawl one resume example page and return its text as a single string.

    Args:
        url: Address of the page to load.
        category: Accepted for call-site compatibility; not used by the crawl.
        page_load_wait: Seconds to sleep after navigation so the page can render.

    Returns:
        The extracted resume text, pieces joined by single spaces, or "" when
        the expected HTML structure is missing or any other error occurs.
        Errors are printed, never raised.
    """
    driver = None
    try:
        # Set up a headless Chrome WebDriver
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        driver.get(url)
        time.sleep(page_load_wait)  # Wait for page to load

        # Walk the nested containers down to the element holding the whole CV
        try:
            node = driver
            for class_name in _RESUME_CONTAINER_PATH:
                node = node.find_element(By.CLASS_NAME, class_name)
            entire_cv = node.find_element(By.ID, _ENTIRE_CV_ID)
        except NoSuchElementException as e:
            print(f"Error: Could not navigate HTML structure for {url}: {str(e)}")
            return ""

        resume_content = []
        section_containers = entire_cv.find_elements(By.CLASS_NAME, _SECTION_CLASS)

        # zip() stops at the shorter input: pages with fewer sections simply
        # run fewer handlers, and extra sections are skipped.
        for (handler, label), section in zip(_SECTION_HANDLERS, section_containers):
            try:
                handler(section, resume_content, url)
            except NoSuchElementException:
                # Anything the handler appended before the miss is kept.
                print(f"Warning: Could not find {label} for {url}")

        # Join all content with a single space
        return " ".join(resume_content)
    except Exception as e:
        print(f"Error crawling {url}: {str(e)}")
        return ""
    finally:
        # Always release the browser, including on unexpected errors, so
        # failed crawls do not leave Chrome processes running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Warning: Could not close browser for {url}: {e}")

In [6]:
# Crawl every resume listed in the input sheet and collect the results.
# The result lists are rebuilt here so that re-running this cell never appends
# duplicates to lists left over from an earlier run.
categories = []
resumes = []

for category, url in zip(df["Category"], df["Resume_link"]):
    # Sheet values can carry stray surrounding whitespace (e.g. "Data Analyst "),
    # which would otherwise end up as a distinct label in the saved data.
    if isinstance(category, str):
        category = category.strip()
    print(f"Crawling resume for {category}...")

    # Crawl resume content ("" is returned when the crawl fails)
    resume_content = crawl_resume_content(url, category)

    # Append to results
    categories.append(category)
    resumes.append(resume_content)

    # Small delay to avoid overwhelming the server
    time.sleep(1)

# Create a new DataFrame with the results
result_df = pd.DataFrame({
    "Category": categories,
    "Resume": resumes
})


Crawling resume for Web Developer...
Crawling resume for Software Developer...
Crawling resume for Programmer...
Crawling resume for Data Analyst ...
Crawling resume for Data Scientist...
Crawling resume for IT Manager...
Crawling resume for Software Engineer...
Crawling resume for SOC Analyst...
Crawling resume for Network Systems Analyst ...
Crawling resume for 3D Animator...
Crawling resume for Film and Video Editor...
Crawling resume for Computer Science...
Crawling resume for Google...
Crawling resume for Motion Graphics Artist...
Crawling resume for IT Project Manager...
Crawling resume for IT Director...
Crawling resume for Prompt Engineer...
Crawling resume for AI Engineer...
Crawling resume for DevOps Engineer...
Crawling resume for Network Engineer...
Crawling resume for Information Technology...
Crawling resume for Technical Project Manager...
Crawling resume for Scrum Master...
Crawling resume for Systems Analyst...
Crawling resume for Senior Software Engineer...
Crawling r

In [7]:
# Wrap the crawl results in a DataFrame used for inspection below
df_out = pd.DataFrame(data=result_df)

In [8]:
# Preview the first rows (head() defaults to five)
df_out.head()

Unnamed: 0,Category,Resume
0,Web Developer,Shane Gomez Web Developer shne.gmez127@gmail.c...
1,Software Developer,Jim Ryan Software Developer jimm_ryyn89@gmail....
2,Programmer,Taylor Cook Programmer tylor_cookk458@gmail.co...
3,Data Analyst,Barry Stevens Data Analyst bst_vensxc_88@gmail...
4,Data Scientist,Timothy Smith Data Scientist smit_htimthy_11@g...


In [9]:
# Save to Excel
output_file = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/NewITData/finalresumeiolink.xlsx"
result_df.to_excel(output_file, index=False)
# Report the path actually written; the previous message named a different file.
print(f"Crawling complete. Results saved to {output_file}")

Crawling complete. Results saved to Crawled_Resumes.xlsx
