In [1]:
import csv
import time
import os
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata
from textblob import TextBlob

In [2]:
# Khởi tạo trình duyệt Chrome
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized") 
options.add_argument("--disable-blink-features=AutomationControlled")  
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

In [3]:
# Đường dẫn để lưu các file SVG
svg_output_dir = "D:\BaiDoAnChuyenNganh3\Automated-Resume-Ranking-System-main\svg_files"
if not os.path.exists(svg_output_dir):
    os.makedirs(svg_output_dir)

In [4]:
# Đọc danh sách link từ file CSV
input_csv = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/cv1_links_cleaned.csv"
output_csv = "D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/final_cv1links_resumes.csv"
df = pd.read_csv(input_csv)

In [5]:
data = []

In [6]:
# Danh sách tiêu đề phổ biến
COMMON_HEADINGS = [
    "SUMMARY", "PROFESSIONAL SUMMARY", "OBJECTIVE", "CORE QUALIFICATIONS",
    "QUALIFICATIONS", "SKILLS", "TECHNICAL SKILLS", "EDUCATION", "WORK EXPERIENCE",
    "EXPERIENCE", "EMPLOYMENT HISTORY", "CERTIFICATIONS", "AFFILIATIONS", "PROJECTS",
    "PORTFOLIO", "PROFESSION", "LANGUAGES"
]

# Danh sách từ chuyên ngành không cần sửa
PRESERVED_WORDS = {
    "QuickBooks", "SAP", "ERP", "HTML", "CSS", "JavaScript", "PHP", "Python", "Ruby",
    "Java", "SQL", "MySQL", "Git", "Django", "XML", "UX", "CPA", "AICPA", "NYSCPA",
    "IMA", "IABA", "HTM", "Goodwill", "Rochester", "Chektowaga", "Buffalo"
}

def clean_text(text):
    # Chuẩn hóa ký tự Unicode
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    # Loại bỏ lặp lại ký tự liên tiếp (e.g., SSSS -> S)
    text = re.sub(r'(.)\1+', r'\1', text)
    # Loại bỏ khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text)
    
    # Tách từ bị dính liền trước khi sửa chính tả
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Tách "SumaryDetail" -> "Sumary Detail"
    
    # Sửa lỗi chính tả bằng TextBlob, nhưng bảo vệ từ chuyên ngành
    words = text.split()
    corrected_words = []
    for word in words:
        if word in PRESERVED_WORDS or any(word.upper().startswith(h) for h in COMMON_HEADINGS):
            corrected_words.append(word)  # Giữ nguyên từ chuyên ngành và tiêu đề
        else:
            blob = TextBlob(word)
            corrected_word = str(blob.correct())
            # Hậu xử lý: sửa lại các lỗi phổ biến mà TextBlob có thể sai
            if corrected_word.lower() == "of":  # Loại bỏ "of" thừa
                continue
            if corrected_word.lower() == "quickboks":
                corrected_word = "QuickBooks"
            corrected_words.append(corrected_word)
    
    text = " ".join(corrected_words)
    
    # Thêm khoảng cách sau dấu hai chấm nếu cần
    text = re.sub(r'(\w+):(\w+[a-zA-Z])', r'\1: \2', text)
    # Loại bỏ lặp lại từ trong toàn bộ văn bản
    text = re.sub(r'(\b\w+\b)(?:\s*\1)+', r'\1', text, flags=re.IGNORECASE)
    # Loại bỏ "of" thừa ở cuối dòng
    text = re.sub(r'\bof\b$', '', text, flags=re.IGNORECASE)
    return text.strip()

def group_text_elements(text_elements_with_coords, x_threshold=10, y_threshold=25):
    grouped_elements = []
    current_group = []
    last_x, last_y = None, None

    # Sắp xếp theo y trước, sau đó x
    text_elements_with_coords.sort(key=lambda k: (k[0], k[1]))

    for y, x, end_x, text in text_elements_with_coords:
        text = clean_text(text)
        is_heading = any(text.upper().startswith(heading) for heading in COMMON_HEADINGS)
        
        if not current_group:
            current_group.append((y, x, end_x, text))
            last_x, last_y = x, y
        else:
            # Tách dòng nếu là tiêu đề hoặc khoảng cách y lớn
            if is_heading or abs(y - last_y) >= y_threshold:
                if current_group:
                    grouped_elements.append(combine_group(current_group))
                current_group = [(y, x, end_x, text)]
            # Gộp nếu cùng dòng và khoảng cách x hợp lý
            elif abs(x - last_x) <= x_threshold:
                current_group.append((y, x, end_x, text))
            else:
                # Thêm khoảng trắng nếu cách xa nhưng cùng dòng
                if abs(x - last_x) > x_threshold and abs(y - last_y) < y_threshold:
                    current_group.append((y, last_x + 1, x - 1, " "))
                current_group.append((y, x, end_x, text))
            last_x, last_y = x, y

    if current_group:
        grouped_elements.append(combine_group(current_group))

    return grouped_elements

def combine_group(group):
    y = group[0][0]
    x = group[0][1]
    end_x = group[-1][2]
    text = "".join(item[3] for item in group)
    return (y, x, end_x, text)

def remove_duplicates(text):
    lines = text.split("\n")
    seen = set()
    unique_lines = []
    for line in lines:
        line = line.strip()
        if line and line not in seen:
            seen.add(line)
            unique_lines.append(line)
    return "\n".join(unique_lines)

def fetch_resume_content(url, idx):
    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        print(f"🔍 Đang truy cập: {url}")
        driver.get(url)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "svg"))
        )

        resume_text = "N/A"
        resume_html_path = "N/A"

        if url.endswith(".svg"):
            try:
                print(f"📄 Đang lấy nội dung file SVG: {url}")
                svg_element = driver.find_element(By.TAG_NAME, "svg")
                svg_html = svg_element.get_attribute("outerHTML")
                svg_filename = f"resume_{idx}.svg"
                svg_filepath = os.path.join(svg_output_dir, svg_filename)
                with open(svg_filepath, "w", encoding="utf-8") as svg_file:
                    svg_file.write(svg_html)
                resume_html_path = svg_filepath

                soup = BeautifulSoup(svg_html, "xml")
                text_elements = soup.find_all("text")
                
                text_elements_with_coords = []
                seen_texts = set()
                for element in text_elements:
                    transform = element.get("transform", "")
                    match = re.search(r"matrix\(([-0-1.\s]+)\s+([-0-1.\s]+)\s+([-0-1.\s]+)\s+([-0-1.\s]+)\s+(-?\d+\.?\d*)\s+(-?\d+\.?\d*)\)", transform)
                    if match:
                        x = float(match.group(5))
                        y = float(match.group(6))
                    else:
                        x = float(element.get("x", 0))
                        y = float(element.get("y", 0))
                        tspans = element.find_all("tspan")
                        if tspans and tspans[0].get("x"):
                            x = float(tspans[0].get("x", x))
                        if tspans and tspans[0].get("y"):
                            y = float(tspans[0].get("y", y))

                    tspans = element.find_all("tspan")
                    if tspans:
                        text = "".join(tspan.get_text() for tspan in tspans)
                        last_tspan = tspans[-1]
                        tspan_x_values = last_tspan.get("x", "0").split(",")
                        tspan_x = float(tspan_x_values[-1]) if tspan_x_values else 0
                        end_x = x + tspan_x + len(last_tspan.get_text()) * 5
                    else:
                        text = element.get_text()
                        end_x = x + len(text) * 5

                    if text and not re.search(r"@font-face|base64|font-family", text, re.IGNORECASE):
                        text_key = (round(y, 1), round(x, 1), text)
                        if text_key not in seen_texts:
                            seen_texts.add(text_key)
                            text_elements_with_coords.append((y, x, end_x, text))

                # Gộp các ký tự gần nhau
                grouped_elements = group_text_elements(text_elements_with_coords, x_threshold=10, y_threshold=25)

                resume_lines = []
                current_line = []
                current_y = None

                for y, x, end_x, text in grouped_elements:
                    text = clean_text(text)
                    is_heading = any(text.upper().startswith(heading) for heading in COMMON_HEADINGS)
                    if current_y is None or (abs(y - current_y) < 25 and not is_heading):
                        if current_line:
                            prev_end_x = current_line[-1][1]
                            gap = x - prev_end_x
                            if gap > 15:  # SPACE_THRESHOLD tăng lên 15
                                current_line.append((x, end_x, " "))
                        current_line.append((x, end_x, text))
                    else:
                        if current_line:
                            line_text = "".join(item[2] for item in sorted(current_line, key=lambda k: k[0]))
                            line_text = clean_text(line_text)
                            if line_text:
                                resume_lines.append(line_text)
                        current_line = [(x, end_x, text)]
                    current_y = y

                if current_line:
                    line_text = "".join(item[2] for item in sorted(current_line, key=lambda k: k[0]))
                    line_text = clean_text(line_text)
                    if line_text:
                        resume_lines.append(line_text)

                resume_text = "\n".join(resume_lines) if resume_lines else "N/A"
                resume_text = remove_duplicates(resume_text)

                print(f"📝 Nội dung văn bản (resume_str): {resume_text}")
                print(f"📝 Đường dẫn SVG (resume_html): {resume_html_path}")

            except Exception as e:
                print(f"❌ Không thể lấy dữ liệu từ SVG: {e}")
        else:
            try:
                for _ in range(3):
                    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
                    time.sleep(2)

                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By_TAG_NAME, "body"))
                )

                elements = driver.find_elements(By_TAG_NAME, "body") + \
                           driver.find_elements(By_TAG_NAME, "div") + \
                           driver.find_elements(By_TAG_NAME, "p")

                resume_text = "\n".join([clean_text(e.text.strip()) for e in elements if e.text.strip()])
                resume_text = remove_duplicates(resume_text)
                resume_html_path = "N/A"

            except Exception as e:
                print(f"⚠ Không tìm thấy nội dung trên {url}: {e}")

        return resume_text, resume_html_path

    except Exception as e:
        print(f"❌ Lỗi khi crawl {url}: {e}")
        return "N/A", "N/A"
# Duyệt qua từng hàng trong DataFrame
for index, row in df.iterrows():
    page_1_url = row['Page_1_Link']
    page_2_url = row['Page_2_Link']
    category = row['Category']
    
    text1, svg1 = fetch_resume_content(page_1_url, f"{index + 1}_1")
    text2, svg2 = fetch_resume_content(page_2_url, f"{index + 1}_2")
    
    resume_str = f"{text1}\n\n{text2}" if text1 != "N/A" and text2 != "N/A" else text1 if text1 != "N/A" else text2 if text2 != "N/A" else "N/A"
    resume_html = f"{svg1}; {svg2}" if svg1 != "N/A" and svg2 != "N/A" else svg1 if svg1 != "N/A" else svg2 if svg2 != "N/A" else "N/A"
    
    data.append([index + 1, resume_str, resume_html, category])
    print(f"✔ Đã lấy dữ liệu: {category}")

🔍 Đang truy cập: https://www.livecareer.com/lcapp/uploads/2023/02/accountant-example-CV-page-1.svg
📄 Đang lấy nội dung file SVG: https://www.livecareer.com/lcapp/uploads/2023/02/accountant-example-CV-page-1.svg
📝 Nội dung văn bản (resume_str): JESICA PHILIPS
(5) 5-5 | Orchard Dark, 14127 example@example.com (5) 5-5
Detail-tormented accountant with eight years effectively maintaining accurate Mary counting information for large-scale financial organizations. History Statement restructuring the format several critical and complicated balance shecount reconciliation decreased the likelihood errors and the time complete the reconciliation. Works closely with executive management complex mergers and acquisition and divestitures.
More Ensuit Quick Does Financial statements
Qualifications specialist expertise Tax Counting Face Quite Specialization Fidgeting SAP Expertise
MBA :Counting And Finance E
education Simon Business School -Rochester, ,06/2017
Bachelor Science :Counting University Roch

KeyboardInterrupt: 

In [None]:
# Đóng trình duyệt
driver.quit()

In [None]:
# Ghi vào file CSV
with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "resume_str", "resume_html", "category"])
    writer.writerows(data)

print(f"✅ Đã lưu kết quả vào {output_csv}")

✅ Đã lưu kết quả vào D:/BaiDoAnChuyenNganh3/Automated-Resume-Ranking-System-main/csvfiles/crawlcv/final_cv1links_resumes.csv
