In [None]:
the developing code 

In [3]:
import os
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    """Dispatch text extraction on the file extension (case-insensitive).

    Returns:
        For ``.pdf``: whatever ``extract_pdf_blocks`` returns.
        For ``.docx``/``.doc``: whatever ``extract_docx_exact_layout`` returns.

    Raises:
        ValueError: For any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_blocks(file_path)
    # NOTE(review): ".doc" is handed to the same DOCX reader -- confirm the
    # reader can actually open legacy .doc files.
    if suffix in (".docx", ".doc"):
        return extract_docx_exact_layout(file_path)
    raise ValueError("Unsupported file type. Use PDF or DOCX.")

def extract_docx_exact_layout(filepath):
    """Return every non-blank paragraph of a DOCX body, in document order.

    Walks the four nesting levels of ``docx2python(filepath).body`` and keeps
    each innermost string after stripping surrounding whitespace.
    """
    body = docx2python(filepath).body
    stripped = (
        para.strip()
        for outer in body
        for row in outer
        for cell in row
        for para in cell
    )
    return [text for text in stripped if text]

def extract_column_aware_blocks(pdf_path, column_gap=50):
    """Collect the text blocks of every page, split into two column groups.

    A block goes to the left group when its right edge (``x1``) is more than
    ``column_gap`` to the left of the page's vertical centre line; every other
    block goes to the right group. Each group is ordered by the block's top
    coordinate (``y0``), ties broken by the block text.

    Args:
        pdf_path: Path of the PDF to open with ``fitz``.
        column_gap: Margin subtracted from the centre line for the left-group
            test (same coordinate system as the block boxes).

    Returns:
        A flat list of stripped, non-empty block texts. For each page the
        right-group texts come first, followed by the left-group texts.
    """
    all_blocks = []
    # Context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            center_line = page.rect.width / 2
            left_col, right_col = [], []
            # No pre-sort needed: both groups are fully sorted below.
            for _x0, y0, x1, _y1, text, *_ in page.get_text("blocks", sort=True):
                target = left_col if x1 < center_line - column_gap else right_col
                target.append((y0, text.strip()))
            # NOTE(review): the right group is emitted before the left group;
            # kept as in the original -- confirm this reading order is intended.
            combined = [t for _, t in sorted(right_col)]
            combined += [t for _, t in sorted(left_col)]
            all_blocks.extend(t for t in combined if t)
    return all_blocks

def extract_pdf_blocks(pdf_path):
    """Extract PDF text by merging a markdown pass with a block-level pass.

    The markdown produced by ``pymupdf4llm.to_markdown`` is split on blank
    lines (pieces of 5 characters or fewer are dropped). Blocks returned by
    ``extract_column_aware_blocks`` that have no sufficiently similar markdown
    block are appended after the markdown blocks.

    Similarity is the Jaccard index over *word* tokens. The previous version
    built sets of characters, so any two longer texts shared nearly the whole
    alphabet and were treated as duplicates.

    Returns:
        ``(final_blocks, page_chunk_blocks)`` where ``final_blocks`` is the
        merged list described above and ``page_chunk_blocks`` holds the
        stripped, non-empty ``"text"`` of each dict chunk returned by
        ``to_markdown(..., page_chunks=True)``.
    """
    import re  # local import: ``re`` is not imported at the top of every cell

    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]

    def word_set(text):
        # Lower-cased word tokens; markdown markup such as ** or # is ignored.
        return frozenset(re.findall(r"\w+", text.lower()))

    def jaccard_similarity(tokens1, tokens2, threshold=0.7):
        union = tokens1 | tokens2
        if not union:
            return False
        return len(tokens1 & tokens2) / len(union) > threshold

    # Tokenise the markdown blocks once instead of once per comparison.
    md_token_sets = [word_set(block) for block in blocks_code2]
    missing_from_code2 = []
    for block1 in blocks_code1:
        tokens1 = word_set(block1)
        if not any(jaccard_similarity(tokens1, tokens2) for tokens2 in md_token_sets):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING
# ================================
# Sentence-embedding model, loaded once at module level and reused by
# semantic_heading_detection().
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Reference section titles that candidate lines are compared against.
# NOTE(review): "Core competencies" is the only entry not written in lower
# case, while candidate lines are lower-cased before encoding -- confirm the
# model treats case consistently.
canonical_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience","Core competencies"
]
# Embeddings of the reference titles, computed once up front.
canonical_embeddings = model.encode(canonical_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, threshold=0.6):
    """Return the lines that look like section headings.

    Each line is stripped and lower-cased, encoded with the module-level
    ``model`` and compared (cosine similarity) with ``canonical_embeddings``.
    Lines shorter than 3 characters after cleaning are ignored.

    Returns:
        The stripped original lines whose best similarity is at least
        ``threshold``, in input order.
    """
    def best_score(text):
        vector = model.encode(text, convert_to_tensor=True)
        return float(util.cos_sim(vector, canonical_embeddings)[0].max())

    matched = []
    for raw in lines:
        candidate = raw.strip()
        lowered = candidate.lower()
        if len(lowered) >= 3 and best_score(lowered) >= threshold:
            matched.append(candidate)
    return matched

def semantic_sectioning(lines, threshold=0.55):
    """Group lines into sections keyed by their detected heading.

    Headings come from ``semantic_heading_detection(lines, threshold)``. Every
    following non-heading line (stripped) is collected under the most recent
    heading and joined with newlines.

    Lines before the first heading are discarded, and a heading with no lines
    after it produces no entry. When the same heading text occurs more than
    once, the later content is appended to the earlier content instead of
    silently overwriting it.

    Returns:
        dict mapping heading text to the section's text.
    """
    # A set gives O(1) membership tests instead of scanning a list per line.
    detected_headings = set(semantic_heading_detection(lines, threshold))
    sections = {}

    def store(heading, collected):
        if not (heading and collected):
            return
        content = "\n".join(collected).strip()
        previous = sections.get(heading)
        # Merge with an earlier section of the same name rather than lose it.
        sections[heading] = f"{previous}\n{content}".strip() if previous else content

    current_section = None
    buffer = []
    for line in lines:
        text = line.strip()
        if text in detected_headings:
            store(current_section, buffer)
            current_section = text
            buffer = []
        else:
            buffer.append(text)
    store(current_section, buffer)
    return sections

# ================================
# SPA-CY FEATURE EXTRACTION
# ================================
# spaCy pipeline "en_core_web_md", loaded once at module level and used by
# the feature-extraction functions below.
nlp = spacy.load("en_core_web_md")

def filter_and_clean_noun_chunks(doc):
    """Return the distinct, lower-cased noun chunks of ``doc`` in first-seen order.

    A chunk is dropped when its stripped text has fewer than 2 characters or
    when every token in it is a stop word.
    """
    unique = {}  # dict used as an insertion-ordered set
    for span in doc.noun_chunks:
        key = span.text.strip().lower()
        if len(key) < 2 or key in unique:
            continue
        if any(not tok.is_stop for tok in span):
            unique[key] = None
    return list(unique)

def extract_section_spacy_features(sections):
    """Compute spaCy-based features for every section.

    Each section's text is flattened to one space-joined string (blank lines
    dropped) and parsed with the module-level ``nlp`` pipeline.

    Returns:
        dict mapping heading to a dict with the keys ``noun_chunks``,
        ``compounds``, ``verbal_nouns``, ``verbs`` and ``dates``.
    """
    features_by_heading = {}
    for heading, text in sections.items():
        pieces = [ln.strip() for ln in text.splitlines()]
        doc = nlp(" ".join(piece for piece in pieces if piece))

        # Noun chunks containing a token with a "compound" dependency,
        # lower-cased and de-duplicated in first-seen order.
        compounds = list(dict.fromkeys(
            span.text.strip().lower()
            for span in doc.noun_chunks
            if any(tok.dep_ == "compound" for tok in span)
        ))
        # Lemmas of non-stop-word verbs longer than one character.
        verbs = sorted({
            tok.lemma_
            for tok in doc
            if tok.pos_ == "VERB" and not tok.is_stop and len(tok.lemma_) > 1
        })
        # Tokens tagged VBG whose coarse part of speech is NOUN.
        verbal_nouns = sorted({
            tok.text for tok in doc if tok.tag_ == "VBG" and tok.pos_ == "NOUN"
        })
        dates = sorted({ent.text for ent in doc.ents if ent.label_ == "DATE"})

        features_by_heading[heading] = {
            "noun_chunks": filter_and_clean_noun_chunks(doc),
            "compounds": compounds,
            "verbal_nouns": verbal_nouns,
            "verbs": verbs,
            "dates": dates,
        }
    return features_by_heading

# ================================
# MAIN - ORDERED TAGGED EXTRACT
# ================================
def tag_section_features_in_order(sections, section_features):
    """Print every section line by line with inline feature tags.

    For each heading a banner is printed, then each stripped line of the
    section with every occurrence of a known feature phrase rewritten as
    ``[matched text|kind1,kind2]``, then a 60-character ``=`` rule.

    Tagging is done in a single regex pass per line. Longer phrases win over
    shorter ones starting at the same position, and text inside an inserted
    tag is never tagged again (the previous sequential substitutions produced
    nested tags such as ``[[data|...] lake|...]``).

    Args:
        sections: dict mapping heading to section text.
        section_features: dict mapping heading to a dict of feature lists
            keyed by 'noun_chunks', 'compounds', 'verbal_nouns', 'verbs' and
            'dates'; missing keys are treated as empty.

    Raises:
        KeyError: If a heading of ``sections`` is absent from
            ``section_features``.
    """
    import re  # function-scope import: the cell has no top-level ``import re``

    feat_types = ('noun_chunks', 'compounds', 'verbal_nouns', 'verbs', 'dates')
    for heading, text in sections.items():
        print(f"\n### {heading.upper()} ###\n")
        features = section_features[heading]

        # phrase (lower-case) -> feature kinds it belongs to, without repeats
        feat_map = {}
        for ft in feat_types:
            for val in features.get(ft, ()):
                key = val.lower().strip()
                if not key:
                    continue
                kinds = feat_map.setdefault(key, [])
                if ft not in kinds:
                    kinds.append(ft)

        # Longest phrases first so the alternation prefers the longest match.
        phrases = sorted(feat_map, key=len, reverse=True)
        tagger = None
        if phrases:
            alternatives = "|".join(f"({re.escape(p)})" for p in phrases)
            # Lookarounds instead of \b so phrases that start or end with a
            # non-word character (e.g. "c++") can still match as whole units.
            tagger = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)

        def add_tag(match):
            # Group i+1 corresponds to phrases[i].
            tags = ','.join(feat_map[phrases[match.lastindex - 1]])
            return f"[{match.group(0)}|{tags}]"

        for line in text.splitlines():
            line_str = line.strip()
            print(tagger.sub(add_tag, line_str) if tagger else line_str)
        print("=" * 60)

# Script entry point: run the whole pipeline on one hard-coded resume file
# and print the result of every stage.
if __name__ == "__main__":
    file_path = "3372246-student-nurse-resume-with-clinical-experience.pdf"
    extracted_output = extract_text(file_path)

    # The PDF path returns a (blocks, page_chunks) tuple; the DOCX path
    # returns just a list of lines.
    if isinstance(extracted_output, tuple):
        final_blocks, page_chunks = extracted_output
    else:
        final_blocks = extracted_output
        page_chunks = []

    print(f"\nParsed Output from: {os.path.basename(file_path)}")
    print("\n================ RAW PARSED TEXT BLOCKS ================\n")
    for block in final_blocks:
        print(block)
        print("-" * 40)

    # Stage 2: group the raw blocks under detected headings.
    print("\n================ SEMANTIC SECTIONING OUTPUT ================\n")
    sections = semantic_sectioning(final_blocks)
    for heading, content in sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n")
        print("=" * 60)

    # Stage 3: per-section spaCy features.
    print("\n================ SPA-CY FEATURES PER SECTION ================\n")
    section_features = extract_section_spacy_features(sections)
    for heading, feats in section_features.items():
        print(f"\n### {heading.upper()} ###")
        print("Noun Chunks: ", ", ".join(feats["noun_chunks"]))
        print("Compound Nouns: ", ", ".join(feats["compounds"]))
        print("Verbal Nouns: ", ", ".join(feats["verbal_nouns"]))
        print("Verbs: ", ", ".join(feats["verbs"]))
        print("Dates: ", ", ".join(feats["dates"]))
        print("=" * 60)

    # Stage 4: section text with the features tagged inline.
    print("\n================ ORDERED TAGGED SEMANTIC EXTRACT ================\n")
    tag_section_features_in_order(sections, section_features)



Parsed Output from: 6 word resume.docx


Sample Resume
----------------------------------------
SANJAY GOPAL
----------------------------------------
54 Dunster Street    Cambridge, MA 02138   555-555-5555    you@gmail.com
----------------------------------------
Project Director
----------------------------------------
Project Director with extensive leadership experience in highly competitive IT and Telecom industry. Proven track record of leading and managing multi-million-dollar international programs across northern Europe, Middle East, North America and South America.
----------------------------------------
Specialize in launching new services and products from concept to roll-out and building organizations from ground up. Expertise in improving team performance while securing customer loyalty and forging valuable relationships with internal and external partners.
----------------------------------------
Core Competencies
----------------------------------------
• 	Project/O

In [10]:
import os
import re
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    """Dispatch text extraction on the file extension (case-insensitive).

    Returns:
        For ``.pdf``: whatever ``extract_pdf_blocks`` returns.
        For ``.docx``/``.doc``: whatever ``extract_docx_sections`` returns.

    Raises:
        ValueError: For any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_blocks(file_path)
    # NOTE(review): ".doc" is handed to the same DOCX reader -- confirm the
    # reader can actually open legacy .doc files.
    if suffix in (".docx", ".doc"):
        return extract_docx_sections(file_path)
    raise ValueError("Unsupported file type. Use PDF or DOCX.")

# ================================
# IMPROVED DOCX SECTION EXTRACTION
# ================================
def extract_docx_sections(filepath):
    """Read a DOCX body and return its lines, grouped by upper-case headings.

    Paragraphs are taken from ``docx2python(filepath).body`` in order. A
    stripped paragraph that is all upper case and longer than 2 characters
    starts a new group; other non-empty paragraphs are collected under the
    current group. The groups are then flattened again into one list: the
    heading (when there is one) followed by the group's content split into
    lines.

    Returns:
        list of strings in document order.
    """
    body = docx2python(filepath).body
    paragraphs = (
        para.strip()
        for outer in body
        for row in outer
        for cell in row
        for para in cell
    )

    grouped = []      # (heading or None, joined content) pairs
    heading = None
    pending = []      # content lines waiting for the next heading
    for text in paragraphs:
        # Heuristic: an all-uppercase, non-trivial line is a section heading.
        if text.isupper() and len(text) > 2:
            if heading or pending:
                grouped.append((heading, '\n'.join(pending).strip()))
                pending = []
            heading = text
        elif text:
            pending.append(text)
    if heading or pending:
        grouped.append((heading, '\n'.join(pending).strip()))

    flattened = []
    for title, content in grouped:
        if title:
            flattened.append(title)
        if content:
            flattened += content.splitlines()
    return flattened

# ================================
# PDF PARSING (COLUMN-AWARE & MERGED)
# ================================
def extract_column_aware_blocks(pdf_path, column_gap=50):
    """Collect the text blocks of every page, split into two column groups.

    A block goes to the left group when its right edge (``x1``) is more than
    ``column_gap`` to the left of the page's vertical centre line; every other
    block goes to the right group. Each group is ordered by the block's top
    coordinate (``y0``), ties broken by the block text.

    Args:
        pdf_path: Path of the PDF to open with ``fitz``.
        column_gap: Margin subtracted from the centre line for the left-group
            test (same coordinate system as the block boxes).

    Returns:
        A flat list of stripped, non-empty block texts. For each page the
        right-group texts come first, followed by the left-group texts.
    """
    all_blocks = []
    # Context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            center_line = page.rect.width / 2
            left_col, right_col = [], []
            # No pre-sort needed: both groups are fully sorted below.
            for _x0, y0, x1, _y1, text, *_ in page.get_text("blocks", sort=True):
                target = left_col if x1 < center_line - column_gap else right_col
                target.append((y0, text.strip()))
            # NOTE(review): the right group is emitted before the left group;
            # kept as in the original -- confirm this reading order is intended.
            combined = [t for _, t in sorted(right_col)]
            combined += [t for _, t in sorted(left_col)]
            all_blocks.extend(t for t in combined if t)
    return all_blocks

def extract_pdf_blocks(pdf_path):
    """Extract PDF text by merging a markdown pass with a block-level pass.

    The markdown produced by ``pymupdf4llm.to_markdown`` is split on blank
    lines (pieces of 5 characters or fewer are dropped). Blocks returned by
    ``extract_column_aware_blocks`` that have no sufficiently similar markdown
    block are appended after the markdown blocks.

    Similarity is the Jaccard index over *word* tokens. The previous version
    built sets of characters, so any two longer texts shared nearly the whole
    alphabet and were treated as duplicates.

    Returns:
        ``(final_blocks, page_chunk_blocks)`` where ``final_blocks`` is the
        merged list described above and ``page_chunk_blocks`` holds the
        stripped, non-empty ``"text"`` of each dict chunk returned by
        ``to_markdown(..., page_chunks=True)``.
    """
    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]

    def word_set(text):
        # Lower-cased word tokens; markdown markup such as ** or # is ignored.
        return frozenset(re.findall(r"\w+", text.lower()))

    def jaccard_similarity(tokens1, tokens2, threshold=0.7):
        union = tokens1 | tokens2
        if not union:
            return False
        return len(tokens1 & tokens2) / len(union) > threshold

    # Tokenise the markdown blocks once instead of once per comparison.
    md_token_sets = [word_set(block) for block in blocks_code2]
    missing_from_code2 = []
    for block1 in blocks_code1:
        tokens1 = word_set(block1)
        if not any(jaccard_similarity(tokens1, tokens2) for tokens2 in md_token_sets):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING
# ================================
# Sentence-embedding model, loaded once at module level and reused by
# semantic_heading_detection().
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Reference section titles that candidate lines are compared against.
canonical_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience"
]
# Embeddings of the reference titles, computed once up front.
canonical_embeddings = model.encode(canonical_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, threshold=0.55):
    """Return the lines that look like section headings.

    Each line is stripped and lower-cased, encoded with the module-level
    ``model`` and compared (cosine similarity) with ``canonical_embeddings``.
    Lines shorter than 3 characters after cleaning are ignored.

    Returns:
        The stripped original lines whose best similarity is at least
        ``threshold``, in input order.
    """
    def best_score(text):
        vector = model.encode(text, convert_to_tensor=True)
        return float(util.cos_sim(vector, canonical_embeddings)[0].max())

    matched = []
    for raw in lines:
        candidate = raw.strip()
        lowered = candidate.lower()
        if len(lowered) >= 3 and best_score(lowered) >= threshold:
            matched.append(candidate)
    return matched

def semantic_sectioning(lines, threshold=0.55):
    """Group lines into sections keyed by their detected heading.

    Headings come from ``semantic_heading_detection(lines, threshold)``. Every
    following non-heading line (stripped) is collected under the most recent
    heading and joined with newlines.

    Lines before the first heading are discarded, and a heading with no lines
    after it produces no entry. When the same heading text occurs more than
    once, the later content is appended to the earlier content instead of
    silently overwriting it.

    Returns:
        dict mapping heading text to the section's text.
    """
    # A set gives O(1) membership tests instead of scanning a list per line.
    detected_headings = set(semantic_heading_detection(lines, threshold))
    sections = {}

    def store(heading, collected):
        if not (heading and collected):
            return
        content = "\n".join(collected).strip()
        previous = sections.get(heading)
        # Merge with an earlier section of the same name rather than lose it.
        sections[heading] = f"{previous}\n{content}".strip() if previous else content

    current_section = None
    buffer = []
    for line in lines:
        text = line.strip()
        if text in detected_headings:
            store(current_section, buffer)
            current_section = text
            buffer = []
        else:
            buffer.append(text)
    store(current_section, buffer)
    return sections

# ================================
# ENHANCED NOUN CHUNK EXTRACTION
# ================================
import spacy
# spaCy pipeline "en_core_web_md", loaded once at module level and used by
# extract_section_noun_chunks() below.
nlp = spacy.load("en_core_web_md")

def filter_and_clean_noun_chunks(doc):
    """Return the distinct, lower-cased noun chunks of ``doc`` in first-seen order.

    A chunk is dropped when its stripped text has fewer than 2 characters or
    when every token in it is a stop word.
    """
    unique = {}  # dict used as an insertion-ordered set
    for span in doc.noun_chunks:
        key = span.text.strip().lower()
        if len(key) < 2 or key in unique:
            continue
        if any(not tok.is_stop for tok in span):
            unique[key] = None
    return list(unique)

def extract_section_noun_chunks(sections):
    """Map each section heading to the cleaned noun chunks of its text.

    The section text is flattened to a single space-joined string (blank
    lines dropped), parsed with the module-level ``nlp`` pipeline and passed
    to ``filter_and_clean_noun_chunks``.
    """
    chunks_by_heading = {}
    for heading, text in sections.items():
        pieces = [ln.strip() for ln in text.splitlines()]
        parsed = nlp(" ".join(piece for piece in pieces if piece))
        chunks_by_heading[heading] = filter_and_clean_noun_chunks(parsed)
    return chunks_by_heading

# ================================
# DRIVER
# ================================
# Script entry point: run the pipeline on one hard-coded resume file and
# print the result of every stage.
if __name__ == "__main__":
    file_path = "C:\\Users\\zenit\\Downloads\\1 word resume.docx"
    extracted_output = extract_text(file_path)

    # The PDF path returns a (blocks, page_chunks) tuple; the DOCX path
    # returns just a list of lines.
    if isinstance(extracted_output, tuple):
        final_blocks, page_chunks = extracted_output
    else:
        final_blocks = extracted_output
        page_chunks = []

    print(f"\nParsed Output from: {os.path.basename(file_path)}")
    print("\n================ RAW PARSED TEXT BLOCKS ================\n")
    for block in final_blocks:
        print(block)
        print("-" * 40)

    # Stage 2: group the raw blocks under detected headings.
    print("\n================ SEMANTIC SECTIONING OUTPUT ================\n")
    sections = semantic_sectioning(final_blocks)

    for heading, content in sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n")
        print("=" * 60)

    # Stage 3: cleaned noun chunks of every section.
    print("\n================ NOUN CHUNKS PER SECTION ================\n")
    section_chunks = extract_section_noun_chunks(sections)
    for heading, chunks in section_chunks.items():
        print(f"\n### {heading.upper()} ###")
        print(", ".join(chunks))
        print("=" * 60)



Parsed Output from: 1 word resume.docx


Sample Resume
----------------------------------------
Jin Wang
----------------------------------------
email@gmail.com • (555) 555-5555
----------------------------------------
Education
----------------------------------------
Harvard University, Extension School
----------------------------------------
Master of Liberal Arts, Information Management Systems GPA 4.0
----------------------------------------
--	Class Marshall Award
----------------------------------------
--	Dean’s List Academic Achievement Award
----------------------------------------
--	Data Science Project: Financial Market Analysis Using Machine Learning
----------------------------------------
--	Capstone Project: Enterprise Data Lake
----------------------------------------
University of Malaya
----------------------------------------
Bachelor of Computer Science
----------------------------------------
Technical Skills
----------------------------------------
Harvard Un

In [2]:
import os
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    """Dispatch text extraction on the file extension (case-insensitive).

    Returns:
        For ``.pdf``: whatever ``extract_pdf_blocks`` returns.
        For ``.docx``/``.doc``: the line-by-line, order-preserving list from
        ``extract_docx_exact_layout``.

    Raises:
        ValueError: For any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_blocks(file_path)
    # NOTE(review): ".doc" is handed to the same DOCX reader -- confirm the
    # reader can actually open legacy .doc files.
    if suffix in (".docx", ".doc"):
        return extract_docx_exact_layout(file_path)
    raise ValueError("Unsupported file type. Use PDF or DOCX.")

# ================================
# DOCX LINE-BY-LINE ORDER-PRESERVING EXTRACTION
# ================================
def extract_docx_exact_layout(filepath):
    """Return every non-blank paragraph of a DOCX body, in document order.

    Walks the four nesting levels of ``docx2python(filepath).body`` and keeps
    each innermost string after stripping surrounding whitespace.
    """
    body = docx2python(filepath).body
    stripped = (
        para.strip()
        for outer in body
        for row in outer
        for cell in row
        for para in cell
    )
    return [text for text in stripped if text]

# ================================
# PDF PARSING (COLUMN-AWARE & MERGED)
# ================================
def extract_column_aware_blocks(pdf_path, column_gap=50):
    """Collect the text blocks of every page, split into two column groups.

    A block goes to the left group when its right edge (``x1``) is more than
    ``column_gap`` to the left of the page's vertical centre line; every other
    block goes to the right group. Each group is ordered by the block's top
    coordinate (``y0``), ties broken by the block text.

    Args:
        pdf_path: Path of the PDF to open with ``fitz``.
        column_gap: Margin subtracted from the centre line for the left-group
            test (same coordinate system as the block boxes).

    Returns:
        A flat list of stripped, non-empty block texts. For each page the
        right-group texts come first, followed by the left-group texts.
    """
    all_blocks = []
    # Context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            center_line = page.rect.width / 2
            left_col, right_col = [], []
            # No pre-sort needed: both groups are fully sorted below.
            for _x0, y0, x1, _y1, text, *_ in page.get_text("blocks", sort=True):
                target = left_col if x1 < center_line - column_gap else right_col
                target.append((y0, text.strip()))
            # NOTE(review): the right group is emitted before the left group;
            # kept as in the original -- confirm this reading order is intended.
            combined = [t for _, t in sorted(right_col)]
            combined += [t for _, t in sorted(left_col)]
            all_blocks.extend(t for t in combined if t)
    return all_blocks

def extract_pdf_blocks(pdf_path):
    """Extract PDF text by merging a markdown pass with a block-level pass.

    The markdown produced by ``pymupdf4llm.to_markdown`` is split on blank
    lines (pieces of 5 characters or fewer are dropped). Blocks returned by
    ``extract_column_aware_blocks`` that have no sufficiently similar markdown
    block are appended after the markdown blocks.

    Similarity is the Jaccard index over *word* tokens. The previous version
    built sets of characters, so any two longer texts shared nearly the whole
    alphabet and were treated as duplicates.

    Returns:
        ``(final_blocks, page_chunk_blocks)`` where ``final_blocks`` is the
        merged list described above and ``page_chunk_blocks`` holds the
        stripped, non-empty ``"text"`` of each dict chunk returned by
        ``to_markdown(..., page_chunks=True)``.
    """
    import re  # local import: this cell has no top-level ``import re``

    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]

    def word_set(text):
        # Lower-cased word tokens; markdown markup such as ** or # is ignored.
        return frozenset(re.findall(r"\w+", text.lower()))

    def jaccard_similarity(tokens1, tokens2, threshold=0.7):
        union = tokens1 | tokens2
        if not union:
            return False
        return len(tokens1 & tokens2) / len(union) > threshold

    # Tokenise the markdown blocks once instead of once per comparison.
    md_token_sets = [word_set(block) for block in blocks_code2]
    missing_from_code2 = []
    for block1 in blocks_code1:
        tokens1 = word_set(block1)
        if not any(jaccard_similarity(tokens1, tokens2) for tokens2 in md_token_sets):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING
# ================================
# Sentence-embedding model, loaded once at module level and reused by
# semantic_heading_detection().
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Reference section titles that candidate lines are compared against.
canonical_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience","core competencies"
]
# Embeddings of the reference titles, computed once up front.
canonical_embeddings = model.encode(canonical_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, threshold=0.6):
    """Return the lines that look like section headings.

    Each line is stripped and lower-cased, encoded with the module-level
    ``model`` and compared (cosine similarity) with ``canonical_embeddings``.
    Lines shorter than 3 characters after cleaning are ignored.

    Returns:
        The stripped original lines whose best similarity is at least
        ``threshold``, in input order.
    """
    def best_score(text):
        vector = model.encode(text, convert_to_tensor=True)
        return float(util.cos_sim(vector, canonical_embeddings)[0].max())

    matched = []
    for raw in lines:
        candidate = raw.strip()
        lowered = candidate.lower()
        if len(lowered) >= 3 and best_score(lowered) >= threshold:
            matched.append(candidate)
    return matched

def semantic_sectioning(lines, threshold=0.55):
    """Group lines into sections keyed by their detected heading.

    Headings come from ``semantic_heading_detection(lines, threshold)``. Every
    following non-heading line (stripped) is collected under the most recent
    heading and joined with newlines.

    Lines before the first heading are discarded, and a heading with no lines
    after it produces no entry. When the same heading text occurs more than
    once, the later content is appended to the earlier content instead of
    silently overwriting it.

    Returns:
        dict mapping heading text to the section's text.
    """
    # A set gives O(1) membership tests instead of scanning a list per line.
    detected_headings = set(semantic_heading_detection(lines, threshold))
    sections = {}

    def store(heading, collected):
        if not (heading and collected):
            return
        content = "\n".join(collected).strip()
        previous = sections.get(heading)
        # Merge with an earlier section of the same name rather than lose it.
        sections[heading] = f"{previous}\n{content}".strip() if previous else content

    current_section = None
    buffer = []
    for line in lines:
        text = line.strip()
        if text in detected_headings:
            store(current_section, buffer)
            current_section = text
            buffer = []
        else:
            buffer.append(text)
    store(current_section, buffer)
    return sections

# ================================
# ENHANCED NOUN CHUNK EXTRACTION
# ================================
# spaCy pipeline "en_core_web_md", loaded once at module level and used by
# extract_features_per_section() below.
nlp = spacy.load("en_core_web_md")

def extract_features_per_section(sections):
    """Compute sorted spaCy-based feature lists for every section.

    Each section's text is flattened to one space-joined string (blank lines
    dropped) and parsed with the module-level ``nlp`` pipeline.

    Returns:
        dict mapping heading to a dict with these keys, each a sorted list of
        distinct lower-cased strings:

        * ``nouns``: tokens whose part of speech is NOUN.
        * ``noun_chunks``: all noun chunks.
        * ``compound_nouns``: noun chunks with more than one token that is a
          NOUN or has a "compound" dependency.
        * ``verbs``: lemmas of VERB tokens.
        * ``verbal_nouns``: tokens tagged VBG, plus NOUN tokens whose lemma,
          parsed on its own, comes out as a VERB.
    """
    # Parsing a lemma on its own runs the whole pipeline, so remember the
    # answer per lemma instead of re-parsing it for every occurrence.
    lemma_is_verb = {}

    def parses_as_verb(lemma):
        if lemma not in lemma_is_verb:
            probe = nlp(lemma)
            # An empty probe doc (e.g. empty lemma) simply counts as "no".
            lemma_is_verb[lemma] = len(probe) > 0 and probe[0].pos_ == "VERB"
        return lemma_is_verb[lemma]

    section_features = {}
    for heading, text in sections.items():
        flat = " ".join(line.strip() for line in text.splitlines() if line.strip())
        doc = nlp(flat)

        nouns = {token.text.strip().lower() for token in doc if token.pos_ == "NOUN"}
        noun_chunks = {chunk.text.strip().lower() for chunk in doc.noun_chunks}
        compound_nouns = {
            chunk.text.strip().lower()
            for chunk in doc.noun_chunks
            if sum(t.pos_ == "NOUN" or t.dep_ == "compound" for t in chunk) > 1
        }
        verbs = {token.lemma_.strip().lower() for token in doc if token.pos_ == "VERB"}
        verbal_nouns = {
            token.text.strip().lower()
            for token in doc
            if token.tag_ == "VBG"
            or (token.pos_ == "NOUN" and parses_as_verb(token.lemma_))
        }

        section_features[heading] = {
            "nouns": sorted(nouns),
            "noun_chunks": sorted(noun_chunks),
            "compound_nouns": sorted(compound_nouns),
            "verbs": sorted(verbs),
            "verbal_nouns": sorted(verbal_nouns),
        }
    return section_features

# ================================
# DRIVER
# ================================
# Script entry point: run the pipeline on one hard-coded resume file and
# print the result of every stage.
if __name__ == "__main__":
    file_path = "C:\\Users\\zenit\\Downloads\\9 word resume.docx"
    extracted_output = extract_text(file_path)

    # The PDF path returns a (blocks, page_chunks) tuple; the DOCX path
    # returns just a list of lines.
    if isinstance(extracted_output, tuple):
        final_blocks, page_chunks = extracted_output
    else:
        final_blocks = extracted_output
        page_chunks = []

    print(f"\nParsed Output from: {os.path.basename(file_path)}")
    print("\n================ RAW PARSED TEXT BLOCKS ================\n")
    for block in final_blocks:
        print(block)
        print("-" * 40)

    # Stage 2: group the raw blocks under detected headings.
    print("\n================ SEMANTIC SECTIONING OUTPUT ================\n")
    sections = semantic_sectioning(final_blocks)
    for heading, content in sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n")
        print("=" * 60)

    # Stage 3: per-section spaCy features.
    print("\n================ SPACY FEATURES PER SECTION ================\n")
    section_features = extract_features_per_section(sections)
    for heading, feats in section_features.items():
        print(f"\n### {heading.upper()} ###")
        print("Nouns: " + ", ".join(feats["nouns"]))
        print("Noun Chunks: " + ", ".join(feats["noun_chunks"]))
        print("Compound Nouns: " + ", ".join(feats["compound_nouns"]))
        print("Verbs: " + ", ".join(feats["verbs"]))
        print("Verbal Nouns: " + ", ".join(feats["verbal_nouns"]))
        print("=" * 60)



Parsed Output from: 9 word resume.docx


Sample Resume
----------------------------------------
Georgina Santiago
----------------------------------------
Cambridge, MA / 555-555-5555 / you@gmail.com / www.linkedin.com/in/profile
----------------------------------------
EDUCATION
----------------------------------------
Harvard University Extension School
----------------------------------------
Cambridge, MA
----------------------------------------
Bachelor of Liberal Arts, Field of Study Economics
----------------------------------------
Cum Laude, Dean’s List, GPA 3.62
----------------------------------------
Worked up to 40+ hours a week to defray cost of tuition
----------------------------------------
May 2016
----------------------------------------
EXPERIENCE
----------------------------------------
Hangtime Wholesale Wine Company 	Boston, MA
----------------------------------------
Sales Representative 	20XX-Present
----------------------------------------
Opened and maintain

In [None]:
llm verify 

In [6]:
import os
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    """Dispatch text extraction on the file extension (case-insensitive).

    Returns:
        For ``.pdf``: whatever ``extract_pdf_blocks`` returns.
        For ``.docx``/``.doc``: the line-by-line, order-preserving list from
        ``extract_docx_exact_layout``.

    Raises:
        ValueError: For any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_blocks(file_path)
    # NOTE(review): ".doc" is handed to the same DOCX reader -- confirm the
    # reader can actually open legacy .doc files.
    if suffix in (".docx", ".doc"):
        return extract_docx_exact_layout(file_path)
    raise ValueError("Unsupported file type. Use PDF or DOCX.")

# ================================
# DOCX LINE-BY-LINE ORDER-PRESERVING EXTRACTION
# ================================
def extract_docx_exact_layout(filepath):
    """Return every non-blank paragraph of a DOCX body, in document order.

    Walks the four nesting levels of ``docx2python(filepath).body`` and keeps
    each innermost string after stripping surrounding whitespace.
    """
    body = docx2python(filepath).body
    stripped = (
        para.strip()
        for outer in body
        for row in outer
        for cell in row
        for para in cell
    )
    return [text for text in stripped if text]

# ================================
# PDF PARSING (COLUMN-AWARE & MERGED)
# ================================
def extract_column_aware_blocks(pdf_path, column_gap=50):
    """Collect the text blocks of every page, split into two column groups.

    A block goes to the left group when its right edge (``x1``) is more than
    ``column_gap`` to the left of the page's vertical centre line; every other
    block goes to the right group. Each group is ordered by the block's top
    coordinate (``y0``), ties broken by the block text.

    Args:
        pdf_path: Path of the PDF to open with ``fitz``.
        column_gap: Margin subtracted from the centre line for the left-group
            test (same coordinate system as the block boxes).

    Returns:
        A flat list of stripped, non-empty block texts. For each page the
        right-group texts come first, followed by the left-group texts.
    """
    all_blocks = []
    # Context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            center_line = page.rect.width / 2
            left_col, right_col = [], []
            # No pre-sort needed: both groups are fully sorted below.
            for _x0, y0, x1, _y1, text, *_ in page.get_text("blocks", sort=True):
                target = left_col if x1 < center_line - column_gap else right_col
                target.append((y0, text.strip()))
            # NOTE(review): the right group is emitted before the left group;
            # kept as in the original -- confirm this reading order is intended.
            combined = [t for _, t in sorted(right_col)]
            combined += [t for _, t in sorted(left_col)]
            all_blocks.extend(t for t in combined if t)
    return all_blocks

def extract_pdf_blocks(pdf_path):
    """Extract PDF text by merging a markdown pass with a block-level pass.

    The markdown produced by ``pymupdf4llm.to_markdown`` is split on blank
    lines (pieces of 5 characters or fewer are dropped). Blocks returned by
    ``extract_column_aware_blocks`` that have no sufficiently similar markdown
    block are appended after the markdown blocks.

    Similarity is the Jaccard index over *word* tokens. The previous version
    built sets of characters, so any two longer texts shared nearly the whole
    alphabet and were treated as duplicates.

    Returns:
        ``(final_blocks, page_chunk_blocks)`` where ``final_blocks`` is the
        merged list described above and ``page_chunk_blocks`` holds the
        stripped, non-empty ``"text"`` of each dict chunk returned by
        ``to_markdown(..., page_chunks=True)``.
    """
    import re  # local import: this cell has no top-level ``import re``

    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]

    def word_set(text):
        # Lower-cased word tokens; markdown markup such as ** or # is ignored.
        return frozenset(re.findall(r"\w+", text.lower()))

    def jaccard_similarity(tokens1, tokens2, threshold=0.7):
        union = tokens1 | tokens2
        if not union:
            return False
        return len(tokens1 & tokens2) / len(union) > threshold

    # Tokenise the markdown blocks once instead of once per comparison.
    md_token_sets = [word_set(block) for block in blocks_code2]
    missing_from_code2 = []
    for block1 in blocks_code1:
        tokens1 = word_set(block1)
        if not any(jaccard_similarity(tokens1, tokens2) for tokens2 in md_token_sets):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING
# ================================
# Sentence-embedding model, loaded once at module level and reused by
# semantic_heading_detection().
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Reference section titles that candidate lines are compared against.
# NOTE(review): "Core competencies" is the only entry not written in lower
# case, while candidate lines are lower-cased before encoding -- confirm the
# model treats case consistently.
canonical_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience","Core competencies"
]
# Embeddings of the reference titles, computed once up front.
canonical_embeddings = model.encode(canonical_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, threshold=0.6):
    """Return the lines that look like section headings.

    A line is a heading when the cosine similarity between its embedding and
    the closest entry of ``canonical_embeddings`` is at least ``threshold``.

    Args:
        lines: Iterable of text lines/blocks.
        threshold: Minimum best-match cosine similarity for a heading.

    Returns:
        List of stripped heading lines, in input order. Lines shorter than
        three characters (after stripping and lower-casing) are ignored.
    """
    candidates = []
    queries = []
    for line in lines:
        lowered = line.strip().lower()
        if len(lowered) < 3:
            continue
        candidates.append(line.strip())
        queries.append(lowered)

    # Nothing to score: avoid calling the encoder with an empty batch.
    if not queries:
        return []

    # Encode every candidate in one batch instead of one model call per line.
    embeddings = model.encode(queries, convert_to_tensor=True)
    best_scores = util.cos_sim(embeddings, canonical_embeddings).max(dim=1).values
    return [
        candidate
        for candidate, score in zip(candidates, best_scores.tolist())
        if score >= threshold
    ]

def semantic_sectioning(lines, threshold=0.55):
    """Group lines into sections keyed by their detected heading.

    Args:
        lines: Text blocks in document order. Iterated twice, so pass a
            re-iterable container such as a list.
        threshold: Similarity threshold forwarded to
            ``semantic_heading_detection``.

    Returns:
        Dict mapping each heading to the newline-joined, stripped lines that
        follow it. Lines before the first heading are discarded and headings
        with no following lines are omitted.
    """
    # A set gives an O(1) heading test per line instead of a list scan.
    heading_set = set(semantic_heading_detection(lines, threshold))
    sections = {}
    current_section = None
    buffer = []

    def store(heading, collected):
        if not heading or not collected:
            return
        body = "\n".join(collected).strip()
        previous = sections.get(heading)
        # The same heading text can occur more than once; append to the
        # earlier content instead of silently overwriting it.
        sections[heading] = f"{previous}\n{body}".strip() if previous else body

    for line in lines:
        stripped = line.strip()
        if stripped in heading_set:
            store(current_section, buffer)
            current_section = stripped
            buffer = []
        else:
            buffer.append(stripped)

    store(current_section, buffer)
    return sections

# ================================
# ENHANCED NOUN CHUNK CLEANING
# ================================
# spaCy English pipeline shared by the feature-extraction code that follows.
nlp = spacy.load("en_core_web_md")

def filter_and_clean_noun_chunks(doc):
    """Return the distinct, lower-cased noun chunks of ``doc``.

    Chunks shorter than two characters, chunks made only of stop-word tokens
    and repeated chunks are dropped; first-seen order is kept.

    Args:
        doc: Object exposing ``noun_chunks``; each chunk has ``.text`` and
            iterates over tokens with an ``is_stop`` flag.

    Returns:
        List of cleaned chunk strings.
    """
    # A dict is used as an insertion-ordered set.
    unique = {}
    for span in doc.noun_chunks:
        key = span.text.strip().lower()
        if len(key) < 2 or key in unique:
            continue
        if any(not tok.is_stop for tok in span):
            unique[key] = None
    return list(unique)

# ================================
# VERBS, NOUN CHUNKS, COMPOUND NOUNS, GERUNDS, DATES PER SECTION
# ================================
def extract_section_spacy_features(sections):
    """Run the spaCy pipeline on each section and collect linguistic features.

    Args:
        sections: Dict mapping heading to section text.

    Returns:
        Dict mapping heading to a dict with the keys ``noun_chunks``,
        ``compounds``, ``verbal_nouns``, ``verbs`` and ``dates``, each a list
        of strings.
    """
    results = {}
    for heading, text in sections.items():
        # Collapse the section into one line so sentences are not cut at
        # line breaks.
        stripped_lines = (ln.strip() for ln in text.splitlines())
        doc = nlp(" ".join(ln for ln in stripped_lines if ln))

        # Noun chunks that contain at least one "compound" dependency,
        # de-duplicated in first-seen order.
        compound_chunks = []
        for span in doc.noun_chunks:
            if not any(tok.dep_ == "compound" for tok in span):
                continue
            lowered = span.text.strip().lower()
            if lowered not in compound_chunks:
                compound_chunks.append(lowered)

        # Lemmas of non-stop-word verbs, at least two characters long.
        verb_lemmas = {
            tok.lemma_
            for tok in doc
            if tok.pos_ == "VERB" and not tok.is_stop and len(tok.lemma_) > 1
        }
        # Tokens tagged VBG whose coarse part of speech is NOUN.
        gerunds = {tok.text for tok in doc if tok.tag_ == "VBG" and tok.pos_ == "NOUN"}
        # Named entities labelled DATE.
        date_texts = {ent.text for ent in doc.ents if ent.label_ == "DATE"}

        results[heading] = {
            "noun_chunks": filter_and_clean_noun_chunks(doc),
            "compounds": compound_chunks,
            "verbal_nouns": sorted(gerunds),
            "verbs": sorted(verb_lemmas),
            "dates": sorted(date_texts),
        }
    return results

# ================================
# DRIVER
# ================================
if __name__ == "__main__":
    file_path = "9 word resume.pdf"
    extracted_output = extract_text(file_path)

    # The PDF path returns (blocks, page_chunks); the DOCX path returns only
    # the list of blocks, so both shapes are normalised here.
    if not isinstance(extracted_output, tuple):
        final_blocks, page_chunks = extracted_output, []
    else:
        final_blocks, page_chunks = extracted_output

    print(f"\nParsed Output from: {os.path.basename(file_path)}")
    print("\n================ RAW PARSED TEXT BLOCKS ================\n")
    for block in final_blocks:
        print(block, "-" * 40, sep="\n")

    print("\n================ SEMANTIC SECTIONING OUTPUT ================\n")
    sections = semantic_sectioning(final_blocks)
    for heading, content in sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n", "=" * 60, sep="\n")

    print("\n================ SPA-CY FEATURES PER SECTION ================\n")
    section_features = extract_section_spacy_features(sections)
    # (printed label, feature key) pairs, in display order.
    feature_labels = (
        ("Noun Chunks: ", "noun_chunks"),
        ("Compound Nouns: ", "compounds"),
        ("Verbal Nouns: ", "verbal_nouns"),
        ("Verbs: ", "verbs"),
        ("Dates: ", "dates"),
    )
    for heading, feats in section_features.items():
        print(f"\n### {heading.upper()} ###")
        for label, key in feature_labels:
            print(label, ", ".join(feats[key]))
        print("=" * 60)



Parsed Output from: 9 word resume.pdf


# Sample Resume
----------------------------------------
### **Georgina Santiago**
----------------------------------------
Cambridge, MA / 555-555-5555 / you@gmail.com / www.linkedin.com/in/profile
----------------------------------------
**EDUCATION**
**Harvard University Extension School** Cambridge, MA
Bachelor of Liberal Arts, Field of Study Economics May 2016
Cum Laude, Dean’s List, GPA 3.62
Worked up to 40+ hours a week to defray cost of tuition
----------------------------------------
**EXPERIENCE**
----------------------------------------
**Hangtime Wholesale Wine Company** Boston, MA
**Sales Representative** 20XX-Present
Opened and maintain 40 accounts in the greater Boston area. Conduct in-store tastings and staff
trainings to generate greater revenue. Create and distribute promotional materials.
----------------------------------------
**Christie’s Auction House** New York, NY
**Intern, Fine and Rare Wine Department** 20XX
Performed 

In [None]:
new code for spacy test 1

In [14]:
import os
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    """Dispatch to the extractor that matches the file extension.

    Args:
        file_path: Path of the document; the extension is matched
            case-insensitively.

    Returns:
        Whatever the chosen extractor returns: ``extract_pdf_blocks`` for
        ``.pdf``, ``extract_docx_exact_layout`` for ``.docx`` / ``.doc``.

    Raises:
        ValueError: If the extension is none of the above.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_blocks(file_path)
    if suffix in (".docx", ".doc"):
        return extract_docx_exact_layout(file_path)
    raise ValueError("Unsupported file type. Use PDF or DOCX.")

def extract_docx_exact_layout(filepath):
    """Return every non-empty paragraph of a DOCX body, in document order.

    Args:
        filepath: Path handed to ``docx2python``.

    Returns:
        List of stripped paragraph strings; blank paragraphs are skipped.
    """
    # ``body`` is walked as four nested levels; the loop names follow the
    # section -> row -> cell -> paragraph reading used by this code.
    body = docx2python(filepath).body
    stripped = (
        para.strip()
        for section in body
        for row in section
        for cell in row
        for para in cell
    )
    return [para for para in stripped if para]

def extract_column_aware_blocks(pdf_path, column_gap=50):
    """Collect the text blocks of every page, grouped by page half.

    A block whose right edge lies more than ``column_gap`` page units left of
    the page centre goes to the "left" group; every other block (including
    full-width ones) goes to the "right" group. Each group is ordered by the
    block's top coordinate.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        column_gap: Margin subtracted from the page centre line when deciding
            whether a block belongs to the left group.

    Returns:
        List of non-empty, stripped block texts for the whole document.
    """
    all_blocks = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            center_line = page.rect.width / 2
            left_col, right_col = [], []
            for b in page.get_text("blocks", sort=True):
                _, y0, x1, _, text = b[:5]
                # The 7th item is the block type; non-zero marks an image
                # block whose "text" is only an "<image: ...>" placeholder.
                if len(b) > 6 and b[6] != 0:
                    continue
                target = left_col if x1 < center_line - column_gap else right_col
                target.append((y0, text.strip()))
            # Sorting the (y0, text) pairs fully determines the order, so no
            # separate pre-sort of the raw blocks is needed.
            left_col_sorted = [t for _, t in sorted(left_col)]
            right_col_sorted = [t for _, t in sorted(right_col)]
            # NOTE(review): the right group is emitted before the left group,
            # as in the original code - confirm this is the intended order.
            combined = right_col_sorted + left_col_sorted
            all_blocks.extend(t for t in combined if t)
    finally:
        # Release the file handle even if extraction fails part-way.
        doc.close()
    return all_blocks

def extract_pdf_blocks(pdf_path):
    """Extract text blocks from a PDF by merging two extraction passes.

    The markdown rendering produced by ``pymupdf4llm`` is the primary source.
    Blocks found by ``extract_column_aware_blocks`` that have no close match
    in the markdown output are appended after it so that text is not lost.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(final_blocks, page_chunk_blocks)`` tuple: the merged list of
        block strings, and the stripped ``"text"`` of every non-empty page
        chunk returned by ``pymupdf4llm.to_markdown(..., page_chunks=True)``.
    """
    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]

    def tokenize(text):
        # Keep only alphanumeric runs so markdown markup (#, **, bullets) and
        # punctuation do not make otherwise identical blocks look different.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return set(cleaned.split())

    def jaccard_similarity(set1, set2, threshold=0.7):
        # Word-level Jaccard. Comparing sets of *characters* (as was done
        # before) rates almost any two long English blocks as similar, because
        # they share most of the alphabet, so missing blocks were never kept.
        union = set1 | set2
        if not union:
            return False
        return len(set1 & set2) / len(union) > threshold

    # Tokenise the markdown blocks once instead of once per comparison.
    md_token_sets = [tokenize(block2) for block2 in blocks_code2]

    missing_from_code2 = []
    for block1 in blocks_code1:
        tokens1 = tokenize(block1)
        if not any(jaccard_similarity(tokens1, tokens2) for tokens2 in md_token_sets):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING
# ================================
# Sentence-embedding model used to compare candidate lines with known headings.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Reference section titles; semantic_heading_detection treats a line as a
# heading when its embedding is close enough to any of these.
canonical_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience","Core competencies"
]
# Embeddings of the reference titles, computed once when this code runs.
canonical_embeddings = model.encode(canonical_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, threshold=0.6):
    """Return the lines that look like section headings.

    A line is a heading when the cosine similarity between its embedding and
    the closest entry of ``canonical_embeddings`` is at least ``threshold``.

    Args:
        lines: Iterable of text lines/blocks.
        threshold: Minimum best-match cosine similarity for a heading.

    Returns:
        List of stripped heading lines, in input order. Lines shorter than
        three characters (after stripping and lower-casing) are ignored.
    """
    candidates = []
    queries = []
    for line in lines:
        lowered = line.strip().lower()
        if len(lowered) < 3:
            continue
        candidates.append(line.strip())
        queries.append(lowered)

    # Nothing to score: avoid calling the encoder with an empty batch.
    if not queries:
        return []

    # Encode every candidate in one batch instead of one model call per line.
    embeddings = model.encode(queries, convert_to_tensor=True)
    best_scores = util.cos_sim(embeddings, canonical_embeddings).max(dim=1).values
    return [
        candidate
        for candidate, score in zip(candidates, best_scores.tolist())
        if score >= threshold
    ]

def semantic_sectioning(lines, threshold=0.55):
    """Group lines into sections keyed by their detected heading.

    Args:
        lines: Text blocks in document order. Iterated twice, so pass a
            re-iterable container such as a list.
        threshold: Similarity threshold forwarded to
            ``semantic_heading_detection``.

    Returns:
        Dict mapping each heading to the newline-joined, stripped lines that
        follow it. Lines before the first heading are discarded and headings
        with no following lines are omitted.
    """
    # A set gives an O(1) heading test per line instead of a list scan.
    heading_set = set(semantic_heading_detection(lines, threshold))
    sections = {}
    current_section = None
    buffer = []

    def store(heading, collected):
        if not heading or not collected:
            return
        body = "\n".join(collected).strip()
        previous = sections.get(heading)
        # The same heading text can occur more than once; append to the
        # earlier content instead of silently overwriting it.
        sections[heading] = f"{previous}\n{body}".strip() if previous else body

    for line in lines:
        stripped = line.strip()
        if stripped in heading_set:
            store(current_section, buffer)
            current_section = stripped
            buffer = []
        else:
            buffer.append(stripped)

    store(current_section, buffer)
    return sections

# ================================
# SPA-CY FEATURE EXTRACTION
# ================================
# spaCy English pipeline shared by the feature-extraction code that follows.
nlp = spacy.load("en_core_web_md")

def filter_and_clean_noun_chunks(doc):
    """Return the distinct, lower-cased noun chunks of ``doc``.

    Chunks shorter than two characters, chunks made only of stop-word tokens
    and repeated chunks are dropped; first-seen order is kept.

    Args:
        doc: Object exposing ``noun_chunks``; each chunk has ``.text`` and
            iterates over tokens with an ``is_stop`` flag.

    Returns:
        List of cleaned chunk strings.
    """
    # A dict is used as an insertion-ordered set.
    unique = {}
    for span in doc.noun_chunks:
        key = span.text.strip().lower()
        if len(key) < 2 or key in unique:
            continue
        if any(not tok.is_stop for tok in span):
            unique[key] = None
    return list(unique)

def extract_section_spacy_features(sections):
    """Run the spaCy pipeline on each section and collect linguistic features.

    Args:
        sections: Dict mapping heading to section text.

    Returns:
        Dict mapping heading to a dict with the keys ``noun_chunks``,
        ``compounds``, ``verbal_nouns``, ``verbs`` and ``dates``, each a list
        of strings.
    """
    results = {}
    for heading, text in sections.items():
        # Collapse the section into one line so sentences are not cut at
        # line breaks.
        stripped_lines = (ln.strip() for ln in text.splitlines())
        doc = nlp(" ".join(ln for ln in stripped_lines if ln))

        # Noun chunks that contain at least one "compound" dependency,
        # de-duplicated in first-seen order.
        compound_chunks = []
        for span in doc.noun_chunks:
            if not any(tok.dep_ == "compound" for tok in span):
                continue
            lowered = span.text.strip().lower()
            if lowered not in compound_chunks:
                compound_chunks.append(lowered)

        # Lemmas of non-stop-word verbs, at least two characters long.
        verb_lemmas = {
            tok.lemma_
            for tok in doc
            if tok.pos_ == "VERB" and not tok.is_stop and len(tok.lemma_) > 1
        }
        # Tokens tagged VBG whose coarse part of speech is NOUN.
        gerunds = {tok.text for tok in doc if tok.tag_ == "VBG" and tok.pos_ == "NOUN"}
        # Named entities labelled DATE.
        date_texts = {ent.text for ent in doc.ents if ent.label_ == "DATE"}

        results[heading] = {
            "noun_chunks": filter_and_clean_noun_chunks(doc),
            "compounds": compound_chunks,
            "verbal_nouns": sorted(gerunds),
            "verbs": sorted(verb_lemmas),
            "dates": sorted(date_texts),
        }
    return results

# ================================
# MAIN - ORDERED TAGGED EXTRACT
# ================================
def tag_section_features_in_order(sections, section_features):
    """Print each section with its extracted features tagged inline.

    Every occurrence of a feature phrase in a line is rewritten as
    ``[matched text|type1,type2]``; the rest of the line is left untouched.
    Longer phrases win over shorter phrases that overlap them, and text that
    is already part of a tag is never tagged again.

    Args:
        sections: Dict mapping heading to section text.
        section_features: Dict mapping the same headings to a dict with the
            keys ``noun_chunks``, ``compounds``, ``verbal_nouns``, ``verbs``
            and ``dates`` (lists of strings).

    Returns:
        None. The tagged text is written to standard output.
    """
    import re

    feat_types = ('noun_chunks', 'compounds', 'verbal_nouns', 'verbs', 'dates')
    for heading, text in sections.items():
        print(f"\n### {heading.upper()} ###\n")
        features = section_features[heading]

        # Map each lower-cased phrase to the feature types it appears under.
        feat_map = {}
        for ft in feat_types:
            for val in features[ft]:
                val_low = val.lower().strip()
                if not val_low:
                    continue
                tags = feat_map.setdefault(val_low, [])
                if ft not in tags:
                    tags.append(ft)

        # Longest phrases first. Lookarounds are used instead of \b so that
        # phrases starting or ending with punctuation ("c++", "10+") can
        # still match as whole units.
        patterns = [
            (phrase, re.compile(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', re.IGNORECASE))
            for phrase in sorted(feat_map, key=len, reverse=True)
        ]

        for line in text.splitlines():
            line_str = line.strip()
            # Choose non-overlapping spans on the ORIGINAL line. Substituting
            # phrase by phrase into already-tagged text would nest tags and
            # could even match the tag labels themselves.
            spans = []
            for phrase, pattern in patterns:
                for m in pattern.finditer(line_str):
                    start, end = m.span()
                    if any(start < u_end and end > u_start for u_start, u_end, _ in spans):
                        continue
                    spans.append((start, end, phrase))
            spans.sort()

            pieces = []
            cursor = 0
            for start, end, phrase in spans:
                pieces.append(line_str[cursor:start])
                pieces.append('[{}|{}]'.format(line_str[start:end], ','.join(feat_map[phrase])))
                cursor = end
            pieces.append(line_str[cursor:])
            print(''.join(pieces))
        print("=" * 60)

if __name__ == "__main__":
    file_path = "3372246-student-nurse-resume-with-clinical-experience.pdf"
    extracted_output = extract_text(file_path)

    # The PDF path returns (blocks, page_chunks); the DOCX path returns only
    # the list of blocks, so both shapes are normalised here.
    if not isinstance(extracted_output, tuple):
        final_blocks, page_chunks = extracted_output, []
    else:
        final_blocks, page_chunks = extracted_output

    print(f"\nParsed Output from: {os.path.basename(file_path)}")
    print("\n================ RAW PARSED TEXT BLOCKS ================\n")
    for block in final_blocks:
        print(block, "-" * 40, sep="\n")

    print("\n================ SEMANTIC SECTIONING OUTPUT ================\n")
    sections = semantic_sectioning(final_blocks)
    for heading, content in sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n", "=" * 60, sep="\n")

    print("\n================ SPA-CY FEATURES PER SECTION ================\n")
    section_features = extract_section_spacy_features(sections)
    # (printed label, feature key) pairs, in display order.
    feature_labels = (
        ("Noun Chunks: ", "noun_chunks"),
        ("Compound Nouns: ", "compounds"),
        ("Verbal Nouns: ", "verbal_nouns"),
        ("Verbs: ", "verbs"),
        ("Dates: ", "dates"),
    )
    for heading, feats in section_features.items():
        print(f"\n### {heading.upper()} ###")
        for label, key in feature_labels:
            print(label, ", ".join(feats[key]))
        print("=" * 60)

    print("\n================ ORDERED TAGGED SEMANTIC EXTRACT ================\n")
    tag_section_features_in_order(sections, section_features)



Parsed Output from: 3372246-student-nurse-resume-with-clinical-experience.pdf


# Daniel Kim
----------------------------------------
I am a student nurse with extensive clinical rotation experience across various healthcare
settings. I am passionate about patient care and committed to applying clinical best
practices in a fast-paced environment.
----------------------------------------
## **Professional Experience**
----------------------------------------
CLINICAL ROTATIONS [|] MULTIPLE HOSPITALS, HOUSTON, TX
JANUARY 2023 – PRESENT
----------------------------------------
## • Completed over 300 clinical hours in medical-surgical, ICU, and pediatric units. • Assisted RNs with patient assessments and medication administration.
----------------------------------------
VOLUNTEER NURSING ASSISTANT [|] HOUSTON COMMUNITY CLINIC, HOUSTON, TX
JUNE 2022 – DECEMBER 2022
----------------------------------------
## • Provided bedside care and support to 10+ patients per shift. • Documented pati

In [None]:
test 2

In [11]:
import os
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    """Dispatch to the extractor that matches the file extension.

    Args:
        file_path: Path of the document; the extension is matched
            case-insensitively.

    Returns:
        Whatever the chosen extractor returns: ``extract_pdf_blocks`` for
        ``.pdf``, ``extract_docx_exact_layout`` for ``.docx`` / ``.doc``.

    Raises:
        ValueError: If the extension is none of the above.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_blocks(file_path)
    if suffix in (".docx", ".doc"):
        return extract_docx_exact_layout(file_path)
    raise ValueError("Unsupported file type. Use PDF or DOCX.")

def extract_docx_exact_layout(filepath):
    """Return every non-empty paragraph of a DOCX body, in document order.

    Args:
        filepath: Path handed to ``docx2python``.

    Returns:
        List of stripped paragraph strings; blank paragraphs are skipped.
    """
    # ``body`` is walked as four nested levels; the loop names follow the
    # section -> row -> cell -> paragraph reading used by this code.
    body = docx2python(filepath).body
    stripped = (
        para.strip()
        for section in body
        for row in section
        for cell in row
        for para in cell
    )
    return [para for para in stripped if para]

def extract_column_aware_blocks(pdf_path, column_gap=50):
    """Collect the text blocks of every page, grouped by page half.

    A block whose right edge lies more than ``column_gap`` page units left of
    the page centre goes to the "left" group; every other block (including
    full-width ones) goes to the "right" group. Each group is ordered by the
    block's top coordinate.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        column_gap: Margin subtracted from the page centre line when deciding
            whether a block belongs to the left group.

    Returns:
        List of non-empty, stripped block texts for the whole document.
    """
    all_blocks = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            center_line = page.rect.width / 2
            left_col, right_col = [], []
            for b in page.get_text("blocks", sort=True):
                _, y0, x1, _, text = b[:5]
                # The 7th item is the block type; non-zero marks an image
                # block whose "text" is only an "<image: ...>" placeholder.
                if len(b) > 6 and b[6] != 0:
                    continue
                target = left_col if x1 < center_line - column_gap else right_col
                target.append((y0, text.strip()))
            # Sorting the (y0, text) pairs fully determines the order, so no
            # separate pre-sort of the raw blocks is needed.
            left_col_sorted = [t for _, t in sorted(left_col)]
            right_col_sorted = [t for _, t in sorted(right_col)]
            # NOTE(review): the right group is emitted before the left group,
            # as in the original code - confirm this is the intended order.
            combined = right_col_sorted + left_col_sorted
            all_blocks.extend(t for t in combined if t)
    finally:
        # Release the file handle even if extraction fails part-way.
        doc.close()
    return all_blocks

def extract_pdf_blocks(pdf_path):
    """Extract text blocks from a PDF by merging two extraction passes.

    The markdown rendering produced by ``pymupdf4llm`` is the primary source.
    Blocks found by ``extract_column_aware_blocks`` that have no close match
    in the markdown output are appended after it so that text is not lost.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(final_blocks, page_chunk_blocks)`` tuple: the merged list of
        block strings, and the stripped ``"text"`` of every non-empty page
        chunk returned by ``pymupdf4llm.to_markdown(..., page_chunks=True)``.
    """
    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]

    def tokenize(text):
        # Keep only alphanumeric runs so markdown markup (#, **, bullets) and
        # punctuation do not make otherwise identical blocks look different.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return set(cleaned.split())

    def jaccard_similarity(set1, set2, threshold=0.7):
        # Word-level Jaccard. Comparing sets of *characters* (as was done
        # before) rates almost any two long English blocks as similar, because
        # they share most of the alphabet, so missing blocks were never kept.
        union = set1 | set2
        if not union:
            return False
        return len(set1 & set2) / len(union) > threshold

    # Tokenise the markdown blocks once instead of once per comparison.
    md_token_sets = [tokenize(block2) for block2 in blocks_code2]

    missing_from_code2 = []
    for block1 in blocks_code1:
        tokens1 = tokenize(block1)
        if not any(jaccard_similarity(tokens1, tokens2) for tokens2 in md_token_sets):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING
# ================================
# Sentence-embedding model used to compare candidate lines with known headings.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Reference section titles; semantic_heading_detection treats a line as a
# heading when its embedding is close enough to any of these.
canonical_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience","Core competencies"
]
# Embeddings of the reference titles, computed once when this code runs.
canonical_embeddings = model.encode(canonical_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, threshold=0.6):
    """Return the lines that look like section headings.

    A line is a heading when the cosine similarity between its embedding and
    the closest entry of ``canonical_embeddings`` is at least ``threshold``.

    Args:
        lines: Iterable of text lines/blocks.
        threshold: Minimum best-match cosine similarity for a heading.

    Returns:
        List of stripped heading lines, in input order. Lines shorter than
        three characters (after stripping and lower-casing) are ignored.
    """
    candidates = []
    queries = []
    for line in lines:
        lowered = line.strip().lower()
        if len(lowered) < 3:
            continue
        candidates.append(line.strip())
        queries.append(lowered)

    # Nothing to score: avoid calling the encoder with an empty batch.
    if not queries:
        return []

    # Encode every candidate in one batch instead of one model call per line.
    embeddings = model.encode(queries, convert_to_tensor=True)
    best_scores = util.cos_sim(embeddings, canonical_embeddings).max(dim=1).values
    return [
        candidate
        for candidate, score in zip(candidates, best_scores.tolist())
        if score >= threshold
    ]

def semantic_sectioning(lines, threshold=0.55):
    """Group lines into sections keyed by their detected heading.

    Args:
        lines: Text blocks in document order. Iterated twice, so pass a
            re-iterable container such as a list.
        threshold: Similarity threshold forwarded to
            ``semantic_heading_detection``.

    Returns:
        Dict mapping each heading to the newline-joined, stripped lines that
        follow it. Lines before the first heading are discarded and headings
        with no following lines are omitted.
    """
    # A set gives an O(1) heading test per line instead of a list scan.
    heading_set = set(semantic_heading_detection(lines, threshold))
    sections = {}
    current_section = None
    buffer = []

    def store(heading, collected):
        if not heading or not collected:
            return
        body = "\n".join(collected).strip()
        previous = sections.get(heading)
        # The same heading text can occur more than once; append to the
        # earlier content instead of silently overwriting it.
        sections[heading] = f"{previous}\n{body}".strip() if previous else body

    for line in lines:
        stripped = line.strip()
        if stripped in heading_set:
            store(current_section, buffer)
            current_section = stripped
            buffer = []
        else:
            buffer.append(stripped)

    store(current_section, buffer)
    return sections

# ================================
# SPA-CY FEATURE EXTRACTION
# ================================
# spaCy English pipeline shared by the feature-extraction code that follows.
nlp = spacy.load("en_core_web_md")

def filter_and_clean_noun_chunks(doc):
    """Return the distinct, lower-cased noun chunks of ``doc``.

    Chunks shorter than two characters, chunks made only of stop-word tokens
    and repeated chunks are dropped; first-seen order is kept.

    Args:
        doc: Object exposing ``noun_chunks``; each chunk has ``.text`` and
            iterates over tokens with an ``is_stop`` flag.

    Returns:
        List of cleaned chunk strings.
    """
    # A dict is used as an insertion-ordered set.
    unique = {}
    for span in doc.noun_chunks:
        key = span.text.strip().lower()
        if len(key) < 2 or key in unique:
            continue
        if any(not tok.is_stop for tok in span):
            unique[key] = None
    return list(unique)

def extract_section_spacy_features(sections):
    """Run the spaCy pipeline on each section and collect linguistic features.

    Args:
        sections: Dict mapping heading to section text.

    Returns:
        Dict mapping heading to a dict with the keys ``noun_chunks``,
        ``compounds``, ``verbal_nouns``, ``verbs`` and ``dates``, each a list
        of strings.
    """
    results = {}
    for heading, text in sections.items():
        # Collapse the section into one line so sentences are not cut at
        # line breaks.
        stripped_lines = (ln.strip() for ln in text.splitlines())
        doc = nlp(" ".join(ln for ln in stripped_lines if ln))

        # Noun chunks that contain at least one "compound" dependency,
        # de-duplicated in first-seen order.
        compound_chunks = []
        for span in doc.noun_chunks:
            if not any(tok.dep_ == "compound" for tok in span):
                continue
            lowered = span.text.strip().lower()
            if lowered not in compound_chunks:
                compound_chunks.append(lowered)

        # Lemmas of non-stop-word verbs, at least two characters long.
        verb_lemmas = {
            tok.lemma_
            for tok in doc
            if tok.pos_ == "VERB" and not tok.is_stop and len(tok.lemma_) > 1
        }
        # Tokens tagged VBG whose coarse part of speech is NOUN.
        gerunds = {tok.text for tok in doc if tok.tag_ == "VBG" and tok.pos_ == "NOUN"}
        # Named entities labelled DATE.
        date_texts = {ent.text for ent in doc.ents if ent.label_ == "DATE"}

        results[heading] = {
            "noun_chunks": filter_and_clean_noun_chunks(doc),
            "compounds": compound_chunks,
            "verbal_nouns": sorted(gerunds),
            "verbs": sorted(verb_lemmas),
            "dates": sorted(date_texts),
        }
    return results

# ================================
# MAIN - ONLY FEATURED TERMS OUTPUT
# ================================
def extract_tagged_features_only(sections, section_features):
    """Print, per line of each section, only the tagged feature matches.

    Each match is printed as ``[matched text|type1,type2]``; matches on one
    line are separated by single spaces and appear in the order in which they
    occur in that line. Longer phrases win over shorter overlapping ones.

    Args:
        sections: Dict mapping heading to section text.
        section_features: Dict mapping the same headings to a dict with the
            keys ``noun_chunks``, ``compounds``, ``verbal_nouns``, ``verbs``
            and ``dates`` (lists of strings).

    Returns:
        None. The result is written to standard output.
    """
    import re

    feat_types = ('noun_chunks', 'compounds', 'verbal_nouns', 'verbs', 'dates')
    for heading, text in sections.items():
        print(f"\n### {heading.upper()} ###\n")
        features = section_features[heading]

        # Map each lower-cased phrase to the feature types it appears under.
        # Empty phrases are skipped: they would match at every position.
        feat_map = {}
        for ft in feat_types:
            for val in features[ft]:
                val_low = val.lower().strip()
                if not val_low:
                    continue
                tags = feat_map.setdefault(val_low, [])
                if ft not in tags:
                    tags.append(ft)

        # Compile once per section, longest phrases first. Lookarounds are
        # used instead of \b so that phrases starting or ending with
        # punctuation ("c++", "10+") can still match as whole units.
        patterns = [
            (phrase, re.compile(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', re.IGNORECASE))
            for phrase in sorted(feat_map, key=len, reverse=True)
        ]

        for line in text.splitlines():
            line_str = line.strip()
            used_spans = []
            for phrase, pattern in patterns:
                for m in pattern.finditer(line_str):
                    start, end = m.span()
                    # Skip anything overlapping a longer, already-kept match.
                    if any(start < u_end and end > u_start for u_start, u_end, _ in used_spans):
                        continue
                    used_spans.append((start, end, phrase))
            # Emit matches in reading order rather than phrase-length order.
            used_spans.sort()
            output = [
                "[{}|{}]".format(line_str[start:end], ','.join(feat_map[phrase]))
                for start, end, phrase in used_spans
            ]
            print(' '.join(output))
        print("=" * 60)

if __name__ == "__main__":
    file_path = "3372246-student-nurse-resume-with-clinical-experience.pdf"
    extracted_output = extract_text(file_path)

    # The PDF path returns (blocks, page_chunks); the DOCX path returns only
    # the list of blocks, so both shapes are normalised here.
    if not isinstance(extracted_output, tuple):
        final_blocks, page_chunks = extracted_output, []
    else:
        final_blocks, page_chunks = extracted_output

    print("\n================ RAW PARSED TEXT BLOCKS ================\n")
    for block in final_blocks:
        print(block, "-" * 40, sep="\n")

    print("\n================ SEMANTIC SECTIONING OUTPUT ================\n")
    sections = semantic_sectioning(final_blocks)
    for heading, content in sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n", "=" * 60, sep="\n")

    print("\n================ SPA-CY FEATURES PER SECTION ================\n")
    section_features = extract_section_spacy_features(sections)
    # (printed label, feature key) pairs, in display order.
    feature_labels = (
        ("Noun Chunks: ", "noun_chunks"),
        ("Compound Nouns: ", "compounds"),
        ("Verbal Nouns: ", "verbal_nouns"),
        ("Verbs: ", "verbs"),
        ("Dates: ", "dates"),
    )
    for heading, feats in section_features.items():
        print(f"\n### {heading.upper()} ###")
        for label, key in feature_labels:
            print(label, ", ".join(feats[key]))
        print("=" * 60)

    print("\n================ FEATURES ONLY FROM ORDERED TAGGED SEMANTIC EXTRACT ================\n")
    extract_tagged_features_only(sections, section_features)




# Daniel Kim
----------------------------------------
I am a student nurse with extensive clinical rotation experience across various healthcare
settings. I am passionate about patient care and committed to applying clinical best
practices in a fast-paced environment.
----------------------------------------
## **Professional Experience**
----------------------------------------
CLINICAL ROTATIONS [|] MULTIPLE HOSPITALS, HOUSTON, TX
JANUARY 2023 – PRESENT
----------------------------------------
## • Completed over 300 clinical hours in medical-surgical, ICU, and pediatric units. • Assisted RNs with patient assessments and medication administration.
----------------------------------------
VOLUNTEER NURSING ASSISTANT [|] HOUSTON COMMUNITY CLINIC, HOUSTON, TX
JUNE 2022 – DECEMBER 2022
----------------------------------------
## • Provided bedside care and support to 10+ patients per shift. • Documented patient progress and collaborated with the healthcare team.
-----------------------