In [None]:
the developing code 

In [15]:
import os
import fitz  # PyMuPDF
import pymupdf4llm
from docx2python import docx2python
from sentence_transformers import SentenceTransformer, util
import spacy
import re

# ================================
# UNIVERSAL TEXT EXTRACTION
# ================================
def extract_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_pdf_blocks(file_path)
    elif ext in [".docx", ".doc"]:
        return extract_docx_exact_layout(file_path)
    else:
        raise ValueError("Unsupported file type. Use PDF or DOCX.")

def extract_docx_exact_layout(filepath):
    doc_result = docx2python(filepath)
    main_content = doc_result.body
    all_text = []
    for section in main_content:
        for row in section:
            for cell in row:
                for para in cell:
                    para_str = para.strip()
                    if para_str:
                        all_text.append(para_str)
    return all_text

def extract_column_aware_blocks(pdf_path, column_gap=50):
    doc = fitz.open(pdf_path)
    all_blocks = []
    for page in doc:
        blocks = page.get_text("blocks", sort=True)
        blocks = sorted(blocks, key=lambda b: (b[0], b[1]))
        left_col, right_col = [], []
        if blocks:
            page_width = page.rect.width
            center_line = page_width / 2
            for b in blocks:
                x0, y0, x1, y1, text, *_ = b
                if x1 < center_line - column_gap:
                    left_col.append((y0, text.strip()))
                else:
                    right_col.append((y0, text.strip()))
            left_col_sorted = [t for _, t in sorted(left_col)]
            right_col_sorted = [t for _, t in sorted(right_col)]
            combined = right_col_sorted + left_col_sorted
            all_blocks.extend([t for t in combined if t])
    doc.close()
    return all_blocks

def extract_pdf_blocks(pdf_path):
    blocks_code1 = extract_column_aware_blocks(pdf_path)
    md_text = pymupdf4llm.to_markdown(pdf_path)
    blocks_code2 = [
        block.strip()
        for block in md_text.split("\n\n")
        if block.strip() and len(block.strip()) > 5
    ]
    md_chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    page_chunk_blocks = [
        chunk["text"].strip()
        for chunk in md_chunks
        if isinstance(chunk, dict) and "text" in chunk and chunk["text"].strip()
    ]
    def jaccard_similarity(s1, s2, threshold=0.7):
        set1, set2 = set(s1.lower().strip()), set(s2.lower().strip())
        intersection = set1 & set2
        union = set1 | set2
        if not union:
            return False
        return len(intersection) / len(union) > threshold
    missing_from_code2 = []
    for block1 in blocks_code1:
        if not any(jaccard_similarity(block1, block2) for block2 in blocks_code2):
            missing_from_code2.append(block1)
    final_blocks = blocks_code2 + missing_from_code2
    return final_blocks, page_chunk_blocks

# ================================
# SEMANTIC HEADING DETECTION + SECTION GROUPING (Occurrence aware!)
# ================================
resume_headings = [
    "experience", "work experience", "education", "academic history",
    "projects", "technical projects", "skills", "key skills",
    "certifications", "achievements", "publications", "personal details", "contact",
    "technical skills", "professional experience", "core competencies", "community service",
    "portfolio management project", "project director"
]

jd_headings = [
    "job position", "role title", "job responsibilities", "responsibilities",
    "duties", "minimum qualifications", "requirements", "qualifications",
    "preferred qualifications", "desired skills"
]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
resume_embeddings = model.encode(resume_headings, convert_to_tensor=True)
jd_embeddings = model.encode(jd_headings, convert_to_tensor=True)

def semantic_heading_detection(lines, heading_list, embeddings, threshold=0.6):
    detected_headings = []
    for line in lines:
        line_clean = line.strip().lower()
        if not line_clean or len(line_clean) < 3:
            continue
        emb = model.encode(line_clean, convert_to_tensor=True)
        cosine_scores = util.cos_sim(emb, embeddings)[0]
        max_score = float(cosine_scores.max())
        if max_score >= threshold:
            detected_headings.append(line.strip())
    return detected_headings

def semantic_sectioning(lines, for_jd=False, threshold=0.55):
    headings = jd_headings if for_jd else resume_headings
    embeddings = jd_embeddings if for_jd else resume_embeddings
    detected_headings = semantic_heading_detection(lines, headings, embeddings, threshold)
    sections = {}
    current_section = None
    buffer = []
    section_counter = {}
    for line in lines:
        if line.strip() in detected_headings:
            if current_section and buffer:
                key = current_section
                count = section_counter.get(key, 0) + 1
                section_counter[key] = count
                section_key = f"{key} ({count})"
                sections[section_key] = "\n".join(buffer).strip()
            current_section = line.strip()
            buffer = []
        else:
            buffer.append(line.strip())
    if current_section and buffer:
        key = current_section
        count = section_counter.get(key, 0) + 1
        section_counter[key] = count
        section_key = f"{key} ({count})"
        sections[section_key] = "\n".join(buffer).strip()
    return sections

# ================================
# SPA-CY FEATURE EXTRACTION (with numbers)
# ================================
nlp = spacy.load("en_core_web_md")

def filter_and_clean_noun_chunks(doc):
    seen = set()
    clean_chunks = []
    for chunk in doc.noun_chunks:
        text = chunk.text.strip().lower()
        if not text or len(text) < 2:
            continue
        if all(token.is_stop for token in chunk):
            continue
        if text in seen:
            continue
        seen.add(text)
        clean_chunks.append(text)
    return clean_chunks

def extract_section_spacy_features(sections):
    section_features = {}
    for heading, text in sections.items():
        flat = " ".join(line.strip() for line in text.splitlines() if line.strip())
        doc = nlp(flat)
        noun_chunks = filter_and_clean_noun_chunks(doc)
        verbs = sorted(set([t.lemma_ for t in doc if t.pos_ == "VERB" and not t.is_stop and len(t.lemma_) > 1]))
        compounds = []
        for chunk in doc.noun_chunks:
            if any(t.dep_ == "compound" for t in chunk):
                compound_text = chunk.text.strip().lower()
                if compound_text not in compounds:
                    compounds.append(compound_text)
        verbal_nouns = sorted(set([t.text for t in doc if t.tag_ == "VBG" and t.pos_ == "NOUN"]))
        dates = sorted(set([ent.text for ent in doc.ents if ent.label_ == "DATE"]))
        proper_nouns = sorted(set([t.text for t in doc if t.pos_ == "PROPN" and len(t.text.strip()) > 1]))
        # Capture numbers (tokens with POS NUM and entities with relevant labels)
        numbers = set([t.text for t in doc if t.pos_ == "NUM"])
        numbers = numbers.union({ent.text for ent in doc.ents if ent.label_ in ["CARDINAL", "QUANTITY", "ORDINAL", "MONEY"]})
        section_features[heading] = {
            "noun_chunks": noun_chunks,
            "compounds": compounds,
            "verbal_nouns": verbal_nouns,
            "verbs": verbs,
            "dates": dates,
            "proper_nouns": proper_nouns,
            "numbers": sorted(numbers)
        }
    return section_features

def print_section_parts_of_speech(sections):
    for heading, text in sections.items():
        print(f"\n### PARTS OF SPEECH IN SECTION: {heading.upper()} ###\n")
        doc = nlp(text)
        for token in doc:
            print(f"{token.text}\t{token.pos_}")
        print("=" * 60)

# ================================
# MAIN - ORDERED TAGGED EXTRACT (now tags numbers!)
# ================================
def tag_section_features_in_order(sections, section_features):
    for heading, text in sections.items():
        print(f"\n### {heading.upper()} ###\n")
        lines = text.splitlines()
        features = section_features[heading]
        # Add "numbers" to feat_types!
        feat_types = ['noun_chunks', 'compounds', 'verbal_nouns', 'verbs', 'dates', 'proper_nouns', 'numbers']
        feat_map = {}
        for ft in feat_types:
            for val in features.get(ft, []):
                val_low = val.lower().strip()
                if val_low not in feat_map:
                    feat_map[val_low] = []
                feat_map[val_low].append(ft)
        phrases_sorted = sorted(feat_map.keys(), key=lambda x: -len(x))
        for line in lines:
            line_str = line.strip()
            output = line_str
            for phrase in phrases_sorted:
                if phrase and phrase in line_str.lower():
                    tag = ','.join(feat_map[phrase])
                    pat = r'(?i)\b({})\b'.format(re.escape(phrase))
                    output = re.sub(pat, r'[\1|{}]'.format(tag), output)
            print(output)
        print("=" * 60)

# ================================
# MAIN APPLICATION
# ================================
def process_files(resume_path, jd_path):
    resume_extracted = extract_text(resume_path)
    if isinstance(resume_extracted, tuple):
        resume_blocks, _ = resume_extracted
    else:
        resume_blocks = resume_extracted

    jd_extracted = extract_text(jd_path)
    if isinstance(jd_extracted, tuple):
        jd_blocks, _ = jd_extracted
    else:
        jd_blocks = jd_extracted

    resume_sections = semantic_sectioning(resume_blocks, for_jd=False)
    jd_sections = semantic_sectioning(jd_blocks, for_jd=True)

    resume_features = extract_section_spacy_features(resume_sections)
    jd_features = extract_section_spacy_features(jd_sections)

    print("\n================ RESUME SECTIONS ================\n")
    for heading, content in resume_sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n")
        print("=" * 60)

    print("\n================ JOB DESCRIPTION SECTIONS ================\n")
    for heading, content in jd_sections.items():
        print(f"\n### {heading.upper()} ###\n{content}\n")
        print("=" * 60)

    print("\n================ SPA-CY FEATURES PER RESUME SECTION ================\n")
    for heading, feats in resume_features.items():
        print(f"\n### {heading.upper()} ###")
        print("Noun Chunks: ", ", ".join(feats["noun_chunks"]))
        print("Compound Nouns: ", ", ".join(feats["compounds"]))
        print("Verbal Nouns: ", ", ".join(feats["verbal_nouns"]))
        print("Verbs: ", ", ".join(feats["verbs"]))
        print("Dates: ", ", ".join(feats["dates"]))
        print("Proper Nouns: ", ", ".join(feats["proper_nouns"]))
        print("Numbers: ", ", ".join(feats["numbers"]))
        print("=" * 60)

    print("\n================ SPA-CY FEATURES PER JD SECTION ================\n")
    for heading, feats in jd_features.items():
        print(f"\n### {heading.upper()} ###")
        print("Noun Chunks: ", ", ".join(feats["noun_chunks"]))
        print("Compound Nouns: ", ", ".join(feats["compounds"]))
        print("Verbal Nouns: ", ", ".join(feats["verbal_nouns"]))
        print("Verbs: ", ", ".join(feats["verbs"]))
        print("Dates: ", ", ".join(feats["dates"]))
        print("Proper Nouns: ", ", ".join(feats["proper_nouns"]))
        print("Numbers: ", ", ".join(feats["numbers"]))
        print("=" * 60)

    print("\n================ ORDERED TAGGED SEMANTIC EXTRACT (RESUME) ================\n")
    tag_section_features_in_order(resume_sections, resume_features)

    print("\n================ ORDERED TAGGED SEMANTIC EXTRACT (JOB DESCRIPTION) ================\n")
    tag_section_features_in_order(jd_sections, jd_features)

    # If desired, print raw POS tagging too:
    # print_section_parts_of_speech(resume_sections)
    # print_section_parts_of_speech(jd_sections)

if __name__ == "__main__":
    resume_path = "3590975-restaurant-server-job-description-resume-example.pdf"         # Replace with your resume file path
    jd_path = "C:\\Users\\zenit\\Downloads\\job description\\for 6 word.docx"   # Replace with your job description file path
    process_files(resume_path, jd_path)





### ## **PROFESSIONAL EXPERIENCE** (1) ###
Server
Bistro 56, Chicago, IL | May 2020 - Present
Provide service to 80+ customers daily, consistently achieving a guest
satisfaction rating of 95%
Actively upsell daily specials and premium beverages, contributing to a 10%
increase in average check amounts
Collaborate with kitchen and front-of-house teams to deliver fast and efficient


### SERVICE (1) ###
Server
Downtown Diner, Chicago, IL | June 2017 - April 2020
Ensured guest orders were accurate and delivered in a timely manner,
achieving consistent positive feedback
Assisted in managing guest complaints and resolving issues promptly to
maintain high satisfaction levels
Maintained cleanliness and organization of dining areas, supporting overall
restaurant efficiency


### HIGH SCHOOL DIPLOMA (1) ###
Chicago High School, Chicago, IL | May 2017


### CONTACT (1) ###
(555) 234-5678
rachelmatthews@email.com
LinkedIn | Portfolio
Chicago, IL 60601


### CUSTOMER SERVICE (1) ###
Upselling te