In [None]:
import pandas as pd
raw_data=pd.read_csv('0_raw_dataset/job_description.csv')
print(len(raw_data))

df = pd.read_csv('0_raw_dataset/cleaned_jd_location.csv')
len(df)

22000


7188

# check noise data & duplicate data

In [70]:
df = df[df["job_description"] != "Please apply only if you are qualified."]
print(len(df))

# there are many duplicate value
dupes = df[df["job_description"].duplicated(keep=False)]
same_rows = dupes.groupby("job_description").filter(
    lambda g: g.drop(columns=["job_description"]).nunique().sum() == 0
)

print(f"Same job description but different companies, locations, etc., after checking: {len(same_rows)}")

# drop them
duplicates = df["job_description"].value_counts()
num_duplicate_rows = duplicates[duplicates > 1].sum()
print("Total number of duplicate job_description lines:", num_duplicate_rows)
# number not too much, drop them
df = df[~df["job_description"].duplicated(keep=False)]
print(len(df))


7158
Same job description but different companies, locations, etc., after checking: 0
Total number of duplicate job_description lines: 1852
5306


# check key features in job description only using keywords

In [71]:
# === Responsibilities / Duties  ===
duties_pattern = (
    r"(?i)\b("
    r"responsibilit(?:y|ies)|"
    r"duties|"
    r"tasks|"
    r"key responsibilities|"
    r"primary responsibilities|"
    r"main duties|"
    r"essential duties|"
    r"job duties|"
    r"core responsibilities|"
    r"what you['’]ll do|"
    r"what you will do|"
    r"your role|"
    r"role overview|"
    r"day[- ]to[- ]day|"
    r"main responsibilities|"
    r"responsibilities|"
    r"key deliverables|"
    r"accountabilities|"
    r"scope of work|"
    r"what this role does"
    r")\b"
)


duties_mask = df["job_description"].str.contains(duties_pattern, case=False, na=False)


# === Requirements / Qualifications ===
req_pattern = (
    r"(?i)\b("
    r"requirements?|"
    r"qualifications?|"
    r"skills and experience|"
    r"required skills?|"
    r"preferred skills?|"
    r"experience required|"
    r"experience and education|"
    r"knowledge, skills|"
    r"competencies?|"
    r"core competencies?|"
    r"what you['’]ll need|"
    r"what you need|"
    r"what we expect|"
    r"what we['’]re looking for|"
    r"who you are|"
    r"what you bring|"
    r"your profile|"
    r"about you|"
    r"ideal candidate|"
    r"candidate profile|"
    r"person specification|"
    r"key attributes|"
    r"traits we['’]re seeking|"
    r"essential criteria|"
    r"selection criteria|"
    r"minimum qualifications?|"
    r"preferred qualifications?|"
    r"desired qualifications?|"
    r"education and experience|"
    r"education requirements?|"
    r"background required"
    r")\b"
)


req_mask = df["job_description"].str.contains(req_pattern, case=False, na=False)

# === Education / Degree / Certification  ===
edu_pattern = (
    r"(?i)\b("
    r"education|educational background|"
    r"degree[s]?|bachelor'?s|master'?s|ph\.?d|doctorate|mba|major|"
    r"college degree|university degree|high school diploma|ged|institution|"
    r"associate'?s degree|advanced degree|graduate degree|undergraduate degree|"
    r"certification[s]?|certified|license|required license|"
    r"credential[s]?|training required|academic background|academic requirements"
    r")\b"
)
edu_mask = df["job_description"].str.contains(edu_pattern, case=False, na=False)




total = len(df)
duties_count = duties_mask.sum()
req_count = req_mask.sum()
edu_count = edu_mask.sum()
both_count = (duties_mask & req_mask).sum()
all_three_count = (duties_mask & req_mask & edu_mask).sum()

print(f"Total rows: {total}")
print(f"Duties-related: {duties_count} ({duties_count/total:.2%})")
print(f"Requirements-related: {req_count} ({req_count/total:.2%})")
print(f"Education-related: {edu_count} ({edu_count/total:.2%})")
print(f"Both (Duties+Requirements): {both_count} ({both_count/total:.2%})")
print(f"All three present: {all_three_count} ({all_three_count/total:.2%})")




  duties_mask = df["job_description"].str.contains(duties_pattern, case=False, na=False)
  req_mask = df["job_description"].str.contains(req_pattern, case=False, na=False)
  edu_mask = df["job_description"].str.contains(edu_pattern, case=False, na=False)


Total rows: 5306
Duties-related: 3658 (68.94%)
Requirements-related: 3896 (73.43%)
Education-related: 3882 (73.16%)
Both (Duties+Requirements): 3001 (56.56%)
All three present: 2549 (48.04%)


# only keep Duties、Requirements、Education

In [73]:
"""
clean_jd.py
-----------
Cleans Job Description (JD) text and detects structural sections.

Functions:
1. Removes HTML tags, whitespace, and formatting noise.
2. Removes EEO statements, legal disclaimers, contact info, and job template sections.
3. Retains key content such as responsibilities, requirements, and skills.
4. Detects presence of key sections (Responsibilities / Requirements / Education).
5. Supports batch cleaning for DataFrames and outputs:
   - `jd_cleaned`
   - section flags: has_duties, has_requirements, has_education
   - coverage statistics summary.
"""

import re
import pandas as pd
from tqdm import tqdm


# === Basic Cleaning ===
def basic_clean(text: str) -> str:
    """Remove HTML tags, normalize whitespace, and trim."""
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r"<[^>]+>", " ", text)   # Remove HTML tags
    text = re.sub(r"\s+", " ", text)       # Collapse multiple spaces
    return text.strip()


# === Remove Noise (legal, EEO, contact info, templates) ===
def remove_noise_sections(text: str) -> str:
    """Remove EEO, legal disclaimers, benefits, and template filler text."""
    noise_patterns = [
        r"(?i)equal opportunity.*",
        r"(?i)affirmative action.*",
        r"(?i)we are an equal.*",
        r"(?i)www\.[^\s]+",
        r"(?i)visit our website.*",
        r"(?i)apply now.*",
        r"(?i)please send.*resume.*",
        r"(?i)drug test.*",
        r"(?i)disability.*",
        r"(?i)insurance.*",
        r"(?i)benefits include.*",
        r"(?i)401k.*",
        r"(?i)bonus.*plan.*",
        r"(?i)salary.*DOE.*",
        r"(?i)click here.*",
        r"(?i)privacy policy.*",
        r"(?i)background check.*",
    ]
    for pat in noise_patterns:
        text = re.sub(pat, " ", text)
    return re.sub(r"\s+", " ", text).strip()


# === Section Detection Patterns ===
DUTIES_PATTERN = (
    r"(?i)\b("
    r"responsibilit(?:y|ies)|duties|tasks|"
    r"key responsibilities|primary responsibilities|main duties|"
    r"essential duties|job duties|core responsibilities|"
    r"what you['’]ll do|what you will do|your role|role overview|"
    r"day[- ]to[- ]day|main responsibilities|key deliverables|"
    r"accountabilities|scope of work|what this role does"
    r")\b"
)

REQ_PATTERN = (
    r"(?i)\b("
    r"requirements?|qualifications?|skills and experience|"
    r"required skills?|preferred skills?|experience required|"
    r"experience and education|knowledge, skills|competencies?|"
    r"core competencies?|what you['’]ll need|what you need|"
    r"what we expect|what we['’]re looking for|who you are|"
    r"what you bring|your profile|about you|ideal candidate|"
    r"candidate profile|person specification|key attributes|"
    r"traits we['’]re seeking|essential criteria|selection criteria|"
    r"minimum qualifications?|preferred qualifications?|desired qualifications?|"
    r"education and experience|education requirements?|background required"
    r")\b"
)

EDU_PATTERN = (
    r"(?i)\b("
    r"education|educational background|degree[s]?|bachelor'?s|master'?s|ph\.?d|doctorate|"
    r"mba|major|college degree|university degree|high school diploma|ged|institution|"
    r"associate'?s degree|advanced degree|graduate degree|undergraduate degree|"
    r"certification[s]?|certified|license|required license|credential[s]?|"
    r"training required|academic background|academic requirements"
    r")\b"
)


# === Extract Relevant Sections (using patterns above) ===
def extract_relevant_sections(text: str) -> str:
    """
    Extract relevant JD sections using extended regex patterns.
    Uses DUTIES_PATTERN, REQ_PATTERN, and EDU_PATTERN.
    If no sections found, returns the entire text.
    """
    sections = []

    # --- Duties / Responsibilities ---
    duties_match = re.search(DUTIES_PATTERN + r"[:\-\n\r]+(.*?)(?=\b[A-Z][a-z]{2,}\b[:\-\n\r]|$)",
                             text, flags=re.DOTALL)
    if duties_match:
        content = re.sub(r"\s+", " ", duties_match.group(1)).strip()
        if len(content) > 50:
            sections.append(f"Duties: {content}")

    # --- Requirements / Qualifications ---
    req_match = re.search(REQ_PATTERN + r"[:\-\n\r]+(.*?)(?=\b[A-Z][a-z]{2,}\b[:\-\n\r]|$)",
                          text, flags=re.DOTALL)
    if req_match:
        content = re.sub(r"\s+", " ", req_match.group(1)).strip()
        if len(content) > 50:
            sections.append(f"Requirements: {content}")

    # --- Education / Degree ---
    edu_match = re.search(EDU_PATTERN + r"[:\-\n\r]+(.*?)(?=\b[A-Z][a-z]{2,}\b[:\-\n\r]|$)",
                          text, flags=re.DOTALL)
    if edu_match:
        content = re.sub(r"\s+", " ", edu_match.group(1)).strip()
        if len(content) > 50:
            sections.append(f"Education: {content}")

    return " ".join(sections) if sections else text


# === Full Cleaning Pipeline for Single JD ===
def clean_jd_text(text: str) -> str:
    """Apply basic, noise removal, and section extraction to one JD."""
    text = basic_clean(text)
    text = remove_noise_sections(text)
    text = extract_relevant_sections(text)
    return text


# === DataFrame-Level Cleaning & Section Tagging ===
def clean_jd_dataframe(df: pd.DataFrame, source_col: str = "job_description") -> pd.DataFrame:
    """
    Clean all job descriptions in a DataFrame and detect JD structure.
    Creates:
        - `jd_cleaned` (cleaned text)
        - `has_duties`, `has_requirements`, `has_education` flags
    Prints structure coverage statistics.
    """
    tqdm.pandas(desc="Cleaning job descriptions")
    df = df.copy()

    # Clean text
    df["jd_cleaned"] = df[source_col].progress_apply(clean_jd_text)

    # Section detection flags
    df["has_duties"] = df[source_col].str.contains(DUTIES_PATTERN, case=False, na=False)
    df["has_requirements"] = df[source_col].str.contains(REQ_PATTERN, case=False, na=False)
    df["has_education"] = df[source_col].str.contains(EDU_PATTERN, case=False, na=False)
    df["has_all_three"] = df["has_duties"] & df["has_requirements"] & df["has_education"]

    # === Summary Statistics ===
    total = len(df)
    duties_count = df["has_duties"].sum()
    req_count = df["has_requirements"].sum()
    edu_count = df["has_education"].sum()
    all_three = df["has_all_three"].sum()

    print(f"\n=== JD Structure Summary ===")
    print(f"Total rows: {total}")
    print(f"Duties-related: {duties_count} ({duties_count/total:.2%})")
    print(f"Requirements-related: {req_count} ({req_count/total:.2%})")
    print(f"Education-related: {edu_count} ({edu_count/total:.2%})")
    print(f"All three present: {all_three} ({all_three/total:.2%})")

    # Optionally filter by text length
    # df = df[df["jd_cleaned"].str.len().between(100, 2000)]

    return df.reset_index(drop=True)


In [74]:
cleaned_df = clean_jd_dataframe(df)

Cleaning job descriptions: 100%|██████████| 5306/5306 [00:03<00:00, 1619.84it/s]
  df["has_duties"] = df[source_col].str.contains(DUTIES_PATTERN, case=False, na=False)
  df["has_requirements"] = df[source_col].str.contains(REQ_PATTERN, case=False, na=False)
  df["has_education"] = df[source_col].str.contains(EDU_PATTERN, case=False, na=False)



=== JD Structure Summary ===
Total rows: 5306
Duties-related: 3658 (68.94%)
Requirements-related: 3896 (73.43%)
Education-related: 3882 (73.16%)
All three present: 2549 (48.04%)


In [79]:
# === Count not-null and null in job_description ===
total_rows = len(cleaned_df)
not_null_count = cleaned_df["job_description"].notna().sum()
null_count = cleaned_df["job_description"].isna().sum()

empty_string_count = (cleaned_df["job_description"].str.strip() == "").sum()
valid_count = total_rows - null_count - empty_string_count

print("=== job_description Completeness Summary ===")
print(f"Total rows: {total_rows}")
print(f"Non-null (not NaN): {not_null_count} ({not_null_count/total_rows:.2%})")
print(f"Null (NaN): {null_count} ({null_count/total_rows:.2%})")
print(f"Empty strings: {empty_string_count} ({empty_string_count/total_rows:.2%})")
print(f"Valid non-empty job_description: {valid_count} ({valid_count/total_rows:.2%})")


=== job_description Completeness Summary ===
Total rows: 5306
Non-null (not NaN): 5306 (100.00%)
Null (NaN): 0 (0.00%)
Empty strings: 0 (0.00%)
Valid non-empty job_description: 5306 (100.00%)
