In [1]:
# !pip install beautifulsoup4

In [23]:
import os, re
import pandas as pd
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords

In [3]:
# Define the directory containing the job posting HTML files
job_postings_dir = "./data/job_postings"

# Prepare a list to store the file name and its parsed text
records = []

# Loop over each file in the job_postings directory
for file_name in os.listdir(job_postings_dir):
    if file_name.endswith(".html"):
        file_path = os.path.join(job_postings_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract text from the HTML
            parsed_text = soup.get_text(separator=' ', strip=True)

            # Store results
            records.append({
                "filename": file_name,
                "parsed_text": parsed_text
            })

# Create a pandas DataFrame from the records
df_jobs = pd.DataFrame(records, columns=["filename", "parsed_text"])

# Display the first few rows of the DataFrame
df_jobs.head()


Unnamed: 0,filename,parsed_text
0,fb17842d02292e83.html,"Python Systems Engineer - San Francisco, CA 94..."
1,06e2c7659a3199e9.html,Santa Clara 4-H Community Ed Specialist 3 - Oa...
2,c49bf5a9b76b6943.html,Senior Software Airworthiness Engineer (2019-0...
3,0c569e6055392385.html,"Data Center Technician - Hayward, CA Data Cent..."
4,0a22c5c79af5fcf8.html,"Data Architect - Raleigh, NC 27609 Data Archit..."


In [4]:

tags_counter = Counter()

for file_name in os.listdir(job_postings_dir):
    if file_name.endswith(".html"):
        file_path = os.path.join(job_postings_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # Count all tag types
            for tag in soup.find_all():
                tags_counter[tag.name] += 1

# Convert the Counter to a DataFrame to visualize
df_tags = pd.DataFrame(tags_counter.items(), columns=['tag', 'count']).sort_values(by='count', ascending=False)
df_tags.head(10)


Unnamed: 0,tag,count
11,li,25197
5,div,14149
12,p,11627
7,br,10756
9,b,7793
10,ul,5078
4,h2,2067
0,html,1458
2,title,1458
1,head,1458


In [5]:
df_tags.shape

(16, 2)

In [6]:

skill_keywords = ["python", "sql", "machine learning", "data analysis", "statistics", "nlp", 
                  "deep learning", "cloud", "aws", "azure", "gcp", "etl", "excel", "power bi"]

skill_tag_counts = Counter()

for file_name in os.listdir(job_postings_dir):
    if file_name.endswith(".html"):
        file_path = os.path.join(job_postings_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # For each HTML element, check if it contains any skill keywords
            for tag in soup.find_all():
                tag_text = tag.get_text(separator=' ', strip=True).lower()
                if any(skill in tag_text for skill in skill_keywords):
                    skill_tag_counts[tag.name] += 1

df_skill_tags = pd.DataFrame(skill_tag_counts.items(), columns=['tag', 'count']).sort_values(by='count', ascending=False)
df_skill_tags

Unnamed: 0,tag,count
7,li,4892
5,div,2957
6,ul,2078
0,html,1346
3,body,1346
9,p,998
4,h2,149
1,head,128
2,title,128
8,b,106


In [22]:
tags_of_interest = ["li", "div", "ul", "p", "h2", "head", "title", "b"]

# Initialize columns (set them to 0 for starters)
for t in tags_of_interest:
    col_name = f"{t}_skill_count"
    df_jobs[col_name] = 0

# Also track total skill mentions in entire doc
df_jobs["total_skill_count"] = 0

# Directory with HTML job postings
job_postings_dir = "./data/job_postings"

def count_skills_in_text(text, skill_list):
    """
    Given a piece of text, return how many skill keywords appear.
    We count each skill presence once, even if it appears multiple times.
    If you want to count duplicates, you'd do something more thorough.
    """
    text_lower = text.lower()
    count = 0
    for skill in skill_list:
        if skill in text_lower:
            count += 1
    return count

# Iterate through df_jobs
for idx, row in df_jobs.iterrows():
    filename = row["filename"]
    file_path = os.path.join(job_postings_dir, filename)
    
    # Open and parse the HTML
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, "html.parser")
    except FileNotFoundError:
        # If the file doesn't exist or there's an error, skip
        continue
    
    # Track a total skill count across entire doc
    total_skills_in_doc = count_skills_in_text(soup.get_text(separator=' ', strip=True), skill_keywords)
    df_jobs.loc[idx, "total_skill_count"] = total_skills_in_doc
    
    # Now, for each tag_of_interest, find all tags and sum skill counts
    for t in tags_of_interest:
        skill_count_for_tag = 0
        
        # find_all(...) gets all occurrences of that tag
        tags_found = soup.find_all(t)
        
        # Sum up how many skill keywords are present in each tag's text
        for tag_element in tags_found:
            tag_text = tag_element.get_text(separator=' ', strip=True)
            # Either increment by 1 for each skill found
            # or do a cumulative sum if you want to count duplicates
            skill_count_for_tag += count_skills_in_text(tag_text, skill_keywords)
        
        col_name = f"{t}_skill_count"
        df_jobs.loc[idx, col_name] = skill_count_for_tag


tag_columns = [f"{t}_skill_count" for t in tags_of_interest]
corr_matrix = df_jobs[tag_columns + ["total_skill_count"]].corr()
corr_matrix

Unnamed: 0,li_skill_count,div_skill_count,ul_skill_count,p_skill_count,h2_skill_count,head_skill_count,title_skill_count,b_skill_count,total_skill_count
li_skill_count,1.0,0.214475,0.831619,0.067449,0.100034,0.129614,0.129614,0.036548,0.76351
div_skill_count,0.214475,1.0,0.299251,-0.009092,0.024719,0.04501,0.04501,-0.025377,0.270085
ul_skill_count,0.831619,0.299251,1.0,0.121295,0.073415,0.093643,0.093643,0.057626,0.654729
p_skill_count,0.067449,-0.009092,0.121295,1.0,0.098797,0.11969,0.11969,0.237086,0.244742
h2_skill_count,0.100034,0.024719,0.073415,0.098797,1.0,0.941607,0.941607,0.14075,0.113611
head_skill_count,0.129614,0.04501,0.093643,0.11969,0.941607,1.0,1.0,0.078656,0.134332
title_skill_count,0.129614,0.04501,0.093643,0.11969,0.941607,1.0,1.0,0.078656,0.134332
b_skill_count,0.036548,-0.025377,0.057626,0.237086,0.14075,0.078656,0.078656,1.0,0.045698
total_skill_count,0.76351,0.270085,0.654729,0.244742,0.113611,0.134332,0.134332,0.045698,1.0


In [7]:
# Step 1: Read resume.txt
with open("./data/resume.txt", "r", encoding="utf-8") as f:
    resume_text = f.read()
    
print("Resume loaded. Characters in resume:", len(resume_text))

# Define potential skill keywords (you can expand this list)
potential_skills = [
    "python", "r", "sql", "java", "c++", "machine learning", "ml", 
    "deep learning", "nlp", "ai", "tensorflow", "pytorch", 
    "data analysis", "etl", "spark", "hadoop", "excel", 
    "tableau", "power bi", "aws", "azure", "gcp", "matplotlib", "numpy", 
    "pandas", "scikit-learn", "pyspark", "dbscan", "plotly", "gcp", 
    "scipy", "pandas", "k-means", "geonamescache", "basemap", "pca"
    "svd", "nlp", "clustering", "algorithms", "statistic", "statistical",
    "analysis", "visualization", 
]

# Convert resume text to lower case for matching
resume_lower = resume_text.lower()

# Determine which skill keywords actually appear in the resume
resume_skills = set()
for skill in potential_skills:
    if skill in resume_lower:
        resume_skills.add(skill)

print("Skills identified in resume:")
for skill in sorted(resume_skills):
    print("-", skill)

# 3. COUNT HOW MANY RESUME SKILLS ARE MENTIONED IN EACH JOB POST
def count_resume_skill_matches(posting_text, resume_skills):
    text_lower = posting_text.lower()
    return sum(skill in text_lower for skill in resume_skills)

df_jobs["skill_count"] = df_jobs["parsed_text"].apply(
    lambda text: count_resume_skill_matches(text, resume_skills)
)

# 4. DEFINE THRESHOLDS AND CREATE A FLAG
min_skill_matches = 3  # too few => possibly underqualified
max_skill_matches = 8  # too many => possibly overqualified
df_jobs["is_in_sweet_spot"] = (
    (df_jobs["skill_count"] >= min_skill_matches) & 
    (df_jobs["skill_count"] <= max_skill_matches)
)

Resume loaded. Characters in resume: 683
Skills identified in resume:
- algorithms
- analysis
- basemap
- clustering
- dbscan
- geonamescache
- k-means
- matplotlib
- nlp
- numpy
- pandas
- r
- scikit-learn
- scipy
- statistic
- statistical
- visualization


In [9]:
df_jobs.sample(5)

Unnamed: 0,filename,parsed_text,skill_count,is_in_sweet_spot
1039,af6e9e4d7d155600.html,"Junior Data Scientist - New York, NY Junior Da...",8,True
1406,2cb5a41aafebf8c5.html,"Business Analyst, Business Planning and Operat...",4,True
1375,a559b6630c13783d.html,"Junior Data Scientist - College Park, MD 20740...",5,True
1118,d4741f73485ccdb3.html,"Biological Data Scientist - San Francisco, CA ...",4,True
137,ea8ca858cb88cf60.html,Siri Language Engineer - Hebrew - Santa Clara ...,4,True


In [10]:
df_jobs.is_in_sweet_spot.value_counts()

is_in_sweet_spot
True     997
False    461
Name: count, dtype: int64

In [17]:
df_jobs.loc[df_jobs['is_in_sweet_spot'] == True, ['parsed_text', 'is_in_sweet_spot']].sample(5)

Unnamed: 0,parsed_text,is_in_sweet_spot
1428,"Clinical Data Scientist - San Francisco, CA Cl...",True
868,"Senior Data Scientist - Pasadena, CA Senior Da...",True
809,"Data Visualization Intern - Chicago, IL 60601 ...",True
1077,PCA Quality Assurance Reviewer #776 - Lawrence...,True
503,"Audit Manager - Dallas, TX 75228 Audit Manager...",True


In [18]:
df_jobs.loc[df_jobs['is_in_sweet_spot'] == False, ['parsed_text', 'is_in_sweet_spot']].sample(5)

Unnamed: 0,parsed_text,is_in_sweet_spot
1224,Sr. A&P Technician Customer Support USA - Unit...,False
1299,Senior Learning Instructional Designer - San F...,False
321,"Senior Data Scientist - San Francisco, CA 9410...",False
1161,"Nursing Assist,ICU/CCU,15117,36 hrs-7a-7:30p,E...",False
1069,"Physics Data Scientist - Foothill Ranch, CA Ph...",False


In [25]:
# 1. Define your stop words, excluding 'r' if you specifically
#    want to allow "R" as a legitimate skill mention:
stop_words = set(stopwords.words('english'))
# But we do *not* want to exclude "r" used as the letter for the R language:
if "r" in stop_words:
    stop_words.remove("r")  # now 'r' won't be filtered out

# 2. A helper function to tokenize and remove punctuation/stopwords
def tokenize_and_clean(text):
    """
    1. Lowercase
    2. Remove punctuation
    3. Split on non-word chars
    4. Remove English stopwords (except 'r')
    5. Return a list of clean tokens
    """
    # Lowercase
    text = text.lower()
    
    # Split on non-alphanumeric characters
    tokens = re.split(r"\W+", text)
    
    # Remove any empty strings from the split
    tokens = [t for t in tokens if t]
    
    # Filter out standard stop words (but we've already excluded 'r')
    # Also exclude single-letter tokens *other* than 'r', if you want:
    clean_tokens = []
    for tok in tokens:
        # Example: If you also want to preserve single-letter 'c' or 'c++' references, adjust logic
        # For now, we'll let them pass unless they're in stopwords
        if tok in stop_words:
            continue
        clean_tokens.append(tok)
    
    return clean_tokens

# 3. Make sure you have your df_jobs with columns: ["filename", ...]
#    We'll create a new column, "li_tokens", which will be a list of tokens found in <li> blocks.

# Initialize/clear the column first (optional):
df_jobs["li_tokens"] = [[] for _ in range(len(df_jobs))]

job_postings_dir = "./data/job_postings"

# 4. Iterate through each row, parse <li> content, tokenize, store results
for idx, row in df_jobs.iterrows():
    filename = row["filename"]
    file_path = os.path.join(job_postings_dir, filename)
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except FileNotFoundError:
        # If the file is missing, skip or handle as needed
        continue
    
    # Parse HTML
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Find all <li> tags in the doc
    li_elements = soup.find_all("li")
    
    # For each <li> block, extract text, tokenize, and accumulate
    all_li_tokens = []
    for li in li_elements:
        li_text = li.get_text(separator=" ", strip=True)
        tokens = tokenize_and_clean(li_text)
        all_li_tokens.extend(tokens)
    
    # Store the aggregated tokens in df_jobs
    df_jobs.at[idx, "li_tokens"] = all_li_tokens

# Now df_jobs["li_tokens"] is a list of strings (tokens) extracted from all <li> tags.
df_jobs.sample(15).T

Unnamed: 0,828,1411,528,390,111,227,1302,450,644,1343,479,640,886,1241,413
filename,7ebdbbca83161bb3.html,a6ec27672ae68532.html,632ac322a23b9e0b.html,f311ba12e14134d2.html,b0cadc08400aa1c7.html,84f76ff2786d57c2.html,6ccb22e92f4c5c8a.html,6acf54368233122a.html,d57c2ecf28d37921.html,f09832212857f895.html,f4fbbee62edb5453.html,39e55e5c319077fb.html,259ffd5289fca87d.html,1429f3ee5730607a.html,6d936b7369dfe0b4.html
parsed_text,"Senior Data Analyst - Santa Cruz, CA 95060 Sen...",Data Analytics Teaching Assistant - UC Berkele...,Data Sciences & AI Graduate Programme - US - W...,Architectural Engineering Associate San Franci...,"QSR Store Assistant - Pleasanton, TX QSR Store...","Data Scientist - Columbus, OH Data Scientist -...","Diagnostic Technician 1 - San Jose, CA 95119 D...","Actuarial Analyst II - Oakland, CA Actuarial A...","Data Scientist - Green Bay, WI 54313 Data Scie...",Natural Language Processing Scientist - San Jo...,Software Developer – Machine-learning/Artifici...,"Intern - Clinical Toxicology - Salt Lake City,...","Support Engineer - San Francisco, CA 94104 Sup...","Data Engineer - San Francisco, CA Data Enginee...",Data Analytics Teaching Assistant - University...
skill_count,5,3,3,1,2,4,3,4,7,5,6,4,3,4,3
is_in_sweet_spot,True,True,True,False,False,True,True,True,True,True,True,True,True,True,True
li_skill_count,1,3,6,0,0,5,0,4,7,12,5,0,3,7,3
p_skill_count,0,0,4,0,0,0,0,0,0,0,0,2,0,0,0
strong_skill_count,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
b_skill_count,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
span_skill_count,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
div_skill_count,3,0,0,0,0,7,0,3,8,27,0,12,0,0,0
