In [4]:
# !pip install beautifulsoup4

In [5]:
import os
import pandas as pd
from bs4 import BeautifulSoup

In [6]:
# Define the directory containing the job posting HTML files
job_postings_dir = "./data/job_postings"

# Prepare a list to store the file name and its parsed text
records = []

# Loop over each file in the job_postings directory
for file_name in os.listdir(job_postings_dir):
    if file_name.endswith(".html"):
        file_path = os.path.join(job_postings_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract text from the HTML
            parsed_text = soup.get_text(separator=' ', strip=True)

            # Store results
            records.append({
                "filename": file_name,
                "parsed_text": parsed_text
            })

# Create a pandas DataFrame from the records
df_jobs = pd.DataFrame(records, columns=["filename", "parsed_text"])

# Display the first few rows of the DataFrame
df_jobs.head()


Unnamed: 0,filename,parsed_text
0,fb17842d02292e83.html,"Python Systems Engineer - San Francisco, CA 94..."
1,06e2c7659a3199e9.html,Santa Clara 4-H Community Ed Specialist 3 - Oa...
2,c49bf5a9b76b6943.html,Senior Software Airworthiness Engineer (2019-0...
3,0c569e6055392385.html,"Data Center Technician - Hayward, CA Data Cent..."
4,0a22c5c79af5fcf8.html,"Data Architect - Raleigh, NC 27609 Data Archit..."


In [7]:
from collections import Counter

tags_counter = Counter()

for file_name in os.listdir(job_postings_dir):
    if file_name.endswith(".html"):
        file_path = os.path.join(job_postings_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # Count all tag types
            for tag in soup.find_all():
                tags_counter[tag.name] += 1

# Convert the Counter to a DataFrame to visualize
df_tags = pd.DataFrame(tags_counter.items(), columns=['tag', 'count']).sort_values(by='count', ascending=False)
df_tags.head(10)


Unnamed: 0,tag,count
11,li,25197
5,div,14149
12,p,11627
7,br,10756
9,b,7793
10,ul,5078
4,h2,2067
0,html,1458
2,title,1458
1,head,1458


In [9]:
import re

skill_keywords = ["python", "sql", "machine learning", "data analysis", "statistics", "nlp", 
                  "deep learning", "cloud", "aws", "azure", "gcp", "etl", "excel", "power bi"]

skill_tag_counts = Counter()

for file_name in os.listdir(job_postings_dir):
    if file_name.endswith(".html"):
        file_path = os.path.join(job_postings_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # For each HTML element, check if it contains any skill keywords
            for tag in soup.find_all():
                tag_text = tag.get_text(separator=' ', strip=True).lower()
                if any(skill in tag_text for skill in skill_keywords):
                    skill_tag_counts[tag.name] += 1

df_skill_tags = pd.DataFrame(skill_tag_counts.items(), columns=['tag', 'count']).sort_values(by='count', ascending=False)
df_skill_tags

Unnamed: 0,tag,count
7,li,4892
5,div,2957
6,ul,2078
0,html,1346
3,body,1346
9,p,998
4,h2,149
1,head,128
2,title,128
8,b,106
