<span style="font-size: 120%">Obtain the required project data: identify the relevant
HTML sections of the job postings, then extract, transform, and load them for further analysis.</span>


In [1]:
import bs4
import pandas  as pd
import pathlib as pl


In [2]:
# load HTML, store relevant HTML sections in pandas data frame

def parse_sections(html):
    """Extract the title, body text and list-item texts from one HTML page.

    Parameters
    ----------
    html : str
        Raw HTML markup of a single page.

    Returns
    -------
    tuple
        ``(title, body, li_elems)`` where ``title`` and ``body`` are the
        whitespace-stripped text of the ``<title>`` and ``<body>`` tags
        (``''`` when the tag is absent) and ``li_elems`` is a list holding
        the stripped text of every ``<li>`` tag found in the page.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    title_tag, body_tag = soup.title, soup.body
    # A page without <title>/<body> gives None here; fall back to an empty
    # string instead of raising AttributeError and aborting the whole load.
    title = title_tag.text.strip() if title_tag is not None else ''
    body = body_tag.text.strip() if body_tag is not None else ''
    li_elems = [li.text.strip() for li in soup.find_all('li')]
    return (title, body, li_elems)

# Parse every file in the postings directory into one (title, body, li_elems) row.
parsed_pages = []
for path in pl.Path('data/html_job_postings/').glob('*'):
    # glob('*') also yields sub-directories; skip anything open() cannot read
    if not path.is_file():
        continue
    # the context manager closes the handle even if reading raises
    with open(path) as file:
        raw_page = file.read()
    parsed_pages.append(parse_sections(raw_page))

jobs_df = pd.DataFrame(parsed_pages, columns=['title', 'body', 'li_elems'])
jobs_df

Unnamed: 0,title,body,li_elems
0,Staff Software Engineer - Security - Redwood C...,Staff Software Engineer - Security - Redwood C...,[Design and develop platform security solution...
1,"Data Analyst, Customer Support - Boston, MA 02210","Data Analyst, Customer Support - Boston, MA 02...",[Work with the Customer Support teams to unify...
2,"SQL Database Administrator - San Francisco, CA...","SQL Database Administrator - San Francisco, CA...",[Monitors capacity and performance for product...
3,Senior Data Scientist - Risk Mitigation - Jers...,Senior Data Scientist - Risk Mitigation - Jers...,"[3+ years of experience crafting, developing, ..."
4,"Personal Care Assistant - Yuba City, CA","Personal Care Assistant - Yuba City, CA\nKNOWL...",[Actively participates in the resident's plan ...
...,...,...,...
1332,"Data Scientist, Models and Algorithms - San Fr...","Data Scientist, Models and Algorithms - San Fr...",[Risk and Fraud - Gusto processes >$10B of pay...
1333,"Data Scientist - New York, NY","Data Scientist - New York, NY\nDS/ML stack:\nL...","[Languages: Python, PySpark, SQL, Data Tools: ..."
1334,Business Planning and Operations Associate - S...,Business Planning and Operations Associate - S...,[Use qualitative and quantitative inputs to pa...
1335,Capital Work Readiness Analyst (Career) - San ...,Capital Work Readiness Analyst (Career) - San ...,[Communicates with peers and management inside...


In [3]:
# data cleaning (non-exhaustive)
# - remove duplicates (trivially)
# - filter for data science jobs given presence of non data science roles 
#   in the dataset: analyst, database admin, etc.

# additionally, more extensive cleaning would involve filtering out li elements
# that describe job benefits and other things not related to job requirements

# Report the frame's shape at each stage: as loaded, after the title filter,
# and after de-duplication.
print(jobs_df.shape)

# keep only rows whose title matches the data-science pattern (case-insensitive)
title_filter = 'title.str.contains(r"data scien(?:ce|tist)", case=False)'
jobs_df = jobs_df.query(title_filter).copy()
print(jobs_df.shape)

# lists are unhashable, so li_elems is turned into tuples before drop_duplicates
jobs_df = (jobs_df
    .assign(li_elems=jobs_df['li_elems'].apply(tuple))
    .drop_duplicates())
print(jobs_df.shape)

(1337, 3)
(497, 3)
(492, 3)


In [4]:
# display the titles that remain after cleaning
jobs_df.loc[:, 'title']

3       Senior Data Scientist - Risk Mitigation - Jers...
9       Director, Data Scientist, Natural Language Pro...
10        eCommerce Senior Data Scientist - United States
11      Associate Scientist, Data Science - Irving, TX...
12                           Data Scientist - Seattle, WA
                              ...                        
1329    Data Science Intern (BS / MS) - Intern - Sprin...
1330              PhD Data Science Intern - San Diego, CA
1332    Data Scientist, Models and Algorithms - San Fr...
1333                        Data Scientist - New York, NY
1336         SENIOR DATA SCIENTIST - Burlington, VT 05401
Name: title, Length: 492, dtype: object

In [5]:
# Persist the cleaned frame so subsequent notebooks can load it.
# NOTE: the file is zip-compressed but carries a .pkl extension, so readers
# presumably need pd.read_pickle(..., compression='zip') rather than relying
# on extension-based inference.
jobs_df.to_pickle(path='data/jobs_df.pkl', compression='zip')