In [1]:
# NOTEBOOK OVERVIEW
# Obtains the required project data: loads the job-posting HTML files,
# then identifies and extracts the relevant HTML sections for further analysis.


In [2]:
from pathlib import Path
import bs4

%matplotlib inline
import numpy
import pandas
from pandas import DataFrame


In [3]:
# load HTML, store relevant
# HTML sections in pandas data frame

def parse_sections(html):
    """Extract the title, body text and bullet points from one job posting.

    Parameters
    ----------
    html
        Raw HTML markup of a single posting.

    Returns
    -------
    tuple
        ``(title, body, bullets)``: the text of the ``<title>`` element, the
        text of the ``<body>`` element, and a list with the
        whitespace-stripped text of every ``<li>`` element.  A missing
        ``<title>`` or ``<body>`` yields an empty string rather than raising
        ``AttributeError``, so one malformed posting cannot abort a bulk load.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    # soup.title / soup.body are None when the corresponding tag is absent
    title = soup.title.text if soup.title is not None else ''
    body = soup.body.text if soup.body is not None else ''
    bullets = [item.text.strip() for item in soup.find_all('li')]
    return (title, body, bullets)

# Parse every posting in the data directory into one (title, body, bullets)
# row.  glob() yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the resulting row indices reproducible between runs.
cleaned_pages = []
for path in sorted(Path('data/html_job_postings/').glob('*')):
    if not path.is_file():
        # '*' also matches sub-directories, which cannot be opened for reading
        continue
    # the context manager closes the file even if parsing raises
    with open(path) as file:
        cleaned_pages.append(parse_sections(file.read()))

jobs_df = DataFrame(cleaned_pages, columns=['title', 'body', 'bullets'])
jobs_df.head()


Unnamed: 0,title,body,bullets
0,Staff Software Engineer - Security - Redwood C...,Staff Software Engineer - Security - Redwood C...,[Design and develop platform security solution...
1,"Data Analyst, Customer Support - Boston, MA 02210","Data Analyst, Customer Support - Boston, MA 02...",[Work with the Customer Support teams to unify...
2,"SQL Database Administrator - San Francisco, CA...","SQL Database Administrator - San Francisco, CA...",[Monitors capacity and performance for product...
3,Senior Data Scientist - Risk Mitigation - Jers...,Senior Data Scientist - Risk Mitigation - Jers...,"[3+ years of experience crafting, developing, ..."
4,"Personal Care Assistant - Yuba City, CA","Personal Care Assistant - Yuba City, CA\nKNOWL...",[Actively participates in the resident's plan ...


In [4]:
# - filter for data science jobs given presence of non data science roles
#   in the data set: analyst, database admin, etc.
# - remove duplicates

print(jobs_df.shape)

# keep only postings whose title mentions "data science" / "data scientist"
is_data_science = jobs_df['title'].str.contains(
    r'data scien(?:ce|tist)', case=False)
jobs_df = jobs_df[is_data_science].copy()

print(jobs_df.shape)

# Lists are unhashable, so drop_duplicates() cannot compare the bullets
# column until each list is converted to a tuple.  Series.apply has no axis
# parameter; the former stray positional `1` was bound to its second
# positional parameter instead, so it is dropped here.
jobs_df['bullets'] = jobs_df['bullets'].apply(tuple)
jobs_df = jobs_df.drop_duplicates()
print(jobs_df.shape)

(1337, 3)
(497, 3)
(492, 3)


In [5]:

# preview the titles of the first 20 postings that remain in jobs_df
jobs_df['title'].iloc[:20]

3     Senior Data Scientist - Risk Mitigation - Jers...
9     Director, Data Scientist, Natural Language Pro...
10      eCommerce Senior Data Scientist - United States
11    Associate Scientist, Data Science - Irving, TX...
12                         Data Scientist - Seattle, WA
16       Data Science Specialist - El Segundo, CA 90245
18                         Data Scientist - Seattle, WA
19                       Senior Data Scientist - Remote
21    Data Scientist, Regulation, Evaluation, and Go...
23    Data Scientist Manager - Hiring in Burbank! - ...
24               Data Scientist - Los Angeles, CA 90017
25         2020 Intern - Data Scientist - United States
29                   Data Scientist - Seattle, WA 98101
32    Data Science & Tagging Analyst - Bethesda, MD ...
45                   Data Scientist - San Francisco, CA
48    Data Scientist - Global Research & Analytic De...
49    Data Scientist and Visualization Specialist - ...
55    Data Scientist (Financial Services) - San 

In [6]:
# Persist the data frame so that subsequent notebooks can load it.
# NOTE(review): the file is zip-compressed but carries a .pkl extension, so
# readers presumably need to pass compression='zip' explicitly -- confirm in
# the consuming notebooks.
jobs_df.to_pickle(path='data/jobs_df.pkl', compression='zip')