In [129]:
import csv
import re

import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

## Set vars

In [130]:
p = Path("html_job_postings")
idx = range(len([i for i in p.iterdir()]))
cols = ["SourceFile", "Title", "Body", "ListItems"]
df = pd.DataFrame(index=idx, columns=cols)

## Parse HTML files and load into dataframe

In [131]:
def parse_files(p, df):
    for idx, html in enumerate(p.iterdir()):
        with open(html, "r") as f:
            soup = BeautifulSoup(f, "html.parser")
            df.loc[idx, "SourceFile"] = html.name
            df.loc[idx, "Title"] = soup.title.contents[0]
            df.loc[idx, "Body"] = soup.body.get_text()
            df.loc[idx, "ListItems"] = [li.get_text() for li in soup("li")]
    return df

parse_files(p, df)
df

Unnamed: 0,SourceFile,Title,Body,ListItems
0,1e92960a19ffdd34_fccid.html,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]
1,3157fcef3ee474da_fccid.html,"Data Scientist - Mountain View, CA","Data Scientist - Mountain View, CA\nGroundTrut...","[\nHelp senior members of the team to explore,..."
2,b423ca22a6e2c10f_fccid.html,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nA Bachelor or Ma...",[A Bachelor or Masters Degree in a highly quan...
3,ea487254a487beb5_fccid.html,Senior Natural Language Processing (NLP) Engin...,Senior Natural Language Processing (NLP) Engin...,[Join a small team creating a proprietary NLU ...
4,cb8a5bce330854e9_fccid.html,"FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...","FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...",[]
...,...,...,...,...
1332,af6a31da7a74b6ac_fccid.html,"Data Scientist - Birmingham, AL 35233","Data Scientist - Birmingham, AL 35233\nSUMMARY...",[]
1333,ad12f415dc3e5e8b_fccid.html,"Senior Data Scientist - Tempe, AZ","Senior Data Scientist - Tempe, AZ\nCircle K is...",[Independently develop advanced analytics and ...
1334,5141331158ad3ba2_fccid.html,New College Grad - Cybersecurity (Masters Degr...,New College Grad - Cybersecurity (Masters Degr...,"[\nAdvanced Degree (e.g. Masters, MBA, JD, MD,..."
1335,abc816859ac7d358_fccid.html,"Decision Science Manager, Media Mix Modeling -...","Decision Science Manager, Media Mix Modeling -...",[\nAs a Decision Science Manager for media mix...


## Remove duplicates

In [132]:
rows_before = df.index.size
df.drop_duplicates("Body", inplace=True)
rows_after = df.index.size
print(f"Duplicate rows removed: {rows_before - rows_after:,}")
print(f"Remaining rows: {rows_after:,}")

Duplicate rows removed: 9
Remaining rows: 1,328



## Filter to only data science jobs

In [133]:
# with pd.option_context('display.max_colwidth', 100, 'display.max_rows', 10000):
#     display(df.loc[:, "Title"])

search_terms = [
    "data science",
    "data scientist",
    "natual language processing",
    "nlp",
    "decision science",
    "statistician",
    "deep learning",
    "neural network",
    "statistical",
    "statistics",
    "data analyst",
    "data analysis",
    "machine learning",
    "predictive modeling",
]

rows_before = df.index.size
df = df[df.Body.str.contains("|".join(search_terms))]
rows_after = df.index.size
print(f"Non-data science rows removed: {rows_before - rows_after:,}")
print(f"Remaining rows: {rows_after:,}")

Non-data science rows removed: 264
Remaining rows: 1,064


## Save df to disk

In [136]:
df.to_csv("data_science_job_postings.csv", columns=df.columns)