> Import dependencies for Hands-on 1

In [2]:
import pandas as pd
import string
import requests
from bs4 import BeautifulSoup
import numpy as np

> Read and clean `software_development_usa.csv` dataset

In [3]:
def get_and_clean_data(
    path: str = "../../data/software_development_usa.csv",
) -> pd.Series:
    """Load the job-postings CSV and return normalised, de-duplicated descriptions.

    Each ``job_description`` has its ASCII punctuation and non-breaking spaces
    deleted (not replaced by a space), is lower-cased, and has every
    whitespace character turned into a plain space.

    Args:
        path: Location of the CSV file; it must contain a ``job_description``
            column. Defaults to the dataset path used by this notebook.

    Returns:
        A Series of cleaned description strings. Rows with a missing
        description and exact duplicates (after cleaning) are dropped; the
        surviving rows keep their original index labels.
    """
    raw_descriptions = pd.read_csv(path)["job_description"]
    # A missing description is read as NaN (a float), which has neither
    # ``translate`` nor ``lower``; drop those rows instead of crashing.
    descriptions = raw_descriptions.dropna()

    # One translation table performs both character-level steps in a single
    # pass: whitespace -> " ", punctuation and "\xa0" -> deleted.
    table = str.maketrans(
        string.whitespace,
        " " * len(string.whitespace),
        string.punctuation + "\xa0",
    )
    cleaned_description = descriptions.map(
        lambda text: text.translate(table).lower()
    )
    return cleaned_description.drop_duplicates()

In [4]:
# Load the cleaned descriptions and preview the first rows.
data = get_and_clean_data()
data.head()

0    the chosen sr software developer will be part ...
1    position c lead software developer location mi...
2    senior software developer hoboken nj starts as...
3    our client a multinational publishing and educ...
4    position c lead software developer location ph...
Name: job_description, dtype: object

> Tokenize the descriptions

In [5]:
def simple_tokenize(data: pd.Series) -> pd.Series:
    """Split each description into a list of whitespace-separated tokens.

    Args:
        data: Series of description strings.

    Returns:
        A Series with the same index whose values are lists of tokens; an
        empty or all-whitespace string yields an empty list.
    """
    # str.split() with no separator already discards leading, trailing and
    # repeated whitespace, so the tokens need no further stripping.
    return data.apply(lambda text: text.split())

In [6]:
# Bind the tokens to a new name so that re-running this cell does not try to
# tokenize an already-tokenized Series.
tokenized = simple_tokenize(data)
tokenized.head()

0    [the, chosen, sr, software, developer, will, b...
1    [position, c, lead, software, developer, locat...
2    [senior, software, developer, hoboken, nj, sta...
3    [our, client, a, multinational, publishing, an...
4    [position, c, lead, software, developer, locat...
Name: job_description, dtype: object

> Combine `get_and_clean_data()` and `simple_tokenize()` into `parse_job_description()`

In [7]:
def parse_job_description():
    """Load, clean and tokenize the job descriptions in one call.

    Returns:
        The result of ``simple_tokenize`` applied to the output of
        ``get_and_clean_data``.
    """
    return simple_tokenize(get_and_clean_data())

In [10]:
import nltk

# Fetch the NLTK resources this notebook relies on; the cell output shows the
# call only reports "already up-to-date" when they are present locally.
nltk.download("stopwords")  # word lists read through stopwords.words()
# NOTE(review): "punkt" is presumably for word_tokenize, which the visible
# cells import but never call — confirm it is still needed.
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


def inverse_indexing(data: pd.Series) -> dict:
    """Build an inverted index mapping each stemmed term to the rows containing it.

    Stop words are removed from every token list, the remaining tokens are
    Porter-stemmed, and each distinct stem is recorded against the index
    label of the row it came from.

    Args:
        data: Series whose values are lists of tokens; its index labels
            identify the documents.

    Returns:
        A dict ``{stem: set of index labels}``. Empty when ``data`` is empty.
    """
    # stopwords.words() without a language argument returns the stop words of
    # every language in the corpus, not only English.
    # NOTE(review): "c" is kept, presumably so the language name stays
    # searchable — confirm.
    sw_set = set(stopwords.words()) - {"c"}
    ps = PorterStemmer()

    invert_index = {}
    # Single pass over the documents; avoids scanning every document once per
    # unique term and also copes with an empty Series.
    for doc_id, tokens in data.items():
        stems = {ps.stem(w) for w in tokens if w not in sw_set}
        for stem in stems:
            invert_index.setdefault(stem, set()).add(doc_id)
    return invert_index

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def search(invert_index, query):
    """Return the ids of documents that contain every term of ``query``.

    The query is split on whitespace, lower-cased and Porter-stemmed, then the
    posting sets of all terms are intersected (AND semantics). Query terms are
    not filtered for stop words, so a term that is missing from the index for
    any reason makes the whole query match nothing.

    Args:
        invert_index: Mapping ``{stem: set of document ids}``.
        query: Free-text query, e.g. ``"java oracle"``.

    Returns:
        A list of matching document ids in arbitrary order. Empty when the
        query is blank or when any term is absent from the index.
    """
    ps = PorterStemmer()
    stemmed = [ps.stem(term.lower()) for term in query.split()]
    if not stemmed:
        # set.intersection() needs at least one operand.
        return []
    # An unknown term matches no document, so it contributes an empty set
    # instead of raising KeyError.
    postings = [invert_index.get(term, set()) for term in stemmed]
    return list(set.intersection(*postings))

In [14]:
data = parse_job_description()
invert_index = inverse_indexing(data)
query = "java oracle"
matched = search(invert_index, query)
# Reuse the already-parsed descriptions instead of re-reading and re-cleaning
# the CSV a second time just to display the hits.
print(data.loc[matched].apply(" ".join).head().to_markdown())

|      | job_description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                