> Import and Preprocessing dataset

In [1]:
import pandas as pd
import string
import requests
from bs4 import BeautifulSoup
import numpy as np

In [7]:
def get_and_clean_data() -> pd.DataFrame:
    data = pd.read_csv("../../data/software_development_usa.csv")
    description = data["job_description"]
    cleaned_description = description.apply(
        lambda s: s.translate(str.maketrans("", "", string.punctuation + "\xa0"))
    )
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(
        lambda s: s.translate(
            str.maketrans(string.whitespace, " " * len(string.whitespace), "")
        )
    )
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

In [8]:
def simple_tokenize(data: pd.DataFrame) -> pd.DataFrame:
    cleaned_description = data.apply(lambda s: [x.strip() for x in s.split()])
    return cleaned_description

In [9]:
def parse_job_description():
    cleaned_description = get_and_clean_data()
    cleaned_description = simple_tokenize(cleaned_description)
    return cleaned_description

In [10]:
def parse_db() -> list[list[str]]:
    html_doc = requests.get("https://db-engines.com/en/ranking").content
    soup = BeautifulSoup(html_doc, "html.parser")
    db_table = soup.find("table", {"class": "dbi"})
    all_db = [
        "".join(s.find("a").findAll(text=True, recursive=True)).strip()
        for s in db_table.findAll("th", {"class": "pad-l"})
    ]
    all_db = list(dict.fromkeys(all_db))
    db_list = all_db[:10]
    db_list = [s.lower() for s in db_list]
    db_list = [[[x.strip() for x in s.split()][0]] for s in db_list]
    return db_list

In [11]:
cleaned_db = parse_db()
parsed_description = parse_job_description()

  "".join(s.find("a").findAll(text=True, recursive=True)).strip()


##### What DB should I learn after Java?
Oracle is the database that you should learn after Java.

In [12]:
with_oracle: list[pd.DataFrame] = [None] * len(cleaned_db)
for i, db in enumerate(cleaned_db):
    with_oracle[i] = parsed_description.apply(
        lambda s: np.all([x in s for x in db]) and "java" in s
    ).sum()

    print(
        " ".join(db)
        + " + java: "
        + str(with_oracle[i])
        + " of "
        + str(parsed_description.shape[0])
    )

oracle + java: 913 of 7583
mysql + java: 397 of 7583
microsoft + java: 448 of 7583
postgresql + java: 161 of 7583
mongodb + java: 166 of 7583
redis + java: 40 of 7583
elasticsearch + java: 112 of 7583
ibm + java: 135 of 7583
sqlite + java: 5 of 7583
microsoft + java: 448 of 7583


##### Which DB is in demand alongside oracle?
MySQL is the DB that you should learn alongside oracle.

In [13]:
with_oracle: list[pd.DataFrame] = [None] * len(cleaned_db)
for i, db in enumerate(cleaned_db):
    if db[0] == "oracle":
        continue

    with_oracle[i] = parsed_description.apply(
        lambda s: np.all([x in s for x in db]) and "oracle" in s
    ).sum()

    print(
        " ".join(db)
        + " + oracle: "
        + str(with_oracle[i])
        + " of "
        + str(parsed_description.shape[0])
    )

mysql + oracle: 312 of 7583
microsoft + oracle: 282 of 7583
postgresql + oracle: 100 of 7583
mongodb + oracle: 104 of 7583
redis + oracle: 12 of 7583
elasticsearch + oracle: 32 of 7583
ibm + oracle: 84 of 7583
sqlite + oracle: 17 of 7583
microsoft + oracle: 282 of 7583


#### What programming language is in demand alongside python?
Java is the programming language is in demand alongside python.

In [14]:
langs = [
    ["java"],
    ["python"],
    ["c"],
    ["kotlin"],
    ["swift"],
    ["rust"],
    ["ruby"],
    ["scala"],
    ["julia"],
    ["lua"],
]

with_python: list[pd.DataFrame] = [None] * len(langs)
for i, db in enumerate(langs):
    if db[0] == "python":
        continue

    with_oracle[i] = parsed_description.apply(
        lambda s: np.all([x in s for x in db]) and "python" in s
    ).sum()

    print(
        " ".join(db)
        + " + python: "
        + str(with_oracle[i])
        + " of "
        + str(parsed_description.shape[0])
    )

java + python: 830 of 7583
c + python: 689 of 7583
kotlin + python: 6 of 7583
swift + python: 37 of 7583
rust + python: 6 of 7583
ruby + python: 181 of 7583
scala + python: 76 of 7583
julia + python: 1 of 7583
lua + python: 11 of 7583
