In [1]:
import json
import os
import time
from datetime import datetime
from urllib.parse import parse_qs, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from markdownify import markdownify as md

# Read the key from the environment so it never has to be pasted into (and
# committed with) the notebook. Set it before starting Jupyter, e.g.
# `export OPENAI_API_KEY=...`. The "..." placeholder is kept only as a fallback
# so the name is always defined.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "...")

In [2]:
# Chat model handed to the extraction chain that is built further down.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    max_tokens=2000,
    temperature=0,
)

In [11]:
def pull_data_from_url(token, timeout=10):
    """
    Pull the job postings published on a company's Greenhouse job board.

    :param token: the board token (company keyword) that goes into the Greenhouse URL
    :param timeout: seconds to wait for the server before giving up on the request
    :return: the value stored under the ``jobs`` key of the JSON response, or
        ``None`` when the request fails, the body is not valid JSON, or the
        response carries no ``jobs`` key
    """
    # url to pull job data from
    url = f'https://boards-api.greenhouse.io/v1/boards/{token}/jobs?content=true'

    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-level failures are handled here; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        print ("Cannot request URL! Try again!")
        return None

    status_code = response.status_code
    try:
        jobs = response.json()['jobs']
    except (KeyError, TypeError, ValueError):
        # KeyError: JSON object without a 'jobs' key (e.g. an error payload).
        # TypeError: JSON body that is not an object.
        # ValueError: body that is not JSON at all.
        print(f"Cannot find urls with keyword \"{token}\" given! Try again.")
        return None
    print (f"{token}: {status_code}, Found {len(jobs)} jobs")

    return jobs

In [30]:
jobs = pull_data_from_url("okta")
# pull_data_from_url returns None when the lookup fails, and a board can also
# have no postings; stop here with a clear message instead of failing on jobs[0].
if not jobs:
    raise RuntimeError('No jobs were returned for "okta"; nothing to preview.')
print("Preview of the first job:") # checking out the first position
print (json.dumps(jobs[0]))

okta: 200, Found 149 jobs
Preview of the first job:
{"absolute_url": "https://www.okta.com/company/careers/opportunity/4977199?gh_jid=4977199", "data_compliance": [{"type": "gdpr", "requires_consent": false, "requires_processing_consent": false, "requires_retention_consent": false, "retention_period": null}], "internal_job_id": 2518868, "location": {"name": "Melbourne "}, "metadata": null, "id": 4977199, "updated_at": "2023-05-12T11:53:14-04:00", "requisition_id": "P17320_2518868", "title": "Account Executive, ISV & DNB Sales", "content": "&lt;div class=&quot;content-intro&quot;&gt;&lt;p&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;strong&gt;Get to know Okta&lt;/strong&gt;&lt;/span&gt;&lt;/p&gt;\n&lt;p&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;br&gt;&lt;/span&gt;Okta is The World\u2019s Identity Company. We free everyone to safely use any technology\u2014anywhere, on any device or app. Our Workforce and Customer Identity Clouds enable secure yet flexible access, auth

In [37]:
def get_job_id(job):
    """Get the job id from the job's JSON object.

    The id is taken from ``absolute_url``: the ``gh_jid`` query parameter when
    the URL carries one, otherwise the last non-empty path segment. The query
    string and any trailing slash are never part of the result. When the URL is
    missing or yields nothing, the ``id`` field of the job is used instead.

    :param job: a job dict as returned by the Greenhouse job board API
    :return: the job id as a string, or ``None`` if it cannot be determined
    """
    url = job.get("absolute_url")
    if url:
        parts = urlsplit(url)
        # Company-hosted career pages put the id in ?gh_jid=<id>.
        gh_jid = parse_qs(parts.query).get("gh_jid")
        if gh_jid:
            return gh_jid[0]
        # Only the path is split, so a query string can no longer leak into
        # the id.
        segments = [segment for segment in parts.path.split("/") if segment]
        if segments:
            return segments[-1]
    fallback = job.get("id")
    return None if fallback is None else str(fallback)

def get_job_description(job):
    """Display some important information about the job and return its content as text.

    :param job: a job dict with the keys ``id``, ``absolute_url``,
        ``updated_at`` (ISO 8601 timestamp), ``title`` and ``content``
    :return: the job content after conversion by markdownify
    """
    # Everything is read from the `job` argument, not from a notebook global,
    # so the function works for whichever posting is passed in.
    print(f"Job ID: {job['id']}")
    print(f"Link: {job['absolute_url']}")
    updated_at = datetime.fromisoformat(job['updated_at'])
    # The day comes from `.day` rather than strftime('%-d'): the '-' flag is a
    # platform-specific extension and is not available everywhere.
    print(f"Updated At: {updated_at:%B} {updated_at.day}, {updated_at:%Y}")
    print(f"Title: {job['title']}\n")
    # The sample API response shows `content` as entity-escaped HTML, so
    # get_text() yields the unescaped markup and markdownify then converts
    # that markup to Markdown text.
    content_text = md(BeautifulSoup(job['content'], 'html.parser').get_text())
    print(f"Content:\n{content_text}")
    return content_text
# Try the two helpers on the first posting returned by the API.
job_description = jobs[0]
job_id = get_job_id(job_description)
# `text` is what gets passed to the extraction chain in a later cell.
text = get_job_description(job_description)

Job ID: 4977199
Link: https://www.okta.com/company/careers/opportunity/4977199?gh_jid=4977199
Updated At: May 12, 2023
Title: Account Executive, ISV & DNB Sales

Content:
**Get to know Okta**


  
Okta is The World’s Identity Company. We free everyone to safely use any technology—anywhere, on any device or app. Our Workforce and Customer Identity Clouds enable secure yet flexible access, authentication, and automation that transforms how people move through the digital world, putting Identity at the heart of business security and growth.   
  
At Okta, we celebrate a variety of perspectives and experiences. We are not looking for someone who checks every single box, we’re looking for lifelong learners and people who can make us better with their unique experiences.   
  
Join our team! We’re building a world where Identity belongs to you.

**Job Purpose**


Many of the world’s largest digital businesses rely on Okta every day to power customer interactions in a world where every compan

Next, we use Kor to extract the skills mentioned in the job description into a JSON object.

In [34]:
# The single field extracted for every item: the name of the skill.
skill_name = Text(
    id="skill",
    description="The name of a tool, framework, or programming language"
)

# Sample sentences, each paired with the list of items expected from it.
skill_examples = [
    (
        "Experience in working with Pytorch, or Python, Apache Spark is a plus.",
        [{"skill": "Pytorch"}, {"skill": "Python"}, {"skill": "Apache Spark"}],
    ),
    ("Experience with Microsoft Excel", [{"skill": "Microsoft Excel"}]),
    ("You must have at least 3 years of experience with AWS", [{"skill": "AWS"}]),
]

# Extraction schema: results are collected under the "skills" id, and
# many=True is set so that several items can be returned.
tools = Object(
    id="skills",
    description="""
        A tool, framework, or programming language that the job listed as requirements.
    """,
    attributes=[skill_name],
    examples=skill_examples,
    many=True,
)

In [35]:
# Build the Kor extraction chain from the chat model (`llm`) and the `tools` schema.
# NOTE(review): input_formatter="triple_quotes" presumably controls how the input
# text is delimited in the prompt — confirm against the Kor documentation.
chain = create_extraction_chain(llm, tools, input_formatter="triple_quotes")

In [39]:
# Run the chain on the job text, keep the entry stored under "data",
# and print it as JSON with sorted keys.
extraction = chain.predict_and_parse(text=text)
output = extraction["data"]
print(json.dumps(output, sort_keys=True))

{"skills": [{"skill": "app development"}, {"skill": "cyber security"}, {"skill": "digital growth strategies"}, {"skill": "enterprise cloud software"}, {"skill": "infrastructure management"}, {"skill": "application development technologies"}, {"skill": "dev-ops technologies"}, {"skill": "digital experience platforms"}, {"skill": "customer data platforms"}, {"skill": "CMS"}, {"skill": "mart-tech solution sets"}, {"skill": "analytics toolings"}, {"skill": "MEDDICC qualification framework"}, {"skill": "Challenger methodology"}, {"skill": "target account selling"}, {"skill": "solution selling"}, {"skill": "consultative sales techniques"}, {"skill": "partnerships"}, {"skill": "hyperscale cloud providers"}, {"skill": "AWS"}]}
