In [2]:
import os
from dotenv import load_dotenv

# Load variables from .env file
load_dotenv()

# Access the variables
APIFY_TOKEN = os.getenv("APIFY_TOKEN")

## Live running the Apify, could simiply request the output instead

In [None]:
from apify_client import ApifyClient

# Initialize the ApifyClient with your API token
client = ApifyClient(APIFY_TOKEN)

# Prepare the Actor input
run_input = {
    "queries": "Teacher\nhttps://www.google.com/search?q=doctor&ibp=htl;jobs",
    "maxPagesPerQuery": 1,
    "csvFriendlyOutput": False,
    "languageCode": "",
    "maxConcurrency": 10,
    "saveHtml": False,
    "saveHtmlToKeyValueStore": False,
    "includeUnfilteredResults": False,
}

# Run the Actor and wait for it to finish
run = client.actor("SpK8RxKhIgV6BWOz9").call(run_input=run_input)

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

In [5]:
def traverse_json(obj, indent=''):
    if isinstance(obj, dict):
        for key, value in obj.items():
            print(f"{indent}{key}:")
            traverse_json(value, indent + '  ')
    elif isinstance(obj, list):
        for item in obj:
            traverse_json(item, indent + '  ')
    else:
        print(f"{indent}{obj}")

# Replace 'json_object' with the actual JSON object you want to traverse

traverse_json(item)


searchQuery:
  term:
    doctor
  page:
    1
  type:
    SEARCH
  domain:
    google.com
  countryCode:
    US
  languageCode:
    None
  locationUule:
    None
  resultsPerPage:
    10
url:
  https://www.google.com/search?q=doctor&ibp=htl;jobs
hasNextPage:
  True
googleJobs:
    title:
      Veterinarian Doctor
    companyName:
      Lap of Love
    location:
        Vancouver, WA   
    via:
      via BeBee
    description:
      Are you ready for a change of pace from in-clinic practice? Looking to reconnect with pet families and provide the kind of service you'd always envisioned by having generous appointment times and minimal administrative work? At Lap of Love, our mission is to provide support to pets and their families during one of life's most challenging moments by helping pets pass peacefully in the comfort of... their own home.
Established and owned by veterinarians, we recognize that pets are more than just animals — they're beloved family members. That's why we prioriti

In [7]:
next(client.dataset(run["defaultDatasetId"]).iterate_items())

{'searchQuery': {'term': 'Teacher',
  'page': 1,
  'type': 'SEARCH',
  'domain': 'google.com',
  'countryCode': 'US',
  'languageCode': None,
  'locationUule': None,
  'resultsPerPage': 10},
 'url': 'https://www.google.com/search?ibp=htl;jobs&q=Teacher',
 'hasNextPage': True,
 'googleJobs': [{'title': 'Child Care Teacher',
   'companyName': 'Bright Horizons Family Solutions',
   'location': '  Gaithersburg, MD   ',
   'via': 'via Bright Horizons Careers',
   'description': 'Imagine your future as a teacher with a world-class team where you make a difference for children every day. Imagine learning from experts in your field, and having the opportunity to earn your college degree – for free. Imagine it all as a Bright Horizons Teacher.\n\nFull-time positions now available with Infants and Toddlers at our Bright Horizons at Discovery Meadows location in Gaithersburg... Maryland.\n\nA Bright Horizons Career Includes:\n• Flexible scheduling\n• Medical, dental, and vision insurance\n• 401(k

## Request the output by calling REST

In [3]:
import os
import requests
import json
import boto3

# Make sure the data folder exists
data_folder = "data"
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Make the GET request
response = requests.get(
    f"https://api.apify.com/v2/datasets/mj6RWZ0pjBY2aLhmq/items?token={APIFY_TOKEN}"
)

# Check if the request was successful
if response.status_code == 200:
    # Save the response content as JSON
    data = response.json()

    FILE_NAME = "data/output.json"
    # Save the JSON data to a file
    with open(FILE_NAME, "w") as file:
        json.dump(data, file)

    # Upload the JSON file to S3 bucket
    s3 = boto3.client("s3")  # Boto3 will automatically search for credentials
    bucket_name = "job-recommender-bucket-yusa"
    s3.upload_file(FILE_NAME, bucket_name, FILE_NAME)
else:
    print("Error: Failed to retrieve data")

## Peak the returned JSON

In [6]:
len(data)

4

In [7]:
page_1_jobs = data[1]["googleJobs"]

In [8]:
len(page_1_jobs)

10

In [9]:
page_1_jobs[0].keys()

dict_keys(['title', 'companyName', 'location', 'via', 'description', 'jobHighlights', 'relatedLinks', 'thumbnail', 'extras', 'metadata', 'applyLink'])

In [13]:
job_0 = page_1_jobs[0]
job_0

{'title': 'Data Scientist II (Canada Remote)',
 'companyName': 'Fullscript',
 'location': '  Ottawa, ON, Canada   ',
 'via': 'via Work In Tech Job Board - Communitech',
 'description': "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.\n\nWith over 5 million

In [34]:
job_0["extras"][0]

'Full-time'

In [30]:
job_0['applyLink']['link']

'https://www1.communitech.ca/companies/fullscript/jobs/31257911-data-scientist-ii-canada-remote?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic'

### Note that the description and jonHighlights have the same information

In [28]:
print(job_0['description'])

Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.

We know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.

Fullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.

With over 5 million patients served and 70,000+ practitioners supported across North America, we’re just getting started.

Come build a healthier future with us.

The Data team is a key strategic partner at... F

In [26]:
test_description = job_0["jobHighlights"][0]["items"][0]
print(test_description)

Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.

We know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.

Fullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.

With over 5 million patients served and 70,000+ practitioners supported across North America, we’re just getting started.

Come build a healthier future with us.

The Data team is a key strategic partner at... F

## Explorer other saving options

In [None]:
import boto3

# Create a DynamoDB client
dynamodb = boto3.client('dynamodb')

# Define the table name
table_name = 'your_table_name'

# Define the dictionary to be saved
data = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}

# Save the dictionary to DynamoDB
response = dynamodb.put_item(
    TableName=table_name,
    Item=data
)

# Print the response
print(response)


# Langchain parsing

In [36]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [41]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate

llm = OpenAI(temperature=0.9, max_tokens=3000)

In [43]:
system_message_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        template="For the following json objet, please help me parse the following infotmation and \
            save into a json object with the following keys \
                (job_title, company_name, location, source, \
                    full_text_description, job_type(full_time/ part_time/ contract/ ...), link)",
        input_variables=[],
    )
)

human_message_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        template="The following is the parsed job description json object {json}?",
        input_variables=["json"],
    )
)
chat_prompt_template = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
chat = ChatOpenAI(temperature=0.9)
chain = LLMChain(llm=chat, prompt=chat_prompt_template)
parsed_jd = chain.run(json=job_0)

print(parsed_jd)

The parsed job description JSON object can be transformed into the following format:

{
   "job_title": "Data Scientist II (Canada Remote)",
   "company_name": "Fullscript",
   "location": "Ottawa, ON, Canada",
   "source": "via Work In Tech Job Board - Communitech",
   "full_text_description": "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an 