In [1]:
import os
from dotenv import load_dotenv

# Pull the settings stored in the .env file into the process environment
load_dotenv()

# Apify API token, read from the environment (None when the variable is unset)
APIFY_TOKEN = os.environ.get("APIFY_TOKEN")

## Run the Apify Actor live (alternatively, simply request the stored output instead)

In [None]:
from apify_client import ApifyClient

# Build the Apify API client, authenticated with the token loaded earlier
client = ApifyClient(APIFY_TOKEN)

# Actor input: a plain search term and a Google Jobs URL, separated by a newline
run_input = dict(
    queries="Teacher\nhttps://www.google.com/search?q=doctor&ibp=htl;jobs",
    maxPagesPerQuery=1,
    csvFriendlyOutput=False,
    languageCode="",
    maxConcurrency=10,
    saveHtml=False,
    saveHtmlToKeyValueStore=False,
    includeUnfilteredResults=False,
)

# Start the Actor and wait until the run has finished
actor = client.actor("SpK8RxKhIgV6BWOz9")
run = actor.call(run_input=run_input)

# Print every item found in the run's default dataset (if there are any).
# `item` stays bound to the last one after the loop and is reused by a later cell.
results = client.dataset(run["defaultDatasetId"])
for item in results.iterate_items():
    print(item)

In [5]:
def traverse_json(obj, indent=''):
    """Print a nested dict/list structure as an indented outline.

    Each dict key is printed as ``key:`` at the current indent and its value is
    printed two spaces deeper. List elements are printed two spaces deeper than
    the list itself. Anything else is printed as-is at the current indent.

    Args:
        obj: The value to print (dict, list, or any printable leaf value).
        indent: Whitespace prefix for the current nesting level.
    """
    deeper = indent + '  '

    if isinstance(obj, dict):
        for name, child in obj.items():
            print(f"{indent}{name}:")
            traverse_json(child, deeper)
        return

    if isinstance(obj, list):
        for element in obj:
            traverse_json(element, deeper)
        return

    # Leaf value: print it at the current level
    print(f"{indent}{obj}")

# `item` is the variable left bound by the dataset loop in the earlier cell (its last iteration)

traverse_json(item)


searchQuery:
  term:
    doctor
  page:
    1
  type:
    SEARCH
  domain:
    google.com
  countryCode:
    US
  languageCode:
    None
  locationUule:
    None
  resultsPerPage:
    10
url:
  https://www.google.com/search?q=doctor&ibp=htl;jobs
hasNextPage:
  True
googleJobs:
    title:
      Veterinarian Doctor
    companyName:
      Lap of Love
    location:
        Vancouver, WA   
    via:
      via BeBee
    description:
      Are you ready for a change of pace from in-clinic practice? Looking to reconnect with pet families and provide the kind of service you'd always envisioned by having generous appointment times and minimal administrative work? At Lap of Love, our mission is to provide support to pets and their families during one of life's most challenging moments by helping pets pass peacefully in the comfort of... their own home.
Established and owned by veterinarians, we recognize that pets are more than just animals — they're beloved family members. That's why we prioriti

In [7]:
# Peek at the first item yielded from the run's default dataset
next(client.dataset(run["defaultDatasetId"]).iterate_items())

{'searchQuery': {'term': 'Teacher',
  'page': 1,
  'type': 'SEARCH',
  'domain': 'google.com',
  'countryCode': 'US',
  'languageCode': None,
  'locationUule': None,
  'resultsPerPage': 10},
 'url': 'https://www.google.com/search?ibp=htl;jobs&q=Teacher',
 'hasNextPage': True,
 'googleJobs': [{'title': 'Child Care Teacher',
   'companyName': 'Bright Horizons Family Solutions',
   'location': '  Gaithersburg, MD   ',
   'via': 'via Bright Horizons Careers',
   'description': 'Imagine your future as a teacher with a world-class team where you make a difference for children every day. Imagine learning from experts in your field, and having the opportunity to earn your college degree – for free. Imagine it all as a Bright Horizons Teacher.\n\nFull-time positions now available with Infants and Toddlers at our Bright Horizons at Discovery Meadows location in Gaithersburg... Maryland.\n\nA Bright Horizons Career Includes:\n• Flexible scheduling\n• Medical, dental, and vision insurance\n• 401(k

## Request the output by calling REST

In [2]:
import os
import requests
import json
import boto3

FILE_NAME = "../data/output.json"

# Make sure the data folder exists; otherwise open(FILE_NAME, "w") below raises
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(FILE_NAME), exist_ok=True)

# Make the GET request.
# The token is sent in the Authorization header instead of the query string so it
# does not end up in URLs shown in logs or tracebacks, and the timeout keeps the
# cell from hanging indefinitely on a stalled connection.
response = requests.get(
    "https://api.apify.com/v2/datasets/mj6RWZ0pjBY2aLhmq/items",
    headers={"Authorization": f"Bearer {APIFY_TOKEN}"},
    timeout=60,
)

# Check if the request was successful
if response.status_code == 200:
    # Decode the response body as JSON
    data = response.json()

    # Save the JSON data to a file
    with open(FILE_NAME, "w") as file:
        json.dump(data, file)

    # Upload the JSON file to S3 bucket
    s3 = boto3.client("s3")  # Boto3 will automatically search for credentials
    bucket_name = "job-recommender-bucket-yusa"
    # NOTE(review): the local relative path (including "../") is also used as the
    # S3 object key — confirm that this key is what downstream readers expect.
    s3.upload_file(FILE_NAME, bucket_name, FILE_NAME)
else:
    # `data` is left undefined in this branch, so later cells that read it will fail
    print("Error: Failed to retrieve data")

## Peek at the returned JSON

In [3]:
len(data)

4

In [5]:
data[0]

{'searchQuery': {'term': 'data science jobs in Canada',
  'page': 1,
  'type': 'SEARCH',
  'domain': 'google.com',
  'countryCode': 'US',
  'languageCode': None,
  'locationUule': None,
  'resultsPerPage': 10},
 'url': 'https://www.google.com/search?q=data+science+jobs+in+Canada&oq=google+jobs&gs_lcrp=EgZjaHJvbWUqCggCEAAYsQMYgAQyDggAEEUYJxg7GIAEGIoFMgYIARBFGEAyCggCEAAYsQMYgAQyCggDEAAYkgMYgAQyCggEEAAYsQMYgAQyBggFEEUYPDIGCAYQRRhBMgYIBxBFGEHSAQg0MTM3ajBqN6gCALACAA&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&ved=2ahUKEwiSkb2Ov4uDAxWRIDQIHWImCI0QudcGKAF6BAggECo&sxsrf=AM9HkKnL3kWnQdaFMTTB0VMmiGhqogChaA:1702438898722#fpstate=tldetail&htivrt=jobs&htidocid=5Au9kxSU8z87m4ltAAAAAA%3D%3D',
 'hasNextPage': True,
 'googleJobs': [{'title': 'Data Scientist',
   'companyName': 'Teranet Inc.',
   'location': '  Toronto, ON, Canada   ',
   'via': 'via LinkedIn',
   'description': 'Who We Are\n\nTeranet is Canada’s leader in the delivery and transformation of statutory registry services with extensive expe

In [7]:
# Job postings of the element at index 1 of `data` (the cell above displayed data[0]).
# NOTE(review): the name says "page 1" but index 1 is the second element — confirm which one is meant.
page_1_jobs = data[1]["googleJobs"]

In [8]:
len(page_1_jobs)

10

In [9]:
page_1_jobs[0].keys()

dict_keys(['title', 'companyName', 'location', 'via', 'description', 'jobHighlights', 'relatedLinks', 'thumbnail', 'extras', 'metadata', 'applyLink'])

In [13]:
job_0 = page_1_jobs[0]
job_0

{'title': 'Data Scientist II (Canada Remote)',
 'companyName': 'Fullscript',
 'location': '  Ottawa, ON, Canada   ',
 'via': 'via Work In Tech Job Board - Communitech',
 'description': "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.\n\nWith over 5 million

In [34]:
job_0["extras"][0]

'Full-time'

In [30]:
job_0['applyLink']['link']

'https://www1.communitech.ca/companies/fullscript/jobs/31257911-data-scientist-ii-canada-remote?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic'

### Note that the description and jobHighlights have the same information

In [28]:
print(job_0['description'])

Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.

We know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.

Fullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.

With over 5 million patients served and 70,000+ practitioners supported across North America, we’re just getting started.

Come build a healthier future with us.

The Data team is a key strategic partner at... F

In [26]:
# First item of the first jobHighlights entry, printed for comparison with job_0['description']
test_description = job_0["jobHighlights"][0]["items"][0]
print(test_description)

Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.

We know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.

Fullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.

With over 5 million patients served and 70,000+ practitioners supported across North America, we’re just getting started.

Come build a healthier future with us.

The Data team is a key strategic partner at... F

## Read the temporary local data

In [None]:
import json
import pprint

# Same path the REST cell writes its JSON dump to
file_path = "../data/output.json"

# Load the file contents back into `data`
with open(file_path) as fh:
    data = json.load(fh)

# Pretty-print everything that was loaded
pprint.pprint(data)


In [None]:
def apify_get_clean():
    """Collect the job postings of every scraped page into one flat list.

    Calls ``run_actor()`` (not defined in the cells shown here), reads the
    ``"googleJobs"`` list of each page it returns, and concatenates them.

    Returns:
        list: All job postings, in page order.
    """
    batch_jobs = [
        posting
        for page in run_actor()
        for posting in page["googleJobs"]
    ]

    print(len(batch_jobs), "been scraped")
    return batch_jobs

In [8]:
job_0 = data[1]["googleJobs"][0]
job_0

{'title': 'Data Scientist II (Canada Remote)',
 'companyName': 'Fullscript',
 'location': '  Ottawa, ON, Canada   ',
 'via': 'via Work In Tech Job Board - Communitech',
 'description': "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.\n\nWith over 5 million

In [9]:
job_0.keys()

dict_keys(['title', 'companyName', 'location', 'via', 'description', 'jobHighlights', 'relatedLinks', 'thumbnail', 'extras', 'metadata', 'applyLink'])

In [12]:
def filter_full_job_info(job):
    """Reduce a scraped job posting to the fields kept for downstream use.

    Args:
        job: Job dict containing at least ``title``, ``companyName``,
            ``location``, ``via``, ``description``, ``metadata`` (a dict) and
            ``applyLink`` (a dict with a ``link`` entry).

    Returns:
        dict: The five text fields copied unchanged, ``metadata`` as the
        concatenation of the metadata dict's values (no separator), and
        ``applyLink`` as the bare link.
    """
    copied_fields = ("title", "companyName", "location", "via", "description")

    slim = {}
    for field in copied_fields:
        slim[field] = job[field]

    metadata = job["metadata"]
    apply_url = job["applyLink"]["link"]

    # The metadata values are glued together directly, without a delimiter
    slim["metadata"] = "".join(metadata.values())
    slim["applyLink"] = apply_url

    return slim

In [13]:
filter_full_job_info(job_0)

{'title': 'Data Scientist II (Canada Remote)',
 'companyName': 'Fullscript',
 'location': '  Ottawa, ON, Canada   ',
 'via': 'via Work In Tech Job Board - Communitech',
 'description': "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.\n\nWith over 5 million

In [10]:
# Serialize the dict to a JSON string for embedding

job_0_str = json.dumps(job_0)
print(job_0_str)


{"title": "Data Scientist II (Canada Remote)", "companyName": "Fullscript", "location": "  Ottawa, ON, Canada   ", "via": "via Work In Tech Job Board - Communitech", "description": "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript\u2019s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare\u2019s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can\u2019t be found anywhere else.\n\nWith ove

# Langchain parsing

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [12]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate

# NOTE(review): `llm` is not referenced again in the cells visible here (the chain
# below builds its own ChatOpenAI instance) — confirm it is still needed.
llm = OpenAI(temperature=0.9, max_tokens=3000)

In [43]:
import json

# System prompt listing the keys to extract. It is assembled from adjacent string
# literals rather than backslash continuations inside a single literal, so the
# source-code indentation is not sent to the model as part of the prompt.
system_message_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        template=(
            "For the following JSON object, please help me parse the following information and "
            "save it into a JSON object with the following keys "
            "(job_title, company_name, location, source, "
            "full_text_description, job_type (full_time / part_time / contract / ...), link). "
            "Respond with the JSON object only, without any surrounding text."
        ),
        input_variables=[],
    )
)

# Human message carrying the scraped job posting itself
human_message_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        template="The following is the scraped job description JSON object: {json}",
        input_variables=["json"],
    )
)
chat_prompt_template = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# temperature=0: this is an extraction task, so the output should be as
# repeatable as possible rather than creative.
chat = ChatOpenAI(temperature=0)
chain = LLMChain(llm=chat, prompt=chat_prompt_template)

# Serialize the dict so the model receives real JSON (double quotes, null) rather
# than the Python repr that formatting a dict into the template would produce.
parsed_jd = chain.run(json=json.dumps(job_0, ensure_ascii=False))

print(parsed_jd)

The parsed job description JSON object can be transformed into the following format:

{
   "job_title": "Data Scientist II (Canada Remote)",
   "company_name": "Fullscript",
   "location": "Ottawa, ON, Canada",
   "source": "via Work In Tech Job Board - Communitech",
   "full_text_description": "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an 

## Load parsed job info to DynamoDB

In [34]:
DynamoDB_TABLE_NAME = "jobs-db-yusa"

In [37]:
## Explore other saving options
import boto3

# Create a DynamoDB client
dynamodb = boto3.client("dynamodb")

# Define the table name
table_name = DynamoDB_TABLE_NAME

# Item to be saved, in the low-level attribute-value form where {"S": ...} marks a
# string. It gets its own name so that the scraped `data` list, which other cells
# index (e.g. data[1]["googleJobs"]), is not overwritten by this dict.
job_item = {
    "id": {"S": "1"},
    "job_title": {"S": "Data Scientist II (Canada Remote)"},
    "company_name": {"S": "Fullscript"},
    "location": {"S": "Ottawa, ON, Canada"},
    "source": {"S": "via Work In Tech Job Board - Communitech"},
    "full_text_description": {
        "S": "Embracing a new era in patient care... [truncated for brevity]"
    },
    "job_type": {"S": "Full-time"},
    "link": {
        "S": "https://www1.communitech.ca/companies/fullscript/jobs/31257911-data-scientist-ii-canada-remote?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic"
    },
}


# Save the item to DynamoDB
response = dynamodb.put_item(TableName=table_name, Item=job_item)

# Print the response
print(response)

{'ResponseMetadata': {'RequestId': 'IM9MVTOSTS020M2BNFP47219TFVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'Server', 'date': 'Fri, 05 Jan 2024 04:12:44 GMT', 'content-type': 'application/x-amz-json-1.0', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': 'IM9MVTOSTS020M2BNFP47219TFVV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2745614147'}, 'RetryAttempts': 0}}


In [30]:
parsed_jd_temp = {
    "job_title": "Data Scientist II (Canada Remote)",
    "company_name": "Fullscript",
    "location": "Ottawa, ON, Canada",
    "source": "via Work In Tech Job Board - Communitech",
    "full_text_description": "Embracing a new era in patient care, Fullscript is a trailblazing health technology company changing the way health is prescribed.\n\nWe know the conventional model of patient care is ripe for innovation. At Fullscript, our work is guided by our mission of creating meaningful change for the future of our healthcare systems, so that integrative medicine will one day simply be called medicine.\n\nFullscript’s care delivery platform combines customized treatment planning, wellness protocols, data insights and ongoing wellness support, with the ability to prescribe and manage the use of healthcare’s best supplements all in one place. Above all else, Fullscript is trusted by practitioners due to an unrelenting focus on quality and a patient experience that can’t be found anywhere else.\n\nWith over 5 million patients served and 70,000+ practitioners supported across North America, we’re just getting started.\n\nCome build a healthier future with us.\n\nThe Data team is a key strategic partner at... Fullscript, responsible for providing analytical support for all internal functions to drive effective planning and decision making. 
Reporting to a Technical Lead, the Data Scientist will perform in-depth exploration and analysis of company data in order to guide goals, improve processes, and help our leaders understand and track key metrics.\nWhat you'll do\n• Partner with key stakeholders to dive deep into the business challenges and identify insights to drive company strategies\n• Build statistical models and partner with our machine learning team to productionize impactful machine learning solutions\n• Improve data availability and data literacy across the organization\n• Collaborate with the Analytics Engineering team to improve the availability of reliable data and automation of recurring business needs\n• Partner with stakeholders to create meaningful goals and targets for both team and company-level initiatives\n• Support, facilitate, and monitor strategic objectives through the creation of data science models, segmentation, and analytical testing\n• Guide and advise business functions on the collection of data to broaden our visibility into company activities\n• Proactively contribute to our shared documentation store and self-serve Business Intelligence artifacts\n• Proactively watch for opportunities to improve code quality and scalability within our codebase\n• Define, design, and develop dashboards and artifacts for key metrics and KPIs using various data languages and Business Intelligence systems\n• Other duties may be assigned as determined by the Company from time to time\nWhat you bring to the table\n• Minimum Bachelor’s Degree in a quantitative, analytical, or similar field with 2+ years of relevant experience successfully working in a data role using relational databases\n• Experience building Statistical and Machine Learning models using a programming language like Python.\n• Experience working with Business Intelligence systems and tools (BI tooling, databases, spreadsheets)\n• Proficiency in SQL including basic transformations\n• Creative problem-solving skills with a 
passion for uncovering strategic opportunities\n• Proven ability to work independently in a fast-paced environment, managing competing priorities on multiple simultaneous projects\n• Demonstrated attention to detail\n• Strong written and verbal communication skills with the ability to effectively communicate complex ideas and concepts to both technical and non-technical teams\nBonus if you have:\n• Experience with Snowflake, MySQL\n• Experience with Looker\n• Experience with Python\n• Experience with HTML/CSS\n• Experience in B2B or SaaS, working in a high-growth environment\n• Interest or experience in Health Tech, Integrative Medicine, or supplements\nWhat we can offer you\n• Generous PTO and competitive pay\n• Fullscript’s RRSP match program for financial health\n• Flexible benefits package and workplace wellness program\n• Training budget and company-wide learning initiatives\n• Discount on Fullscript catalog of products\n• Ability to work Wherever You Work Well*\n• Our Wherever You Work Well philosophy means Fullscript teammates get to pick their own office — whether that’s in-office, at home, or a bit of both 🐶🏡\n\nFullscript is committed to diversity in its workforce and is proud to be an equal opportunity employer. We are excited to work with talented people, period. All employment decisions are based on business needs, job requirements, and individual qualifications, without regard to race, color, religion or belief, national or ethnic origin, gender, age, disability, sexual orientation, gender identity and/or expression, marital or civil status, political affiliation, family or parental status, or any other status protected by the laws or regulations in the jurisdictions in which we operate.\n\nAccommodations are available on request for candidates taking part in all aspects of the selection process. 
Please send an email to accommodations@fullscript.com and let us know the nature of your request and your contact information.\n\nOur team handles a lot of sensitive information, which means we require all candidates that receive and accept employment offers to complete a background check before being hired",
    "job_type": "Full-time",
    "link": "https://www1.communitech.ca/companies/fullscript/jobs/31257911-data-scientist-ii-canada-remote?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic",
}

In [33]:
list(parsed_jd_temp.keys())

['job_title',
 'company_name',
 'location',
 'source',
 'full_text_description',
 'job_type',
 'link']

In [13]:
from openai import OpenAI
# The OpenAI client gets a dedicated name: `client` is also bound to the Apify,
# boto3 and OpenSearch clients in other cells, and get_embedding() would silently
# start calling the wrong object once any of those cells is (re-)run.
openai_client = OpenAI()


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector of `text` as computed by the OpenAI API.

    Args:
        text: String to embed; newlines are replaced by spaces before sending.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first (and only) entry in the
        response data.
    """
    text = text.replace("\n", " ")
    return openai_client.embeddings.create(input=[text], model=model).data[0].embedding


job_0_emb = get_embedding(job_0_str)

In [15]:
len(job_0_emb)

1536

In [43]:
job_0_emb

[0.002754964167252183,
 0.006326977163553238,
 0.0019464981742203236,
 -0.02489595115184784,
 -0.014130134135484695,
 0.03880637511610985,
 -0.01592901535332203,
 -0.007923311553895473,
 0.00048190419329330325,
 -0.037680357694625854,
 0.01792014203965664,
 -0.003410662990063429,
 -0.022053446620702744,
 -0.027189180254936218,
 8.732635615160689e-05,
 0.010559838265180588,
 0.03427484631538391,
 -0.046990592032670975,
 -0.005496196448802948,
 -0.01728847436606884,
 -0.02057039923965931,
 0.0010513493325561285,
 0.00864423718303442,
 0.013354281894862652,
 -0.00864423718303442,
 -0.0016323806485161185,
 -0.00306908180937171,
 -0.018057459965348244,
 -0.016189921647310257,
 -0.004720343742519617,
 0.0036698526237159967,
 -0.01495404914021492,
 -0.0013611754402518272,
 -0.027793383225798607,
 -0.00828720722347498,
 -0.00948874931782484,
 -0.003513652365654707,
 0.005266187246888876,
 0.018373293802142143,
 -0.010607900097966194,
 0.014720606617629528,
 0.042376670986413956,
 -0.0015645794

## Load parsed job info to OpenSearch

In [3]:
import boto3
import requests
from requests_aws4auth import AWS4Auth
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

region = "us-east-2"  # e.g. us-west-1
service = "es"

# Take the access key, secret key and session token from the SAME boto3 credential
# set. Combining keys read from unrelated environment variables with the boto3
# session token produces a signature the service rejects (403 invalid security token).
credentials = boto3.Session().get_credentials()
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found by boto3; cannot sign OpenSearch requests"
    )
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

# The OpenSearch domain endpoint
host = "search-ui-test-j4slt3fz7uu5ahsm6vgu3hwqem.us-east-2.es.amazonaws.com"  # e.g. https://search-mydomain.us-west-1.es.amazonaws.com

# # The index and document type
# index = "jb-index"
# type = "_doc"

# # Initialize OpenSearch client
# es = Elasticsearch(
#     hosts=[{"host": host, "port": 443}],
#     http_auth=awsauth,
#     # use_ssl=True,
#     verify_certs=True,
#     node_class="requests"
#     # connection_class=RequestsHttpConnection,
# )

# # JSON document you want to save
# document = {"id": "1", "title": "Example title", "content": "Sample content"}

# # Save the document in OpenSearch
# response = es.index(index=index_name, id=document["id"], body=document)
# print(response)

In [23]:
# Describe one OpenSearch domain by name through the boto3 "opensearch" client.
# NOTE(review): this rebinds `client`, and "vs-test-domain" does not appear in the
# `host` endpoint configured earlier — confirm the domain name.
client = boto3.client("opensearch")
response = client.describe_domains(
    DomainNames=[
        "vs-test-domain",
    ]
)
print(response)

{'ResponseMetadata': {'RequestId': 'e7f8cfa0-2c29-4d1b-8756-d39fcd2e08b3', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e7f8cfa0-2c29-4d1b-8756-d39fcd2e08b3', 'content-type': 'application/json', 'content-length': '23', 'date': 'Mon, 08 Jan 2024 04:14:24 GMT'}, 'RetryAttempts': 0}, 'DomainStatusList': []}


In [4]:
# Rebind `client` to an OpenSearch client for `host` on port 443 (SSL with
# certificate verification), authenticating each request with `awsauth`.
client = OpenSearch(
    hosts=[{"host": host, "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [5]:
index_name = "python-test-index-2"

client.indices.exists(index=index_name)

True

In [21]:
def create_index(index_name):
    """Create `index_name` with a single primary shard unless it already exists.

    Prints the creation response, or a notice when the index is already there.
    Any exception raised by the client calls is caught and printed instead of
    being propagated, so the function always returns None.

    Args:
        index_name: Name of the index to create.
    """
    # Shard sizing notes, taken from
    # https://docs.aws.amazon.com/opensearch-service/latest/developerguide/sizing-domains.html#bp-sharding
    #   - aim for 10-30GB per shard when search latency is the objective,
    #     30-50GB for write-heavy jobs like log analytics
    #   - default layout: 5 primary shards, each with 1 replica (10 shards total)
    #   - a replica shard is a copy of a primary shard
    #   - shards are distributed amongst nodes for resilience
    #     (index > shards > multiple nodes, assuming more than one node)
    #   - lower shard count == faster reads, higher shard count == faster writes
    settings = {"settings": {"index": {"number_of_shards": 1}}}

    try:
        if not client.indices.exists(index=index_name):
            creation = client.indices.create(index_name, body=settings)
            print("\nCreating index:")
            print(creation)
        else:
            print(f"Index {index_name} already exists")
    except Exception as err:
        print(err)


create_index(index_name)
print()

AuthorizationException(403, '{"message":"The security token included in the request is invalid."}')



In [6]:
# Add a document to the index.
document = {"title": "Moneyball", "director": "Bennett Miller", "year": "2011"}
# Named `doc_id` rather than `id` so the built-in id() is not shadowed for the
# rest of the kernel session.
doc_id = "1"

# refresh=True is passed so the write is refreshed as part of this request
response = client.index(index=index_name, body=document, id=doc_id, refresh=True)

print("\nAdding document:")
print(response)


Adding document:
{'_index': 'python-test-index-2', '_id': '1', '_version': 2, 'result': 'updated', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}


## Pinecone vector DB 

In [42]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")

In [40]:
import pinecone

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

NameError: name 'PINECONE_API_KEY' is not defined

In [21]:
PINECONE_INDEX = "jd-index"

In [22]:
# Only create the index when it is missing: creating an index that already exists
# fails, which would break this cell on every re-run.
if PINECONE_INDEX not in pinecone.list_indexes():
    # 1536 is the length of the embedding produced earlier (len(job_0_emb))
    pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine")
pinecone.describe_index(PINECONE_INDEX)

IndexDescription(name='jd-index', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [23]:
index = pinecone.Index(PINECONE_INDEX)

In [45]:
index.upsert(
    vectors=[
        {"id": "job_0", "values": job_0_emb},
    ],
    namespace="ns1",
)

{'upserted_count': 1}

In [29]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 1e-05,
 'namespaces': {'ns1': {'vector_count': 1}},
 'total_vector_count': 1}

In [None]:
# The query vector must have the same dimension as the index (1536). The
# 8-element placeholder vector used before cannot be matched against it, so the
# stored job embedding is used as the query instead.
index.query(
    namespace="ns1",
    vector=job_0_emb,
    top_k=3,
    include_values=True,
)

In [None]:
# NOTE(review): no index named "quickstart" is created in this notebook (the index
# created above is PINECONE_INDEX = "jd-index") — confirm which index this cleanup
# is meant to delete before running it.
pinecone.delete_index("quickstart")