In [37]:
import spacy
import openai
import json
import os
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_random_exponential
from random import sample, seed
import re

In [3]:
# Load spaCy's small English pipeline; later cells add a custom NER component to it.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Add an extra "ner" component to the pipeline under the name "custom_ner"
# and register TECHNOLOGY as a label it can predict.
custom_ner = nlp.add_pipe("ner", name="custom_ner")
custom_ner.add_label("TECHNOLOGY")

1

In [5]:
# Load variables from a .env file into the environment; they are read further down with os.getenv.
load_dotenv()

True

## Before sending anything to the OpenAI API
Set up a binary classifier that decides whether a given paragraph is relevant to the task, i.e. whether it contains any technologies/skills that I might want to extract.
### First step: grab a bunch of paragraphs from job descriptions and label a few hundred as relevant or not.

In [24]:
# Empty container for the job-description strings; a later cell populates it from `data`.
job_desc_list = []
# Parse the raw JSON file into `data`; the `with` block closes the file once loaded.
with open(fr"data/raw_data-09-09-23.json") as f:
    data = json.load(f)

In [27]:
# Number of job descriptions collected.
# NOTE(review): the execution counts show this cell ran after the loop cell below;
# on a fresh top-to-bottom run the list is still empty here.
len(job_desc_list)

272

In [25]:
# Rebuild the list from scratch instead of appending, so re-running this cell
# cannot duplicate every description.
job_desc_list = [
    entry['desc']
    for key, entry in data.items()
    if not key.startswith("metadata")  # keys starting with "metadata" are skipped
]

In [26]:
# Half of the collected descriptions: the size of the subset sampled for labelling below.
len(job_desc_list)/2

136.0

In [28]:
# Not many jobs in total here, so randomly sample half of them to label.
# The seed is fixed for reproducibility.
seed(10)
jobs_to_label = sample(job_desc_list, len(job_desc_list) // 2)

In [38]:
# print(jobs_to_label[85])

In [51]:
def split_on_empty_lines(text, min_newlines=1):
    """Split `text` into chunks at runs of consecutive newlines.

    Leading and trailing whitespace is stripped from `text` first. A separator
    is a run of at least `min_newlines` line breaks (``\\n`` or ``\\r\\n``).

    With the default of 1 every line break is a split point. Pass
    ``min_newlines=2`` to split only where at least one truly empty line
    separates two paragraphs. Only the line-break characters themselves are
    matched, so a "blank" line that contains spaces interrupts a run.

    Args:
        text: The string to split.
        min_newlines: Minimum number of consecutive line breaks that count as
            a separator. Must be >= 1.

    Returns:
        list[str]: The chunks between separators, in order. An empty input
        yields ``[""]``.

    Raises:
        ValueError: If `min_newlines` is less than 1.
    """
    if min_newlines < 1:
        raise ValueError("min_newlines must be at least 1")
    # Non-capturing group so re.split does not return the separators themselves.
    newline_run_regex = r"(?:\r?\n){%d,}" % min_newlines
    return re.split(newline_run_regex, text.strip())

In [53]:
# Try the splitter on the first sampled job description and inspect the resulting chunks.
split_on_empty_lines(jobs_to_label[0])

['Collaborate closely with cross-functional teams including data scientists, product managers, and software developers to understand requirements and objectives for building a benchmark chat application. ',
 ' Leverage your in-depth knowledge of the Google Cloud Platform and associated tools to design, develop, and deploy scalable and reliable solutions. ',
 ' Utilize Google Gen AI App Builder to create a chat application that showcases the capabilities of AI-powered conversation models. ',
 ' Work with existing data sets to ensure consistent and relevant input for the chat application, ensuring optimal performance and benchmarking accuracy. ',
 ' ',
 ' Qualifications and Skills: ',
 ' ',
 ' Data engineer or software engineer who must be hands on with GCP platform and tools, and understand Gen AI technologies (Google Gen AI App Builder). ',
 ' Use Google Gen AI App Builder to build benchmark chat based on same data sets ',
 ' Deep understanding of General Artificial Intelligence (Gen A

In [36]:
# Split the third sampled description on double newlines, trim each chunk,
# drop the empty ones, and pair every remaining chunk with a single space.
[(chunk, " ") for chunk in map(str.strip, jobs_to_label[2].split("\n\n")) if chunk]

[("Extreme Networks Named to Computerworld’s 2023 List of Best Places to Work in IT! \n \n \n   Over 50,000 customers globally trust our end-to-end, cloud-driven networking solutions and rely on our top-rated services and support to accelerate their digital transformation efforts and deliver progress like never before and with double digit growth year over year, no provider is better positioned to deliver better outcomes on scale, than Extreme.\n  \n \n   We believe in \n   “walking the walk”  of our strong core values which enable us to successfully advance together. Diversity and Inclusion is a vital part of our values and beliefs, and we’re proud to foster an environment where every Extreme employee can thrive.\n  \n \n   Come become part of something big with us! We are a global leader, with hubs in North America, South America, Asia Pacific, Europe, and the Middle East.\n  \n \n \n  The Systems Engineer will be an integral member supporting the Sales Account Executive team in a Pr

### Second step is to featurize these labeled data

In [20]:
# Read the API key and the two few-shot example texts from the environment.
# Each value is None when its variable is not set.
api_key = os.environ.get("openai_api_key")
example_text_1 = os.environ.get("example_text_1")
example_text_2 = os.environ.get("example_text_2")

In [12]:
# Originally from getTechs.py
def get_techs(text):
    """Ask gpt-3.5-turbo to list the specific technologies named in `text`.

    The conversation is few-shot: a system instruction, then two worked
    examples built from the module-level `example_text_1` and
    `example_text_2`, then the real request for `text`.

    Returns:
        The complete response object from `openai.ChatCompletion.create`,
        not just the message text.
    """
    system_message = {"role":"system", "content":"You identify specific technology names from job descriptions.  Respond only with a list of the names of the specific technologies."}
    # Worked examples showing the model the expected comma-separated answer format.
    few_shot_messages = [
        {"role":"user", "content":f"Identify the specific technologies in the following text and respond with them in a list: {example_text_1}"},
        {"role":"assistant", "content":"llama 2, python, pytorch"},
        {"role":"user", "content":f"Identify the specific technologies in the following text: {example_text_2}"},
        {"role":"assistant", "content":"python, spark, sql, snowflake, tableau, aws, azure, power bi"},
    ]
    request_message = {"role":"user", "content":f"Identify the specific technologies in the following text: {text}"}
    conversation = [system_message, *few_shot_messages, request_message]
    return openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation)

In [9]:
def dict_to_json(dict, filepath):
    """Serialize `dict` as JSON into the file at `filepath`.

    The file is opened in write mode, so existing content is replaced.
    """
    # NOTE: the parameter name shadows the builtin `dict`; it is kept as-is so
    # existing callers are unaffected.
    json_file = open(filepath, "w")
    with json_file:
        json.dump(dict, json_file)

In [None]:
# Originally from getTechs.py
# Modified to remove counter, use to just get my spacy training data.

def print_attempt_number(retry_state):
    """Print the attempt number; passed to tenacity as the `after` hook."""
    print(f"Retrying: {retry_state.attempt_number}...")


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), after=print_attempt_number)
def _get_techs_with_retry(job_desc):
    """Call `get_techs` for one job description, with backoff and up to 6 attempts.

    Retrying at the level of a single request means a rate-limit error only
    repeats that request, not every request already made for the file.
    """
    return get_techs(job_desc)


def get_tech_list(filepath):
    """Annotate every job in a JSON file with the technologies OpenAI finds in it.

    Loads the JSON object at `filepath` and, for each entry whose key does not
    start with "metadata", sends the entry's 'desc' text to the API and stores
    the lower-cased technology names under the entry's 'techs' key. The result
    is written next to the original as ``p-<original file name>``; the
    original file is left untouched.

    Args:
        filepath: Path to the JSON file to process.

    Raises:
        tenacity.RetryError: If a single API request still fails after 6 attempts.
        OSError, KeyError, json.JSONDecodeError: File or schema problems are
            raised immediately rather than retried, because retrying cannot
            fix them.
    """
    # The `with` block closes the input file as soon as it has been parsed.
    with open(filepath) as f:
        data = json.load(f)
    for key in list(data.keys()):
        # Skip metadata entries, matching how the loading cell treats them.
        if key.startswith("metadata"):
            continue
        job_desc = data[key]['desc']
        response = _get_techs_with_retry(job_desc) # keep whole object in case I want metadata from it later
        content = response["choices"][0]["message"]["content"]
        # Split on commas and trim, so "a,b" and "a, b" parse alike and blanks are dropped.
        data[key]['techs'] = [tech.strip().lower() for tech in content.split(",") if tech.strip()]
    # dict_to_json originally from jobSearch.py, use to indicate which files have already been parsed
    # Right now saves file separately, keeping original.  When done with testing will want it to overwrite.  Then it won't matter if the original is saved or not as we'll delete it anyways.
    # Prefix the file name, not the whole path: "data/x.json" -> "data/p-x.json".
    directory, filename = os.path.split(filepath)
    dict_to_json(data, os.path.join(directory, f"p-{filename}"))

In [None]:
# for filename in os.listdir("data"):
#     if filename.startswith("p-"): #already been parsed
#         continue
#     else:
#         filepath = fr"data/{filename}"
#         get_tech_list(filepath) # remember this saves the files separately, keeping the original for now.
        

The cell above stores the technologies identified by OpenAI as a list in `data[job_id]['techs']` and saves the result to a `p-`-prefixed JSON file. That annotated data can then be used to train spaCy.

In [28]:
# Test with only the first JSON file for now.
# This run hit the OpenAI API rate limit; the RetryError below wraps a RateLimitError.
# Mitigation guidance: https://platform.openai.com/docs/guides/rate-limits/error-mitigation
get_tech_list(fr"data/07-09-23-q-data+science-data+analysis-data+engineer-mle-machine+learning-mlops-l-remote.json")

RetryError: RetryError[<Future at 0x1b20bea8af0 state=finished raised RateLimitError>]