Should change the get_tech.py to now save shortened texts as short_desc rather than clean_desc.  Then here we can add clean_desc for when we aren't using openai API.  Change previous .json files as well if we do this.

In [100]:
import spacy
import pickle
import json
import re
import os
import random

from spacy.training.example import Example
from spacy.scorer import Scorer

In [2]:
nlp = spacy.load("en_core_web_sm")

# Step 0: Remove bad techs
Before we do any training or NER we want to remove the techs that don't belong that the OpenAI API picked up anyways.

# Step 1: Collect and annotate data
This was done with job_search.py and get_techs.py

In [5]:
filename = "p-raw_data-17-09-23.json" # one test json file, will iterate through what I need later

In [6]:
with open(fr"data/{filename}") as f:
    data = json.load(f)

# Step 2: Data preprocessing
We must clean/format our job descriptions and tech entities, then convert the data into spaCy training data format.

In [42]:
training_data = []
for key in data:
    if key == 'metadata':
        continue
    else:
        # print(data[key]['cleaned_desc'])
        # print(data[key]['techs'])
        technologies = data[key]['techs']
        
        for tech in technologies:
            tech = tech.lower()  # Convert the technology to lowercase
            for match in re.finditer(r'\b' + re.escape(tech) + r'\b', data[key]['cleaned_desc'].lower()):
                start, end = match.start(), match.end()
                training_data.append((data[key]['cleaned_desc'].lower(), {"entities": [(start, end, "TECH")]}))

    
# Below fails as it'll find first match
# E.g. for data["dd60a69f8f6a7bd7"]['cleaned_desc'].lower(), 'r' programming language is found at [23:24] in the word 'large'
        # for tech in technologies:
        #     start = data[key]['cleaned_desc'].lower().find(tech)
        #     if start != -1:
        #         end = start + len(tech)
        #         training_data.append((data[key]['cleaned_desc'].lower(), {"entities": [(start, end, "TECH")]}))

In [43]:
training_data[:2]

[(" \n collect and clean large datasets from various sources \n analyze data using statistical techniques and machine learning algorithms \n develop predictive models and perform data visualization \n collaborate with cross-functional teams to identify business problems and propose data-driven solutions   bachelor's degree in a quantitative field such as computer science, statistics, mathematics, or related fields \n experience in data analysis, data mining, and data visualization techniques \n proficiency in programming languages such as python or r \n strong understanding of statistical modeling and machine learning algorithms \n ability to work with large datasets and perform data cleaning and preprocessing ",
  {'entities': [(536, 542, 'TECH')]}),
 (" \n collect and clean large datasets from various sources \n analyze data using statistical techniques and machine learning algorithms \n develop predictive models and perform data visualization \n collaborate with cross-functional tea

In [78]:
def get_training_data(filenames):
    """Function to get the spaCy training data from our shortened job descriptions and OpenAI API results.
    Returns spaCy training data

    Args:
        filenames (list): List of filenames of parsed .json files to be used to generate training data
    """
    training_data = []
    for filename in filenames:
        with open(fr"data/{filename}") as f:
            data = json.load(f)
        
        for key in data:
            if key == 'metadata':
                continue
            else:
                technologies = data[key]['techs'] 
                if len(technologies) > 0:  # there's at least 1 tech
                    lower_job_desc = data[key]['cleaned_desc'].lower()
                    entities = [] # init list to store techs for a given jd
                    for tech in technologies:
                        tech = tech.lower()  # Convert the technology to lowercase, should already be, but make sure.
                        for match in re.finditer(r'\b' + re.escape(tech) + r'\b', lower_job_desc):
                            start, end = match.start(), match.end()
                            entities.append((start, end, "TECH"))
                    training_data.append((lower_job_desc, {"entities": entities}))
                else:
                    continue
    return training_data

In [79]:
filenames = [filename for filename in os.listdir("data") if filename.startswith("p-raw")]

In [80]:
training_data = get_training_data(filenames)

In [90]:
# Split training data into training and validation sets.
random.seed(1234)
random.shuffle(training_data)
validation_split = int(0.2 * len(training_data)) # get the number for a 80-20 train-valid split
training_set = training_data[:-validation_split]
validation_set = training_data[-validation_split]

# Step 3 - Init NER model
We can either use a blank spaCy model or a pre-trained one to fine-tune.  Let's start with a blank one and see how it does.

In [91]:
# # creates a blank NLP, commented out as we have one now.
# blank_nlp = spacy.blank("en")
# ner = blank_nlp.create_pipe("ner")
# ner.add_label("TECH")
# blank_nlp.add_pipe("ner")

# blank_nlp.to_disk("data/ner_models/nlp_v1.0")

# Step 4 - Training NER model

In [92]:
nlp_version = "data/ner_models/nlp_v1.0"

Below broken, seems to be due to extraneous techs that openai api picked up.  Should clean them up before using them here.

In [111]:
# Load in the saved blank we had
nlp = spacy.load(nlp_version)

# number of epochs to train
num_iters = 5 

# Train the NER component
for _ in range(num_iters):
    losses = {}
    examples = []

    for text, annotations in training_set:
        try:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            examples.append(example)
        except ValueError as e:
            print(e)
            print(annotations)
            print(text)
            break
    nlp.update(examples, drop=0.5, losses=losses)
    print("Losses:", losses)

[E103] Trying to set conflicting doc.ents: '(561, 571, 'TECH')' and '(549, 571, 'TECH')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
{'entities': [(23, 50, 'TECH'), (72, 87, 'TECH'), (135, 163, 'TECH'), (189, 211, 'TECH'), (216, 233, 'TECH'), (235, 245, 'TECH'), (462, 472, 'TECH'), (561, 571, 'TECH'), (251, 259, 'TECH'), (421, 435, 'TECH'), (448, 457, 'TECH'), (1244, 1253, 'TECH'), (488, 509, 'TECH'), (521, 524, 'TECH'), (1255, 1258, 'TECH'), (529, 544, 'TECH'), (549, 571, 'TECH'), (691, 713, 'TECH'), (797, 820, 'TECH'), (917, 933, 'TECH'), (935, 957, 'TECH'), (1133, 1151, 'TECH'), (1184, 1198, 'TECH'), (1200, 1203, 'TECH'), (1205, 1217, 'TECH'), (1219, 1224, 'TECH'), (448, 457, 'TECH'), (1244, 1253, 'TECH'), (521, 524, 'TECH'), (1255, 1258, 'TECH'), (144, 163, 'TECH'), (1325, 1344, 'TECH'), (1346, 1352, 'TECH'), (1354, 1358, 'TECH'), (1385, 1407, 'TECH'), (23, 42, 

In [None]:
nlp.to_disk(nlp_version) # save the updated nlp after training

In [None]:
eval_results = 