In [1]:
import json
import os
from collections import Counter, defaultdict, OrderedDict
import copy
from datetime import datetime
import re
import pandas as pd
import numpy as np

import spacy
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def order_techs(counter):
    """Order the techs in our tech counter by number of occurances, by search term.

    Args:
        counter (defaultdict): defaultdict of our Counter for techs by search term

    Returns:
        ord (dict): dict containing an OrderedDict of techs by search term
    """
    ord = {k: OrderedDict(sorted(v.items(), key=lambda x: float(x[1]), reverse=True)) for k, v in counter.items()}
    return ord

In [3]:
def get_techs(filelist):
    """Get the name of and count of techs, by term, from our job descriptions.

    Args:
        filelist (list): List of json files to look through for job descriptions.

    Returns:
        counts (defaultdict): A defaultdict of a counter for the techs within each search term for our files from filelist.
    """
    counts = defaultdict(Counter)
    for file in filelist:
        with open(fr"data/{file}") as f:
            data = json.load(f)
            for key in list(data.keys()):
                if key.startswith("metadata"):
                    continue
                else:
                    for term in data[key]['terms']:
                        try:
                            for tech in data[key]['techs'][0].split("\n"):
                                clean_tech = tech.strip()
                                clean_tech = clean_tech.replace("-", "")
                                if tech in counts[term]:
                                    counts[term][clean_tech] += 1
                                else:
                                    counts[term][clean_tech] = 1
                        except IndexError:
                            # print(f"There were no techs in {key} (term: {term})")
                            continue

    return counts

In [4]:
def get_filelist(start_date, end_date, folder_path='data', start_str='p-raw'):
    """Get the json files which start with {start_str} in the {folder_path} within the date range (inclusive).

    Args:
        start_date (str): String start date in dd-mm-yy format - e.g. 11-09-23
        end_date (str): String end date in dd-mm-yy format - e.g. 17-10-23
        folder_path (str, optional): Folder path to search within. Defaults to 'data'.
        start_str (Str, optional): Starting string for files to pull. Defaults to 'p-raw'
    Returns:
        files_between_dates (list): List of strings, where each string is a filename from {folder_path} beginning with {start_str}
        and within [{start_date}, {end_date}]
    """
    start_date = datetime.strptime(start_date, "%d-%m-%y")
    end_date = datetime.strptime(end_date, "%d-%m-%y")
    
    files_between_dates = []
    
    for filename in os.listdir(folder_path):
        if filename.startswith(start_str):
            try:
                # Extract the date from the file name
                file_date = datetime.strptime(filename[11:19], "%d-%m-%y")
                if start_date <= file_date <= end_date:
                    files_between_dates.append(filename)
            except ValueError:
                # In case the date in the file name is not in the expected format
                pass
    
    return files_between_dates

In [5]:
filelist = get_filelist("11-09-20", "17-09-24")
ordered = order_techs(get_techs(filelist))

In [6]:
ordered

{'data science': OrderedDict([('python', 221),
              ('sql', 105),
              ('tableau', 36),
              ('r', 32),
              ('tensorflow', 32),
              ('uipath', 26),
              ('machine learning', 25),
              ('none', 24),
              ('excel', 22),
              ('llms', 20),
              ('aws', 19),
              ('java', 16),
              ('sas', 15),
              ('soc2', 14),
              ('power bi', 13),
              ('databricks', 10),
              ('pytorch', 10),
              ('apis', 10),
              ('cloud computing', 10),
              ('apache spark', 9),
              ('circle', 9),
              ('collaborating across departments', 9),
              ('generative ai', 8),
              ('nlp', 7),
              ('hadoop', 7),
              ('jira', 7),
              ('iot', 6),
              ('snowflake', 6),
              ('statistics', 6),
              ('salesforce', 5),
              ("bachelor's degree in computer

In [7]:
def print_top_n(ordered, n=5):
    """Prints the top {n} techs, by search term, from the input OrderedDict

    Args:
        ordered (dict): dict of an OrderedDict of techs by search term, usually created via order_techs()
        n (int, optional): Top # of techs to show per search term. Defaults to 5.
    """
    for key in ordered.keys():
        tops = list(ordered[key].items())[:n]
        print(f"Top {n} skills for {key}: {tops} \n")

In [8]:
print_top_n(ordered)

Top 5 skills for data science: [('python', 221), ('sql', 105), ('tableau', 36), ('r', 32), ('tensorflow', 32)] 

Top 5 skills for data analyst: [('sql', 75), ('systems development lifecycle cycle', 68), ('azure purview', 53), ('tableau', 47), ('none', 42)] 

Top 5 skills for data engineer: [('python', 62), ('aws', 43), ('sql', 33), ('java', 15), ('snowflake', 14)] 

Top 5 skills for machine learning engineer: [('cad/cam software', 250), ('python', 162), ('tensorflow', 43), ('none', 33), ('machine learning', 32)] 

Top 5 skills for mlops: [('aws', 49), ('python', 31), ('terraform', 23), ('llms', 22), ('java', 15)] 



In [9]:
# Should make this its own file or maybe an env var, read it in and add to it as needed
techs_to_remove = ["none", "systems development lifecycle cycle", "security clearance", "devops engineer"]

def clean_ordered_terms(ordered, techs_to_remove):

    ordcopy = copy.deepcopy(ordered)
    for k1, v1 in ordered.items():
        for k2, v2 in v1.items():
            if k2 in techs_to_remove:
                print(f"Removing tech: {k2}")
                del ordcopy[k1][k2]
            else:
                continue
    return ordcopy

In [10]:
cleaned_ord = clean_ordered_terms(ordered, techs_to_remove)

Removing tech: none
Removing tech: systems development lifecycle cycle
Removing tech: none
Removing tech: security clearance
Removing tech: none
Removing tech: none
Removing tech: devops engineer
Removing tech: none


In [11]:
print_top_n(cleaned_ord, 10)

Top 10 skills for data science: [('python', 221), ('sql', 105), ('tableau', 36), ('r', 32), ('tensorflow', 32), ('uipath', 26), ('machine learning', 25), ('excel', 22), ('llms', 20), ('aws', 19)] 

Top 10 skills for data analyst: [('sql', 75), ('azure purview', 53), ('tableau', 47), ('excel', 41), ('jira', 23), ('alteryx', 20), ('power bi', 16), ('ms excel', 14), ('salesforce', 13), ('microsoft excel', 12)] 

Top 10 skills for data engineer: [('python', 62), ('aws', 43), ('sql', 33), ('java', 15), ('snowflake', 14), ('databricks', 10), ('vmware vsphere', 10), ('data modelling', 9), ('spark', 9), ('circle', 9)] 

Top 10 skills for machine learning engineer: [('cad/cam software', 250), ('python', 162), ('tensorflow', 43), ('machine learning', 32), ('controllers (plcs)', 31), ('sql', 26), ('java', 24), ('llms', 21), ('aws', 13), ('c++', 13)] 

Top 10 skills for mlops: [('aws', 49), ('python', 31), ('terraform', 23), ('llms', 22), ('java', 15), ('docker', 11), ('jenkins', 10), ('kubernetes

# Now must add to each p-raw.json
Need to keep the original techs, add a 'cleaned_techs' or something as a new key with the techs we don't want removed.

Also want to create some rules for techs that can be allocated together e.g. azure cloud, azure synapse, etc. can all be mapped to 'azure'.
Use re to match the words, be careful of partial matches.  

The above cleaning should be added to clean_ordered_terms as well (or maybe just there, and use those results to update the .jsons, think on it)

In [12]:
# First lets try to do the mapping

In [13]:
def map_term(tech, term_mapping):
    for key, mapped_terms in term_mapping.items():
        for mapped_term in mapped_terms:
            if re.match(r'\b' + re.escape(mapped_term) + r'\b', tech, re.I):
                return key
    return tech

def should_remove_partial(tech, to_remove):
    tech_lower = tech.lower()
    for keyword in to_remove:
        if re.search(r'\b' + re.escape(keyword) + r'\b', tech_lower, re.I):
            return True
    return False

def should_remove_exact(tech, to_remove):
    tech_lower = tech.lower()
    for keyword in to_remove:
        if tech_lower == keyword.lower():
            return True
    return False

def combine_and_remove_technologies(tech_dict, to_remove_partial, to_remove_exact, term_mapping, min_count=5):
    combined_dict = {}

    for job_title, tech_ordered_dict in tech_dict.items():
        tech_list = list(tech_ordered_dict.items())
        tech_counts = {}

        for tech, count in tech_list:
            # Remove extra whitespace around tech
            tech_to_add = tech.strip()

            # Check if the technology should be removed due to partial match
            if should_remove_partial(tech_to_add, to_remove_partial):
                continue

            # Check if the technology should be removed due to exact match
            if should_remove_exact(tech_to_add, to_remove_exact):
                continue

            # Map the technology based on the term_mapping dictionary
            tech_to_add = map_term(tech_to_add, term_mapping)

            # Combine counts for the same technology
            if tech_to_add in tech_counts:
                tech_counts[tech_to_add] += count
            else:
                tech_counts[tech_to_add] = count

        # Filter out technology terms with counts less than min_count
        filtered_tech_counts = {tech: count for tech, count in tech_counts.items() if count >= min_count}

        # Convert filtered_tech_counts to an OrderedDict
        new_tech_ordered_dict = OrderedDict(sorted(filtered_tech_counts.items(), key=lambda item: item[1], reverse=True))

        combined_dict[job_title] = new_tech_ordered_dict

    return combined_dict

term_mapping and techs_to_remove will be moved to either their own file or a .env for easier manipulation

In [14]:
to_remove_partial = ['analyst', 'documentation', 'applications', 'collaborate with', 'lead data mining and collection', 'data profiling', 'proprietary', 'security', 'ability to', 'regulatory', 'translate business', 'exploration', 'p.h.d', 'ph.d', 'phd', 'best practices', 'bachelor', 'bachelors', 'degree', 'lifecycle', 'security clearance', 'collaborating']
to_remove_exact = ['prototyping tools', 'analytical tools', 'data modeling', 'reserving theories', 'verizon', 'deep learning', 'sql queries', 'product owner', 'data processing', 'machine learning models', 'analytics', 'none', 'devops engineer', 'machine learning', 'machine learning engineer', 'data analysis', 'statistical tools', 'data engineering']

In [15]:
term_mapping = {
    'aws': ['amazon', 'aws'],
    'azure': ['azure'],
    'llm': ['llm', 'llms'],
    'ai': ['ai', 'artificial intelligence', 'contextual ai'],
    'python': ['python'],
    'powerbi': ['ms power bi', 'ms powerbi', 'power bi', 'powerbi', 'microsoft power bi', 'microsoft powerbi'],
    'sas': ['sas'],
    'gcp': ['gcp', 'google cloud'],
    'excel': ['microsoft excel', 'excel', 'ms excel'],
    'microsoft': ['ms office', 'microsoft word', 'ms word', 'microsoft office'], #don't add microsoft itself, as that would eat up power bi stuff
    'nlp': ['nlp', 'natural language processing'],
    'edm': ['edm'],
    'adobe': ['adobe'],
    'labelbox': ['labelbox'],
    'nosql': ['nosql'],
    'mysql': ['mysql'],
    'postgresql': ['postgresql'],
    'sql': ['sql'],
    'google': ['google suite', 'google docs'],
    'oracle': ['oracle']
}

In [16]:
result = combine_and_remove_technologies(ordered, to_remove_partial, to_remove_exact, term_mapping)

Test the mapping below

In [17]:
map_term('ai contextual', term_mapping)

'ai'

In [18]:
result

{'data science': OrderedDict([('python', 235),
              ('sql', 117),
              ('tableau', 36),
              ('r', 32),
              ('tensorflow', 32),
              ('llm', 30),
              ('aws', 27),
              ('uipath', 26),
              ('excel', 26),
              ('powerbi', 26),
              ('sas', 21),
              ('java', 17),
              ('ai', 17),
              ('soc2', 14),
              ('azure', 14),
              ('microsoft', 13),
              ('nlp', 12),
              ('databricks', 10),
              ('pytorch', 10),
              ('apis', 10),
              ('cloud computing', 10),
              ('apache spark', 9),
              ('circle', 9),
              ('generative ai', 8),
              ('gcp', 8),
              ('hadoop', 7),
              ('jira', 7),
              ('iot', 6),
              ('snowflake', 6),
              ('statistics', 6),
              ('adobe', 6),
              ('salesforce', 5),
              ('macros', 5)

# Annotate job descriptions based on our cleaned techs
These will be used for a tensorflow model

In [19]:
# Get a list of all the techs from above which we will use to help annotate our job descriptions.
all_tech_terms = set()
for job_title, tech_counts in result.items():
    for tech_term in tech_counts.keys():
        all_tech_terms.add(tech_term)

tech_list = list(all_tech_terms)

In [20]:
def annotate_jds(folder_path='data', entity_label=1, term_mapping=term_mapping, tech_list=tech_list):
    """Label each tokenized word with a 0 or 1.  0 for non-tech, 1 for tech.

    Args:
        folder_path (str, optional): _description_. Defaults to 'data'.
        entity_label (int, optional): _description_. Defaults to 1.
        term_mapping (_type_, optional): _description_. Defaults to term_mapping.
        tech_list (_type_, optional): _description_. Defaults to tech_list.

    Returns:
        _type_: _description_
    """
    # Load the English language model from spaCy
    nlp = spacy.load("en_core_web_sm")

    # Initialize lists to store tokenized text and corresponding labels
    all_tokenized_text = []
    all_labels = []

    # Iterate through JSON files in the directory
    for file in os.listdir(folder_path):
        if file.startswith('p-raw'):
            try:
                with open(os.path.join(folder_path, file)) as f:
                    data = json.load(f)

                for key in data.keys():
                    if key.startswith('metadata'):
                        continue

                    text = data[key]['cleaned_desc']

                    # Tokenize text using spaCy
                    doc = nlp(text)

                    # Initialize lists to store token annotations and labels
                    token_annotations = []
                    token_labels = []

                    # Annotate tech terms with mapped terms
                    for token in doc:
                        normalized_token = token.text.lower()
                        for tech_term in tech_list:
                            if normalized_token == tech_term.lower():
                                token_annotations.append(token.text)
                                token_labels.append(entity_label)
                                break
                        else:
                            for mapped_term, mapped_values in term_mapping.items():
                                if normalized_token in mapped_values:
                                    token_annotations.append(token.text)
                                    token_labels.append(entity_label)
                                    break
                            else:
                                token_annotations.append(token.text)
                                token_labels.append(0)  # 0 for non-entities

                    # Append tokenized text and labels to the respective lists
                    all_tokenized_text.append(token_annotations)
                    all_labels.append(token_labels)

            except FileNotFoundError:
                print(f"File not found: {os.path.join(folder_path, file)}")
            except Exception as e:
                print(f"An error occurred while processing {os.path.join(folder_path, file)}: {e}")

    return all_tokenized_text, all_labels

In [21]:
all_tokenized_text, all_labels = annotate_jds()

In [22]:
for x, y in zip(all_tokenized_text[0], all_labels[0]):
    if y == "TECHNOLOGY":
        print(x, y)

In [23]:
data = {
    "words": all_tokenized_text,
    "labels": all_labels
}

custom_dataset = Dataset.from_dict(data)

# split dataset into training/testing subsets
custom_dataset = custom_dataset.train_test_split(test_size=0.2, seed=523)

Note that bert-base-cased has a max seq length of 512, and we have many paragraphs with more tokens.  
We will attempt to fix this by splitting the text into smaller chunks.

In [24]:
custom_dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 4344
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 1086
    })
})

In [25]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [26]:
inputs = tokenizer(custom_dataset["train"][0]["words"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Re',
 '##quire',
 '##d',
 'Skills',
 ':',
 'Infrastructure',
 '-',
 'as',
 '-',
 'Code',
 'experience',
 '(',
 'e',
 '.',
 'g',
 '.',
 ',',
 'Python',
 ',',
 'Ba',
 '##sh',
 ',',
 'Y',
 '##AM',
 '##L',
 ',',
 'J',
 '##SO',
 '##N',
 ',',
 'Cloud',
 '##F',
 '##orm',
 '##ation',
 ',',
 'An',
 '##sible',
 ',',
 'Terra',
 '##form',
 ')',
 'A',
 '##WS',
 'or',
 'other',
 'cloud',
 'experience',
 'Ku',
 '##ber',
 '##net',
 '##es',
 'management',
 'and',
 'configuration',
 'experience',
 'Experience',
 'with',
 'the',
 'Atlas',
 '##sian',
 'product',
 'suite',
 '(',
 'Con',
 '##f',
 '##lue',
 '##nce',
 ',',
 'Ji',
 '##ra',
 ')',
 ',',
 'v',
 '##S',
 '##phere',
 ',',
 'G',
 '##it',
 '##la',
 '##b',
 'and',
 '/',
 'or',
 'Son',
 '##ar',
 '##Q',
 '##ube',
 'Top',
 'Secret',
 'security',
 'clearance',
 'with',
 'required',
 ',',
 'preferred',
 'T',
 '##S',
 '/',
 'SC',
 '##I',
 'w',
 '/',
 'p',
 '##oly',
 'Bachelor',
 'of',
 'Science',
 'degree',
 'in',
 'Computer',
 'Science',
 ',',
 

In [27]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        else:
            # Start of a new word!
            current_word = word_id
            new_labels.append(labels[word_id])

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [28]:
tokenized_datasets = custom_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4344 [00:00<?, ? examples/s]

Map: 100%|██████████| 4344/4344 [00:03<00:00, 1312.03 examples/s]
Map: 100%|██████████| 1086/1086 [00:00<00:00, 1409.90 examples/s]


In [29]:
# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) # for PyTorch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") # for TensorFlow