Custom tokenizer on tech descs
Train a tokenizer from scratch with hf (autotokenizer?)

Semantic similarity with words, to create a dict based NER
Levenstein(sp?) distiance for matching to known entities as opposed to semantic distance

Let list of techs from openai do most the work for finding techs


### Make sure to use labels from after check_results cleaning

In [125]:
from transformers import AutoTokenizer
import spacy
import json
from datetime import datetime
import os
import re


from utils import get_filelist

# Create custom tokenizer
Tokenizers used were suboptimal, create my own instead:
https://huggingface.co/learn/nlp-course/chapter6/8?fw=pt

Normalization, pre-tokenization, model, post-processing, decoding

1) Gather a corpus
2) Create a backend_tokenizer with huggingface tokenizer library
3) Load the backend_tokenizer in a huggingface transformers tokenizer

Transform our dataset into an iterator of lists of texts.

In [85]:
filelist = get_filelist('01-01-20', '11-09-25')
# filelist

In [71]:
def get_raw_data(filelist, datapath='data'):
    for file in filelist:
        with open(f"{datapath}/{file}", 'r') as f:
            data = json.load(f)
            for job_id, job_data in data.items():
                job_desc = job_data.get("desc", "")
                yield job_desc
    

In [72]:
training_corpus = get_raw_data(filelist)

In [73]:
# for desc in training_corpus:
#     print(desc)

In [88]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [89]:
from transformers import AutoTokenizer

In [75]:
# Get bert base NER special tokens
old_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-ner")

In [91]:
# Train the new tokenizer.  Takes around 4 seconds with data from 09-09-23 to 28-10-23
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

In [79]:
# Example with problems with base tokenizer.  Would fail to find all of python, pytorch, llama 2
example = "Meta is seeking a Research Engineer to join our Large Language Model (LLM) Research team. We conduct focused research and engineering to build state-of-the-art LLMs, which we often open-source, like our team’s recent Llama 2.  Adapt standard machine learning methods to best exploit modern parallel environments (e.g. distributed clusters, multicore SMP, and GPU). Programming experience in Python and hands-on experience with frameworks such as PyTorch. \n  Exposure to architectural patterns of large scale software applications. Direct experience in generative AI and LLM research."

In [83]:
# # It now properly tokenizes python and pytorch.  llama 2 still separated as 'llama' and '2', but no longer 'll', '##a', '##ma', '2'.
# tokens = tokenizer.tokenize(example)
# tokens

In [92]:
# # Save the tokenizer, label file with the dates used for training data
# tokenizer.save_pretrained("data/ner_models/huggingface_models/tokenizer_09-09-23_28-10-23")
# # Load the tokenizer back in when we need it
# tokenizer = AutoTokenizer.from_pretrained("data/ner_models/huggingface_models/tokenizer_09-09-23_28-10-23/")

In [95]:
# tokenizer.tokenize(example)

# Assigning labels to tokens with our trained tokenizer

In [99]:
# Load our saved tokenizer trained on our job corpus
tokenizer = AutoTokenizer.from_pretrained("data/ner_models/huggingface_models/tokenizer_09-09-23_28-10-23/")

In [100]:
text = "Meta is seeking a Research Engineer to join our Large Language Model (LLM) Research team. We conduct focused research and engineering to build state-of-the-art LLMs, which we often open-source, like our team’s recent Llama 2.  Adapt standard machine learning methods to best exploit modern parallel environments (e.g. distributed clusters, multicore SMP, and GPU). Programming experience in Python and hands-on experience with frameworks such as PyTorch. \n  Exposure to architectural patterns of large scale software applications. Direct experience in generative AI and LLM research."

In [165]:
token_test = tokenizer.tokenize(text)
token_test

['Meta',
 'is',
 'seeking',
 'a',
 'Research',
 'Engineer',
 'to',
 'join',
 'our',
 'Large',
 'Language',
 'Model',
 '(',
 'LLM',
 ')',
 'Research',
 'team',
 '.',
 'We',
 'conduct',
 'focused',
 'research',
 'and',
 'engineering',
 'to',
 'build',
 'state',
 '-',
 'of',
 '-',
 'the',
 '-',
 'art',
 'LLMs',
 ',',
 'which',
 'we',
 'often',
 'open',
 '-',
 'source',
 ',',
 'like',
 'our',
 'team',
 '’',
 's',
 'recent',
 'Llama',
 '2',
 '.',
 'Adapt',
 'standard',
 'machine',
 'learning',
 'methods',
 'to',
 'best',
 'exploit',
 'modern',
 'parallel',
 'environments',
 '(',
 'e',
 '.',
 'g',
 '.',
 'distributed',
 'clusters',
 ',',
 'multicore',
 'SMP',
 ',',
 'and',
 'GPU',
 ')',
 '.',
 'Programming',
 'experience',
 'in',
 'Python',
 'and',
 'hands',
 '-',
 'on',
 'experience',
 'with',
 'frameworks',
 'such',
 'as',
 'PyTorch',
 '.',
 'Exposure',
 'to',
 'architectural',
 'patterns',
 'of',
 'large',
 'scale',
 'software',
 'applications',
 '.',
 'Direct',
 'experience',
 'in',
 'ge

In [281]:
def assign_labels_to_tokens(tokens, techs):
    # Initialize labels with "O" for all tokens
    labels = ["O"] * len(tokens)

    # Create a regex pattern to match full words in lowercase
    tech_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(tech) for tech in techs) + r')\b', re.I)

    # Join the tokens into a space-separated string
    text = " ".join(tokens).lower()  # Convert text to lowercase

    # Find all tech entity matches in the lowercase text using regex
    tech_matches = tech_pattern.finditer(text)

    for match in tech_matches:
        # print(match)  # Print the match object
        matched_text = match.group()  # Get the matched text
        # print(matched_text)  # Print the matched text

        # Split the matched text into individual tokens
        matched_tokens = matched_text.split()

        # Find the index of the first token that matches the text
        start_token = None
        for i, token in enumerate(tokens):
            if token.lower() == matched_tokens[0]:
                start_token = i
                break

        if start_token is not None:
            # Label the first token as "B-TEC" (beginning of tech entity)
            labels[start_token] = "B-TEC"
            # Label the subsequent tokens as "I-TEC" (inside of tech entity)
            for i in range(start_token + 1, start_token + len(matched_tokens)):
                labels[i] = "I-TEC"

    return labels


Where non-techs are 'O', starting tech words are 'B-TEC', and inside-of-tech words are 'I-TEC' e.g:
'require' would be 'O', 'python' would be 'B-TEC', and 'power bi' would split in to 'power:'B-TEC', 'bi':'I-TEC' to show it's part of the same tech, but inside the word/token.

Test it with one of my example texts from Meta

In [282]:
labels = assign_labels_to_tokens(token_test, ['llama 2', 'python', 'pytorch'])

In [287]:
# Works!
[(x, i, token_test[i]) for i, x in enumerate(labels) if x != "O"]

[('B-TEC', 48, 'Llama'),
 ('I-TEC', 49, '2'),
 ('B-TEC', 80, 'Python'),
 ('B-TEC', 90, 'PyTorch')]

Test again with a second example text

In [284]:
# "python, spark, sql, snowflake, tableau, airflow, aws, azure, power bi"
text2 = "Proficiency in Python for data manipulation and analysis \n Prior experience in working with Spark SQL for data processing and querying \n Hands-on experience with Snowflake ETL processes and data integration \n Superior expertise in creating interactive dashboards and reports to present findings and key performance indicators (KPIs) using Tableau    \n Basic understanding of Airflow for simple job orchestration. Preferred Qualifications: \n \n Experience with cloud platforms like AWS, Azure \n Knowledge of statistical analysis and machine learning concepts \n Familiarity with data modeling and database design \n Familiarity with data visualization tools other than Tableau (e.g., Power BI) \n \n"
token_test2 = tokenizer.tokenize(text2)

In [285]:
labels2 = assign_labels_to_tokens(token_test2, ["python", "spark", "sql", "snowflake", "tableau", "airflow", "aws", "azure", "power bi"])

In [286]:
[(x, i, token_test2[i]) for i, x in enumerate(labels2) if x != "O"]

[('B-TEC', 2, 'Python'),
 ('B-TEC', 13, 'Spark'),
 ('B-TEC', 14, 'SQL'),
 ('B-TEC', 25, 'Snowflake'),
 ('B-TEC', 50, 'Tableau'),
 ('B-TEC', 54, 'Airflow'),
 ('B-TEC', 68, 'AWS'),
 ('B-TEC', 70, 'Azure'),
 ('B-TEC', 100, 'Power'),
 ('I-TEC', 101, 'BI')]

Max seq length for BERT models is 512.  We will just truncate after 512 for now, can try segmentation or sliding windows later.

In [7]:
filelist = get_filelist("11-09-20", "17-09-24", folder_path="data", start_str="p-raw")

In [10]:
short_filelist = filelist[:2]

In [15]:
def create_dataset_from_json(filelist, max_seq_len=512):
    dataset = {"tokens": [], "labels":[]}
    
    for file in filelist:
        with open(fr"data/{file}") as f:
            data = json.load(f)
            for job_key, job_data in data.items():
                if job_key.startswith("meta"):
                    continue
                else:
                    tech_entities = job_data.get("techs", [])
                    if len(tech_entities) < 1: # No techs in this job desc
                        continue
                    else:
                        job_desc_clean = job_data["cleaned_desc"]

                        tokens = tokenizer.tokenize(job_desc_clean)
                        
                        if len(tokens) > max_seq_len:
                            tokens = tokens[:max_seq_len]
                        labels = assign_labels_to_tokens(tokens, tech_entities)
                        
                        dataset["tokens"].append(tokens)
                        dataset["labels"].append(labels)
    return dataset

In [16]:
create_dataset_from_json(short_filelist)

{'tokens': [['possesses',
   'either',
   'an',
   'undergraduate',
   'or',
   'master',
   "'",
   's',
   'degree',
   'in',
   'a',
   'quantitative',
   'field',
   '(',
   'e',
   '.',
   'g',
   '.',
   'mathematics',
   ',',
   'finance',
   ',',
   'statistics',
   ',',
   'or',
   'similar',
   ')',
   'or',
   'confirmed',
   'experience',
   'within',
   'data',
   'science',
   'and',
   'analytics',
   '3',
   '+',
   'years',
   'of',
   'work',
   'experience',
   'involving',
   'quantitative',
   'data',
   'analysis',
   'and',
   'complex',
   'problem',
   'solving',
   'excellent',
   'communication',
   'skills',
   'with',
   'the',
   'ability',
   'to',
   'di',
   '##sti',
   '##ll',
   'complex',
   'issues',
   'and',
   'detailed',
   'analysis',
   'into',
   'simple',
   ',',
   'structured',
   'framework',
   '##s',
   'with',
   'concrete',
   'action',
   'plans',
   'experience',
   'building',
   'statistical',
   'models',
   'to',
   'yield',
   