In [18]:
import pandas as pd
import json
import boto3
import os

## Get a local file to inspect

In [2]:
s3 = boto3.client('s3')

In [44]:
s3_bucket_name = 'gpt-bucket-indeed'
filename_s3 = "data/indeed_28_03_2024.json"
local_filename = fr"data\raw\{filename_s3.split('.')[0].split('/')[1]}_local.json"
print(f"{s3_bucket_name},\n{filename_s3},\n{local_filename}")

gpt-bucket-indeed,
data/indeed_28_03_2024.json,
data\raw\indeed_28_03_2024_local.json


In [45]:
s3.download_file(s3_bucket_name,filename_s3, local_filename)

In [23]:
df = pd.read_json(local_filename, lines=True)

In [25]:
df.columns

Index(['job_key', 'location', 'keyword', 'from_age', 'page', 'position',
       'salary_min', 'salary_max', 'salary_type', 'salary_estimated_flag',
       'job_description', 'company', 'job_title', 'url', 'split_jd', 'id',
       'object', 'created', 'model', 'choices', 'usage', 'system_fingerprint',
       'cleaned_techs'],
      dtype='object')

In [29]:
df.iloc[1].job_description

"Unit Description: \n   Sodexo  has an exciting new opportunity for a  Senior Safety Data Analyst  to support our operations in North America. \n \n  Be part of the Tech and Services Food Safety, Health, Safety and Environment team for Sodexo North America reporting to Director of Safety Systems and Tools. \n \n  This mid management position is  remote  and may require travel up to 15% of the time, primarily within the states with some overnight stays required. \n \n  The Senior Safety Data Analyst designs, builds, and maintains the HSE Tech and Services business and analytics architecture. The Senior Safety Data Analyst collaborates with Segment, HSE, and Food Safety Teams to further the understanding and identification of causal factors related to Safety (HSE and Food) Events including relationships between the business climate and event frequency. Will lead the framing and scoping of the relevant use cases leveraged to direct analyses. Work with business stakeholders to identify the

## New tokenizer
Need new tokenizer as we are in a new domain from Bert

In [41]:
import numpy as np
import pandas as pd
from transformers import BertTokenizerFast, AutoTokenizer

In [36]:
tokenizer = BertTokenizerFast.from_pretrained(
    'huggingface-course/bert-base-uncased-tokenizer-without-normalizer'
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [40]:
teststr = df.iloc[0].job_description
print(tokenizer.tokenize(teststr[:55]))
print(teststr[:55])

['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', 'reports', 'to']
Job Description Summary  The Data Analyst II reports to


1. Gather corpus of texts

We have our json files filled with job descriptions

2. Choose tokenizer architecture

We will use BERT

3. Train tokenizer on corpus



4. Save result

## We require our corpus as a dataset

In [101]:
from datasets import Dataset
import os
import nltk
import re
nltk.download('punkt') # use for splitting data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### We will use the Dataset.from_pandas() function
First need to create our pandas dataframe wiht text and labels columns

Note DistilBERT can tokenize up to 512 (max_len)

In [49]:
label_names = ['O', 'B_TECH', 'I-TECH'] # Either not a tech, beginning tech, or inner-tech
num_classes = 3
max_len = 512

In [72]:
def split_text_into_chunks(text, chunk_size):
    tokens = nltk.word_tokenize(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size):
        chunk = ' '.join(tokens[i:i+chunk_size])
        chunks.append(chunk)
    return chunks

In [84]:
def create_new_dataframe(df, chunk_size, column_name='split_jd'):
    chunks_list = []
    for text in df[column_name]:
        chunks = split_text_into_chunks(text, chunk_size)
        chunks_list.extend(chunks)
    new_df = pd.DataFrame({'text': chunks_list})
    new_df['labels'] = ''
    return new_df

In [88]:
split_df = create_new_dataframe(df, 50)
split_df.head()

Unnamed: 0,text,labels
0,Work Shift Day ( United States of America ) Jo...,
1,skills . Excellent communication and presentat...,
2,and completeness . Utilize statistical methods...,
3,Lead the framing and scoping of the relevant u...,
4,business climate and event frequency . Impleme...,


In [89]:
split_df.to_excel('data/raw/to_label.xlsx', index=False)

## Check cleanest data

In [90]:
s3_bucket_name = 'clean-gpt-bucket-indeed'
filename_s3 = "data/indeed_28_03_2024.json"
local_filename = fr"data\raw\clean_{filename_s3.split('.')[0].split('/')[1]}_local.json"
print(f"{s3_bucket_name},\n{filename_s3},\n{local_filename}")
s3.download_file(s3_bucket_name,filename_s3, local_filename)

clean-gpt-bucket-indeed,
data/indeed_28_03_2024.json,
data\raw\clean_indeed_28_03_2024_local.json


In [91]:
clean_df = pd.read_json(local_filename, lines=True)
clean_df['labels'] = ''
small_df = clean_df[['split_jd', 'cleaned_techs', 'labels']]

In [100]:
small_df.head()

Unnamed: 0,split_jd,cleaned_techs,labels
0,High proficiency in a procedural programming ...,[python],
1,"Collaborate with product, engineering, and d...","[machine learning, code, agile development]",
2,Must be eligible to work in the United State...,[],
3,Strong experience in modeling frameworks such...,"[UML, ArchiMate]",
4,Build trust with the team to perform assigned...,[microsoft tool suite],


In [102]:
def assign_labels_to_tokens(tokens, techs):
    # Initialize labels with "O" for all tokens
    labels = ["O"] * len(tokens)

    # Create a regex pattern to match full words in lowercase
    tech_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(tech) for tech in techs) + r')\b', re.I)

    # Join the tokens into a space-separated string
    text = " ".join(tokens).lower()  # Convert text to lowercase

    # Find all tech entity matches in the lowercase text using regex
    tech_matches = tech_pattern.finditer(text)

    for match in tech_matches:
        # print(match)  # Print the match object
        matched_text = match.group()  # Get the matched text
        # print(matched_text)  # Print the matched text

        # Split the matched text into individual tokens
        matched_tokens = matched_text.split()

        # Find the index of the first token that matches the text
        start_token = None
        for i, token in enumerate(tokens):
            if token.lower() == matched_tokens[0]:
                start_token = i
                break

        if start_token is not None:
            # Label the first token as "B-TEC" (beginning of tech entity)
            labels[start_token] = "B-TEC"
            # Label the subsequent tokens as "I-TEC" (inside of tech entity)
            for i in range(start_token + 1, start_token + len(matched_tokens)):
                labels[i] = "I-TEC"

    return labels

In [119]:
test_ds = {'words':[], 'labels':[]} # just words not tokens for test, must tokenize in final iteration
with open(fr"{local_filename}") as f:
    for line in f:
        data = json.loads(line)
        tech_entities = data['cleaned_techs']
        if len(tech_entities) < 1:
            continue
        else:
            job_desc_clean = data['split_jd']
            labels = assign_labels_to_tokens(job_desc_clean, tech_entities)
            test_ds['words'].append(job_desc_clean)
            test_ds['labels'].append(labels)

In [122]:
test_ds

{'words': [' High proficiency in a procedural programming language (e.g. Python). Strong communication skills with the ability to distill complex concepts into understandable insights for non-technical stakeholders. Strong leadership and mentorship skills and experience, with a passion for guiding and developing other team members. ',
  "  Collaborate with product, engineering, and data teams to identify machine learning opportunities Evaluate and improve the performance of machine learning models Stay up-to-date with the latest developments in machine learning research and apply them to real-world problems Write clean, scalable, and maintainable code Communicate technical concepts and ideas to both technical and non-technical stakeholders Sharing your knowledge by giving brown bag sessions, tech talks, and evangelising appropriate tech and engineering best practices. Mentoring other team members, facilitating within/across team workshops, and leading agile development. Qualifications: