In [1]:
import json
import os
from collections import Counter, defaultdict, OrderedDict
import copy
from datetime import datetime

In [13]:
def order_techs(counter):
    """Order the techs in our tech counter by number of occurances, by search term.

    Args:
        counter (defaultdict): defaultdict of our Counter for techs by search term

    Returns:
        ord (dict): dict containing an OrderedDict of techs by search term
    """
    ord = {k: OrderedDict(sorted(v.items(), key=lambda x: float(x[1]), reverse=True)) for k, v in counter.items()}
    return ord

In [14]:
def get_techs(filelist):
    """Get the name of and count of techs, by term, from our job descriptions.

    Args:
        filelist (list): List of json files to look through for job descriptions.

    Returns:
        counts (defaultdict): A defaultdict of a counter for the techs within each search term for our files from filelist.
    """
    counts = defaultdict(Counter)
    for file in filelist:
        with open(fr"data/{file}") as f:
            data = json.load(f)
            for key in list(data.keys()):
                if key.startswith("metadata"):
                    continue
                else:
                    for term in data[key]['terms']:
                        try:
                            for tech in data[key]['techs'][0].split("\n"):
                                clean_tech = tech.strip()
                                clean_tech = clean_tech.replace("-", "")
                                if tech in counts[term]:
                                    counts[term][clean_tech] += 1
                                else:
                                    counts[term][clean_tech] = 1
                        except IndexError:
                            # print(f"There were no techs in {key} (term: {term})")
                            continue

    return counts

In [25]:
def get_filelist(start_date, end_date, folder_path='data', start_str='p-raw'):
    """Get the json files which start with {start_str} in the {folder_path} within the date range (inclusive).

    Args:
        start_date (str): String start date in dd-mm-yy format - e.g. 11-09-23
        end_date (str): String end date in dd-mm-yy format - e.g. 17-10-23
        folder_path (str, optional): Folder path to search within. Defaults to 'data'.
        start_str (Str, optional): Starting string for files to pull. Defaults to 'p-raw'
    Returns:
        files_between_dates (list): List of strings, where each string is a filename from {folder_path} beginning with {start_str}
        and within [{start_date}, {end_date}]
    """
    start_date = datetime.strptime(start_date, "%d-%m-%y")
    end_date = datetime.strptime(end_date, "%d-%m-%y")
    
    files_between_dates = []
    
    for filename in os.listdir(folder_path):
        if filename.startswith(start_str):
            try:
                # Extract the date from the file name
                file_date = datetime.strptime(filename[11:19], "%d-%m-%y")
                if start_date <= file_date <= end_date:
                    files_between_dates.append(filename)
            except ValueError:
                # In case the date in the file name is not in the expected format
                pass
    
    return files_between_dates

In [26]:
filelist = get_filelist("11-09-20", "17-09-24")
ordered = order_techs(get_techs(filelist))

In [27]:
ordered

{'data science': OrderedDict([('python', 205),
              ('sql', 94),
              ('tableau', 32),
              ('tensorflow', 31),
              ('r', 27),
              ('uipath', 26),
              ('none', 22),
              ('llms', 20),
              ('excel', 20),
              ('aws', 19),
              ('machine learning', 17),
              ('sas', 15),
              ('soc2', 14),
              ('power bi', 13),
              ('java', 12),
              ('pytorch', 10),
              ('apis', 10),
              ('databricks', 9),
              ('apache spark', 9),
              ('circle', 9),
              ('collaborating across departments', 9),
              ('generative ai', 8),
              ('hadoop', 7),
              ('cloud computing', 7),
              ('nlp', 6),
              ('jira', 6),
              ('iot', 6),
              ('statistics', 6),
              ('llms and fms', 5),
              ('macros', 5),
              ('artificial intelligence', 5),
   

In [29]:
def print_top_n(ordered, n=5):
    """Prints the top {n} techs, by search term, from the input OrderedDict

    Args:
        ordered (dict): dict of an OrderedDict of techs by search term, usually created via order_techs()
        n (int, optional): Top # of techs to show per search term. Defaults to 5.
    """
    for key in ordered.keys():
        tops = list(ordered[key].items())[:n]
        print(f"Top {n} skills for {key}: {tops} \n")

In [30]:
print_top_n(ordered)

Top 5 skills for data science: [('python', 205), ('sql', 94), ('tableau', 32), ('tensorflow', 31), ('r', 27)] 

Top 5 skills for data analyst: [('systems development lifecycle cycle', 68), ('sql', 66), ('azure purview', 53), ('tableau', 43), ('excel', 38)] 

Top 5 skills for data engineer: [('python', 61), ('aws', 42), ('sql', 30), ('java', 14), ('security clearance', 12)] 

Top 5 skills for machine learning engineer: [('cad/cam software', 250), ('python', 157), ('tensorflow', 42), ('none', 31), ('controllers (plcs)', 31)] 

Top 5 skills for mlops: [('aws', 48), ('python', 28), ('terraform', 22), ('llms', 22), ('java', 12)] 



In [31]:
# Should make this its own file or maybe an env var, read it in and add to it as needed
techs_to_remove = ["none", "systems development lifecycle cycle", "security clearance", "devops engineer"]

def clean_ordered_terms(ordered, techs_to_remove):

    ordcopy = copy.deepcopy(ordered)
    for k1, v1 in ordered.items():
        for k2, v2 in v1.items():
            if k2 in techs_to_remove:
                print(f"Removing tech: {k2}")
                del ordcopy[k1][k2]
            else:
                continue
    return ordcopy

In [32]:
cleaned_ord = clean_ordered_terms(ordered, techs_to_remove)

Removing tech: none
Removing tech: systems development lifecycle cycle
Removing tech: none
Removing tech: security clearance
Removing tech: none
Removing tech: none
Removing tech: devops engineer
Removing tech: none


In [33]:
print_top_n(cleaned_ord, 10)

Top 10 skills for data science: [('python', 205), ('sql', 94), ('tableau', 32), ('tensorflow', 31), ('r', 27), ('uipath', 26), ('llms', 20), ('excel', 20), ('aws', 19), ('machine learning', 17)] 

Top 10 skills for data analyst: [('sql', 66), ('azure purview', 53), ('tableau', 43), ('excel', 38), ('jira', 20), ('alteryx', 20), ('power bi', 16), ('ms excel', 13), ('salesforce', 12), ('microsoft excel', 11)] 

Top 10 skills for data engineer: [('python', 61), ('aws', 42), ('sql', 30), ('java', 14), ('snowflake', 10), ('vmware vsphere', 10), ('databricks', 9), ('data modelling', 9), ('spark', 9), ('circle', 9)] 

Top 10 skills for machine learning engineer: [('cad/cam software', 250), ('python', 157), ('tensorflow', 42), ('controllers (plcs)', 31), ('machine learning', 28), ('sql', 25), ('llms', 21), ('java', 18), ('aws', 13), ('c++', 13)] 

Top 10 skills for mlops: [('aws', 48), ('python', 28), ('terraform', 22), ('llms', 22), ('java', 12), ('docker', 10), ('kubernetes', 9), ('jenkins', 

# Now must add to each p-raw.json
Need to keep the original techs, add a 'cleaned_techs' or something as a new key with the techs we don't want removed.

Also want to create some rules for techs that can be allocated together e.g. azure cloud, azure synapse, etc. can all be mapped to 'azure'.
Use re to match the words, be careful of partial matches.  

The above cleaning should be added to clean_ordered_terms as well (or maybe just there, and use those results to update the .jsons, think on it)

In [51]:
# First lets try to do the mapping

In [55]:
def map_term(tech, term_mapping):
    """
    Map a technology term to its corresponding key in the term mapping.

    Args:
        tech (str): The technology term to map.
        term_mapping (dict): A dictionary that specifies how terms should be mapped to other terms.

    Returns:
        str: The mapped technology term.
    """
    for key, mapped_terms in term_mapping.items():
        for mapped_term in mapped_terms:
            if mapped_term in tech:
                return key
    return tech

In [56]:
def combine_and_remove_technologies(tech_dict, to_remove, term_mapping):
    """
    Combine and remove technologies in a given dictionary, based on the provided rules.

    Args:
        tech_dict (dict): A dictionary where keys are job titles and values are OrderedDicts
            containing technology terms and their respective counts.
        to_remove (list): A list of technology terms to remove.
        term_mapping (dict): A dictionary that specifies how terms should be mapped to other terms.

    Returns:
        dict: A modified dictionary with combined and removed technologies.
    """
    combined_dict = {}

    for job_title, tech_ordered_dict in tech_dict.items():
        tech_list = list(tech_ordered_dict.items())
        tech_counts = {}

        for tech, count in tech_list:
            tech_to_add = tech

            # Check if the technology is in the 'to_remove' list
            if tech_to_add in to_remove:
                continue

            # Map the technology based on the term_mapping dictionary
            tech_to_add = map_term(tech_to_add, term_mapping)

            # Combine counts for the same technology
            if tech_to_add in tech_counts:
                tech_counts[tech_to_add] += count
            else:
                tech_counts[tech_to_add] = count

        # Convert tech_counts back to an OrderedDict
        new_tech_ordered_dict = OrderedDict(tech_counts)

        combined_dict[job_title] = new_tech_ordered_dict

    return combined_dict

term_mapping and techs_to_remove will be moved to either their own file or a .env for easier manipulation

In [57]:
term_mapping = {
    'aws': ['amazon', 'aws'],
    'azure': ['azure']
}

In [61]:
techs_to_remove = ["none", "systems development lifecycle cycle", "security clearance", "devops engineer",
                   "collaborating across departments"]

In [62]:
result = combine_and_remove_technologies(ordered, techs_to_remove, term_mapping)

In [63]:
result

{'data science': OrderedDict([('python', 205),
              ('sql', 94),
              ('tableau', 32),
              ('tensorflow', 31),
              ('r', 27),
              ('uipath', 26),
              ('llms', 20),
              ('excel', 20),
              ('aws', 28),
              ('machine learning', 17),
              ('sas', 15),
              ('soc2', 14),
              ('power bi', 13),
              ('java', 12),
              ('pytorch', 10),
              ('apis', 10),
              ('databricks', 9),
              ('apache spark', 9),
              ('circle', 9),
              ('generative ai', 8),
              ('hadoop', 7),
              ('cloud computing', 7),
              ('nlp', 6),
              ('jira', 6),
              ('iot', 6),
              ('statistics', 6),
              ('llms and fms', 5),
              ('macros', 5),
              ('artificial intelligence', 5),
              ('ai', 5),
              ('automated tools', 5),
              ('pandas'