In [2]:
import json
import os
from collections import Counter, defaultdict, OrderedDict
import copy
from datetime import datetime

In [53]:
def order_terms(counter):
    """Rank the techs of every search term from most to least frequent.

    Args:
        counter: Mapping of term -> mapping of tech -> count.

    Returns:
        A dict with the same terms, each mapped to an ``OrderedDict`` of
        ``tech -> count`` sorted by count in descending order. Techs with
        equal counts keep the relative order they had in the input.
    """
    ranked = {}
    for term, tech_counts in counter.items():
        pairs = list(tech_counts.items())
        # Counts are compared as floats so numeric strings would sort too.
        pairs.sort(key=lambda pair: float(pair[1]), reverse=True)
        ranked[term] = OrderedDict(pairs)
    return ranked

In [58]:
def get_terms(filelist, folder_path='data'):
    """Count how often each tech appears for each search term.

    Every file is read as JSON. Top-level keys starting with ``"metadata"``
    are skipped; every other entry is expected to provide ``'terms'`` (an
    iterable of term names) and ``'techs'`` (a list whose first element is a
    newline-separated string of tech names).

    Args:
        filelist: File names to read, relative to ``folder_path``.
        folder_path: Directory that contains the files. Defaults to ``'data'``.

    Returns:
        ``defaultdict(Counter)`` mapping term -> Counter of cleaned tech -> count.

    Raises:
        KeyError: If a non-metadata entry with terms lacks ``'techs'``, or any
            non-metadata entry lacks ``'terms'``.
    """
    counts = defaultdict(Counter)
    for file in filelist:
        with open(os.path.join(folder_path, file)) as f:
            data = json.load(f)

        for key, entry in data.items():
            if key.startswith("metadata"):
                continue

            terms = entry['terms']
            if not terms:
                continue

            try:
                raw_techs = entry['techs'][0].split("\n")
            except IndexError:
                # The entry has an empty techs list; nothing to count.
                continue

            # Drop hyphens first and strip afterwards so that bullet-style
            # lines such as "- sql" do not keep a leading space.
            cleaned_techs = [tech.replace("-", "").strip() for tech in raw_techs]

            for term in terms:
                for clean_tech in cleaned_techs:
                    if not clean_tech:
                        # Blank lines are not techs.
                        continue
                    # Counter treats missing keys as 0, so always increment the
                    # cleaned name. Checking membership with the raw name reset
                    # the running count to 1 whenever raw and cleaned differed.
                    counts[term][clean_tech] += 1

    return counts
        

In [59]:
def get_filelist(start_date, end_date, folder_path='data'):
    """List the ``p-raw`` files whose embedded date lies in a date window.

    Args:
        start_date: First day of the window, formatted ``dd-mm-yy``.
        end_date: Last day of the window (inclusive), formatted ``dd-mm-yy``.
        folder_path: Directory to scan. Defaults to ``'data'``.

    Returns:
        File names (not full paths) in the order ``os.listdir`` yields them.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``dd-mm-yy``.
    """
    date_format = "%d-%m-%y"
    window_start = datetime.strptime(start_date, date_format)
    window_end = datetime.strptime(end_date, date_format)

    matches = []
    for name in os.listdir(folder_path):
        if not name.startswith('p-raw'):
            continue
        try:
            # The date is read from characters 11..18 of the file name.
            stamp = datetime.strptime(name[11:19], date_format)
        except ValueError:
            # File names without a parsable date at that position are ignored.
            continue
        if window_start <= stamp <= window_end:
            matches.append(name)

    return matches

In [60]:
# Select the p-raw files dated 11-17 Sep 2023 (dates are dd-mm-yy), count the
# techs per search term, then rank each term's techs by frequency.
filelist = get_filelist("11-09-23", "17-09-23")
ordered = order_terms(get_terms(filelist))

In [61]:
ordered

{'data science': OrderedDict([('python', 42),
              ('sql', 21),
              ('tensorflow', 8),
              ('llms', 7),
              ('tableau', 7),
              ('macros', 5),
              ('power bi', 5),
              ('java', 5),
              ('python (numpy', 4),
              ('excel', 3),
              ('react native', 3),
              ('bigquery', 3),
              ('iot', 3),
              ('pytorch', 3),
              ('apis', 2),
              ('powerbi', 2),
              ('sql server', 2),
              ('azure devops', 2),
              ('liftmaster', 2),
              ('azure cognitive services', 2),
              ('llms and fms', 2),
              ('none', 2),
              ('aws', 2),
              ('regulatory submissions', 2),
              ('google cloud', 2),
              ('jira', 2),
              ('python programming', 2),
              ('etl', 2),
              ('crm tool', 2),
              ('envision', 2),
              ('sas', 2),
         

In [62]:
def print_top_n(ordered, n=5):
    """Print the first ``n`` (tech, count) pairs for every term.

    Args:
        ordered: Mapping of term -> ordered mapping of tech -> count, already
            sorted in the desired order (e.g. the result of ``order_terms``).
        n: Number of leading entries to show per term. Defaults to 5.
    """
    for term, tech_counts in ordered.items():
        tops = list(tech_counts.items())[:n]
        print(f"Top {n} skills for {term}: {tops} \n")

In [63]:
print_top_n(ordered)

Top 5 skills for data science: [('python', 42), ('sql', 21), ('tensorflow', 8), ('llms', 7), ('tableau', 7)] 

Top 5 skills for machine learning engineer: [('python', 33), ('tensorflow', 11), ('sql', 6), ('llms', 5), ('java', 4)] 

Top 5 skills for mlops: [('llms', 8), ('aws', 6), ('python', 5), ('azure devops', 4), ('terraform', 4)] 

Top 5 skills for data analyst: [('systems development lifecycle cycle', 68), ('none', 14), ('sql', 11), ('jira', 6), ('tableau', 6)] 

Top 5 skills for data engineer: [('python', 14), ('sql', 10), ('ansys software', 8), ('java', 6), ('python (numpy', 5)] 



In [65]:
# Entries to drop from the rankings (passed to clean_ordered_terms below).
# TODO: move this list into its own file (or an env var) so it can be read in
# and extended as needed.
techs_to_remove = ["none", "systems development lifecycle cycle", "security clearance", "devops engineer"]

def clean_ordered_terms(ordered, techs_to_remove):
    """Return a copy of ``ordered`` without the unwanted techs.

    The input is left untouched; a deep copy is pruned instead. A line is
    printed for every (term, tech) pair that gets removed.

    Args:
        ordered: Mapping of term -> mapping of tech -> count.
        techs_to_remove: Collection of tech names to drop from every term.

    Returns:
        A deep copy of ``ordered`` with the listed techs deleted.
    """
    pruned = copy.deepcopy(ordered)
    # Iterate over the original while deleting from the copy, so the mapping
    # being walked never changes size mid-iteration.
    for term, tech_counts in ordered.items():
        for tech in tech_counts:
            if tech not in techs_to_remove:
                continue
            print(f"Removing tech: {tech}")
            del pruned[term][tech]
    return pruned

In [67]:
cleaned_ord = clean_ordered_terms(ordered, techs_to_remove)

Removing tech: none
Removing tech: none
Removing tech: none
Removing tech: devops engineer
Removing tech: systems development lifecycle cycle
Removing tech: none
Removing tech: security clearance
Removing tech: none


In [68]:
print_top_n(cleaned_ord)

Top 5 skills for data science: [('python', 42), ('sql', 21), ('tensorflow', 8), ('llms', 7), ('tableau', 7)] 

Top 5 skills for machine learning engineer: [('python', 33), ('tensorflow', 11), ('sql', 6), ('llms', 5), ('java', 4)] 

Top 5 skills for mlops: [('llms', 8), ('aws', 6), ('python', 5), ('azure devops', 4), ('terraform', 4)] 

Top 5 skills for data analyst: [('sql', 11), ('jira', 6), ('tableau', 6), ('excel', 4), ('microsoft office suite', 3)] 

Top 5 skills for data engineer: [('python', 14), ('sql', 10), ('ansys software', 8), ('java', 6), ('python (numpy', 5)] 



# Next step: add the cleaned techs to each p-raw JSON file
Keep the original techs untouched and add a new key (e.g. 'cleaned_techs') that holds only the techs that survive the cleaning step.

Also want to create rules for techs that can be grouped together, e.g. 'azure cloud', 'azure synapse', etc. can all be mapped to 'azure'.
Use `re` to match the words, and be careful of partial matches (e.g. match on word boundaries).  

The grouping rules above should be added to clean_ordered_terms as well (or maybe only there, and then use those results to update the .json files; think on it).