## Imports

In [1]:
import pandas as pd
import datetime
from datetime import timedelta
import numpy as np
import itertools
from operator import itemgetter

from google.cloud import bigquery
from statistics import median

In [2]:
def query_result(query):
    """
    Runs the query and returns the output. 
    
    Args:
        query (str): The query text
        
    Returns:
        (df): Output of the query as a dataframe
    """

    client = bigquery.Client('turing-230020')
    output = client.query(query).to_dataframe()
    client.close()
    
    return output

## Active Jobs

In [3]:
def or_skill_consolidate(df):
    or_skills = []
    for _, row in df.iterrows():
        or_skill_job = []
        or_skill_job.append(row['skill_id'])
        if pd.isna(row['or_skill_id_1']) is False: 
            or_skill_job.append(row['or_skill_id_1'])
        if pd.isna(row['or_skill_id_2']) is False: 
            or_skill_job.append(row['or_skill_id_2'])
        if pd.isna(row['or_skill_id_3']) is False: 
            or_skill_job.append(row['or_skill_id_3'])
        if pd.isna(row['or_skill_id_4']) is False: 
            or_skill_job.append(row['or_skill_id_4'])
        or_skills.append(or_skill_job)

    df['skill_id'] = or_skills
    return df

In [4]:
doing_dedup = True

skill_mapping_query = """
SELECT DISTINCT
  id AS skill_id
, skill_name
, canonical_skill_id
FROM 
  turing-230020.raw.base_all_skills_v4
"""

canonical_ids_query = """
SELECT DISTINCT
  canonical_skill_id AS skill_id
, skill_name
FROM 
  turing-230020.raw.base_all_skills_v4
WHERE
  id IN (SELECT DISTINCT canonical_skill_id FROM turing-230020.raw.base_all_skills_v4)
"""

active_jobs_query_q3 = """
SELECT DISTINCT
  JB.job_id
, JB.skill_id
, JB.or_skill_id_1
, JB.or_skill_id_2
, JB.or_skill_id_3
, JB.or_skill_id_4
FROM 
  turing-230020.supply_ops.consolidated_job_data_with_OR_skill JB
WHERE 
  -- JB.job_id NOT IN (SELECT DISTINCT job_id FROM turing-230020.curated.test_jobs_master_table WHERE job_id IS NOT NULL)
  JB.job_id IN  (
    SELECT distinct job_id FROM turing-230020.product_ds_supply.active_job_level_supply_logging
    WHERE Extraction_date Between '2023-07-01' AND '2023-09-30'

    )

"""

canonical_ids = query_result(canonical_ids_query)

skill_mapping = query_result(skill_mapping_query)
skill_mapping['canonical_skill_id'] = skill_mapping.apply(lambda x: x['skill_id'] if x['canonical_skill_id'] == 0 else x['canonical_skill_id'], axis=1)

active_jobs_q3 = query_result(active_jobs_query_q3)
active_jobs_q3 = or_skill_consolidate(active_jobs_q3).groupby(['job_id'])['skill_id'].apply(list).reset_index()




In [5]:
jobs_of_interest = active_jobs_q3.copy()
#jobs_of_interest = all_jobs.copy()



In [6]:
extra_dict = {
    'GitHub': 'Git', 
    'GitLab': 'Git', 
    'Crawlers': 'WebScrape', 
    'webflow': 'UX Design', 
    'User-Centered Design': 'UX Design', 
    'Figma': 'UX Design', 
    'Unity 3D': 'Unity', 
    'Salesforce Lightning': 'Salesforce ', 
    'Salesforce Lightning Aura Components': 'Salesforce ', 
    'Python for Data Science': 'Python', 
    'Agile': 'Project Management', 
    'Scrum': 'Project Management', 
    'Javascript ES6': 'JavaScript', 
    'Integration Testing': 'Automation Testing', 
    'Functional Testing': 'Automation Testing', 
    'API Integrations  ': 'API Design', 
    'REST/RESTful APIs': 'API Design', 
    'Web API': 'API Design', 
    'API Documentation': 'API Design', 
    'Third party APIs': 'API Design'
}

remove_list = ['Video', 'Sound and Audio', 'SAS ML data science language', 'Asset Management', 'Requirement Gathering']

skill_ids = []
for _, row in jobs_of_interest.iterrows():
    job_skill_ids = []
    skills = row['skill_id']

    for skill in skills:
        or_skill_ids = []
        for or_skill in skill:
            if doing_dedup:
                if skill_mapping[skill_mapping['skill_id'] == or_skill]['canonical_skill_id'].iloc[0] in list(canonical_ids['skill_id']):
                    skill_name = canonical_ids[canonical_ids['skill_id'] == skill_mapping[skill_mapping['skill_id'] == or_skill]['canonical_skill_id'].iloc[0]]['skill_name'].iloc[0]
                    if skill_name in extra_dict.keys():
                        skill_name = extra_dict[skill_name]
                    or_skill_ids.append(skill_name)
                    if skill_name in remove_list:
                        or_skill_ids.remove(skill_name)
                else:
                    skill_name = skill_mapping[skill_mapping['skill_id'] == or_skill]['skill_name'].iloc[0]
                    if skill_name in extra_dict.keys():
                        skill_name = extra_dict[skill_name]
                    or_skill_ids.append(skill_name)
                    if skill_name in remove_list:
                        or_skill_ids.remove(skill_name)
            else:
                skill_name = skill_mapping[skill_mapping['skill_id'] == or_skill]['skill_name'].iloc[0]
                if skill_name in extra_dict.keys():
                    skill_name = extra_dict[skill_name]
                or_skill_ids.append(skill_name)
                if skill_name in remove_list:
                    or_skill_ids.remove(skill_name)
        job_skill_ids.append(or_skill_ids)
    
    skill_ids.append(job_skill_ids)

jobs_of_interest['Skill_Tuple'] = skill_ids

In [7]:
if doing_dedup:
    new_names = []
    for _, row in jobs_of_interest.iterrows():
        skills = row['Skill_Tuple']
        skills.sort()

        or_skills_names = []
        for skill in skills:
            or_skills_names.append(list(set(skill)))
        
        or_skills_names.sort()
        list_of_or_skills = list(num for num,_ in itertools.groupby(or_skills_names))
        list_of_or_skills = [ele for ele in list_of_or_skills if ele != []]

        new_names.append(list_of_or_skills)

    jobs_of_interest['Skill_Tuple'] = new_names

In [8]:
if doing_dedup:
    similar_skill_list = [['CakePHP', 'PHP'], 
                        ['Express.js', 'JavaScript'], 
                        ['Laravel', 'PHP'],
                        ['Next.js', 'JavaScript'],
                        ['Next.js', 'React'],
                        ['React Hooks', 'React'],
                        ['Ruby on Rails', 'Ruby'],
                        ['Salesforce Lightning Aura Components', 'Salesforce Lightning'],
                        ['Spring', 'Java'],
                        ['Spring Boot', 'Java'],
                        ['Jetpack Compose', 'Kotlin (for Android)'], 
                        ['Meteor.js', 'Node.js'],
                        ['Cplex', 'Python'],
                        ['RxJava', 'Java'],
                        ['Looker', 'SQL'],
                        ['Apollo Client', 'GraphQL'],
                        ['Apollo Server', 'GraphQL'],
                        ['Apollo Server', 'Node.js'],
                        ['ASP.NET Core', 'C#'],
                        ['WooCommerce', 'PHP'],
                        ['Symfony', 'PHP'],
                        ['ASP.NET/C#', 'ASP.NET'],
                        ['PyQt5', 'Python'],
                        ['SaltStack', 'Python'],
                        ['Azure Synapse', 'SQL'],
                        ['Numpy', 'Python'],
                        ['WooCommerce', 'WordPress'],
                        ['Tensorflow', 'Python'],
                        ['PySpark', 'Python'],
                        ['SQLalchemy', 'Python'],
                        ['Flux', 'JavaScript'],
                        ['Bootstrap', 'HTML/CSS'],
                        ['Qt', 'C++'],
                        ['Celery', 'Python'],
                        ['Dart', 'Flutter'],
                        ['Tensorflow', 'Machine Learning'],
                        ['.NET Core', '.NET'],
                        ['React Hooks', 'React'],
                        ['Salesforce Lightning Aura Components', 'Salesforce'],
                        ['SSIS', 'SQL'],
                        ['WPF', 'C#'],
                        ['Amazon Redshift', 'SQL'],
                        ['AngularJS', 'JavaScript'],
                        ['SCSS', 'HTML/CSS'],
                        ['Visual C++', 'C++'],
                        ['Webpack', 'JavaScript'],
                        ['FastAPI', 'Python'],
                        ['Tailwind CSS', 'HTML/CSS'],
                        ['Deep Learning', 'Python'],
                        ['Drupal', 'PHP'],
                        ['React', 'JavaScript'],
                        ['Typescript', 'JavaScript'],
                        ['Node.js', 'JavaScript'],
                        ['React Native', 'React'],
                        ['Django', 'Python'],
                        ['Data Engineering', 'SQL'],
                        ['.NET Core', 'C#'],
                        ['Kubernetes', 'Docker'],
                        ['.NET', 'C#'],
                        ['Express.js', 'Node.js'],
                        ['React Native', 'JavaScript'],
                        ['Vue.js', 'JavaScript'],
                        ['Redux', 'JavaScript'],
                        ['Flask', 'Python'],
                        ['jQuery', 'HTML/CSS'],
                        ['JUnit', 'Java'],
                        ['jQuery', 'JavaScript'],
                        ['Nest.js', 'Node.js'],
                        ['Airflow', 'Python'],
                        ['Nest.js', 'Typescript'],
                        ['ASP.NET', 'C#'],
                        ['Azure DevOps', 'Azure'],
                        ['PyTorch', 'Python'],
                        ['Deep Learning', 'Machine Learning'],
                        ['Cypress', 'JavaScript'],
                        ['Mockito', 'Java'],
                        ['Java (for Android)', 'Android'],
                        ['Django Rest Framework', 'Django'],
                        ['WordPress', 'HTML/CSS'],
                        ['Django Rest Framework', 'Python'],
                        ['Salesforce Lightning', 'Salesforce'],
                        ['Pandas', 'Python'],
                        ['Material UI', 'React'], 
                        ['Spring Boot', 'Spring'],
                        ['React Hooks', 'JavaScript'],
                        ['Python', 'Machine Learning'],
                        ['JavaScript', 'JavaScript Frameworks'],
                        ['D3.js', 'JavaScript'],
                        ['Angular', 'Typescript'], 
                        ['MySQL', 'SQL'], 
                        ['PostgreSQL', 'SQL'], 
                        ['BigQuery', 'SQL'], 
                        ['Tableau', 'Business Intelligence (BI)'], 
                        ['Power BI', 'Business Intelligence (BI)'], 
                        ['Looker', 'Business Intelligence (BI)']]

    new_tuples = []
    for _, row in jobs_of_interest.iterrows():
        tuple_final = row['Skill_Tuple'].copy()
        
        single_tuple = []
        for or_tuple in tuple_final:
            if len(or_tuple) == 1:
                single_tuple.extend(or_tuple)

        for skill_list in similar_skill_list:
            if all(item in single_tuple for item in skill_list):
                tuple_final = [[ele for ele in sub if ele != skill_list[1]] for sub in tuple_final]
                tuple_final = [ele for ele in tuple_final if ele != []]

        new_tuples.append(tuple_final)

    jobs_of_interest['Skill_Tuple'] = new_tuples
    jobs_of_interest

## Save Active Jobs

In [9]:
temp = pd.DataFrame(columns=['job_id', 'skill_id', 'or_skill_id_1', 'or_skill_id_2', 'or_skill_id_3', 'or_skill_id_4'])

for _, row in jobs_of_interest.copy().iterrows():
    skill_tuples = row['Skill_Tuple'].copy()
    for skill_tuple in skill_tuples:
        for i in range (5 - len(skill_tuple)):
            skill_tuple.append(None)
        row1 = pd.Series([row['job_id'], skill_tuple[0], skill_tuple[1], skill_tuple[2], skill_tuple[3], skill_tuple[4]], index=temp.columns)
        temp = temp.append(row1,ignore_index=True) 

temp.to_csv('active_jobs_q3_V2.csv', index=False)

  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = temp.append(row1,ignore_index=True)
  temp = t

In [10]:
temp.head()

Unnamed: 0,job_id,skill_id,or_skill_id_1,or_skill_id_2,or_skill_id_3,or_skill_id_4
0,3,Node.js,,,,
1,3,React,,,,
2,5,React,,,,
3,6,Go,,,,
4,8,Image Recognition,,,,


In [11]:
temp['job_id'] = temp['job_id'].astype(int)
temp['skill_id'] = temp['skill_id'].astype(str)
temp.dtypes

job_id            int64
skill_id         object
or_skill_id_1    object
or_skill_id_2    object
or_skill_id_3    object
or_skill_id_4    object
dtype: object

In [12]:
import pandas_gbq
# Replace 'your_project_id', 'your_dataset_id', and 'your_table_id' with your actual values
project_id = 'turing-dev-337819'
dataset_id = 'pdsa'
table_id = 'active_jobs_q3_V2'

# Insert the DataFrame into the BigQuery table

#pandas_gbq.to_gbq(temp, f'{dataset_id}.{table_id}', project_id=project_id, if_exists='replace')

100%|██████████| 1/1 [00:00<00:00, 1868.29it/s]
