### Basic setup
  
- Imports
- Basic skill map
- Features

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import ast
import pickle
import sys
from tqdm.auto import tqdm
import os
tqdm.pandas()
from pathlib import Path
import requests
import json
import config

import time


warnings.filterwarnings("ignore")

In [2]:
def get_challenges():
    """
    Build the skill lookup tables from BigQuery.

    The query collects every distinct ``skill_id`` found in
    ``raw.tpm_developer_skill`` or ``raw.ms2_job_skill`` and left-joins it to
    its challenges (``raw.ms2_skill_challenges``) and its display name
    (``raw.base_all_skills_v4``), aggregating the distinct non-null challenge
    ids per skill.

    Returns:
        tuple: ``(challenge_map, name_map)`` where

        * ``challenge_map`` maps ``skill_id`` -> the ``challenge_ids`` value
          of that skill's row (array-like, as returned by ``read_gbq``),
        * ``name_map`` maps ``skill_id`` -> ``skill_name`` (missing when the
          skill has no row in ``base_all_skills_v4``, because of the LEFT JOIN).
    """
    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """

        SELECT
            mjs.skill_id,
            bas.skill_name,
            ARRAY_AGG(DISTINCT msc.challenge_id IGNORE NULLS) AS challenge_ids,
        FROM
            (
            SELECT DISTINCT skill_id FROM `turing-230020.raw.tpm_developer_skill` 
            UNION DISTINCT 
            SELECT DISTINCT skill_id FROM `turing-230020.raw.ms2_job_skill` 
            )mjs
        LEFT JOIN
            `raw.ms2_skill_challenges` msc
        ON
            mjs.skill_id = msc.skill_id
        LEFT JOIN
            `raw.base_all_skills_v4` bas
        ON
            mjs.skill_id = bas.id
        GROUP BY
        1,2


    """
    skill_det = pd.io.gbq.read_gbq(query, project_id='turing-230020')

    # A skill_id can in principle appear more than once (GROUP BY is on id AND
    # name). Keep the first row per id, which is what the previous per-skill
    # `.values[0]` lookup selected, but do it in one pass instead of
    # re-filtering the whole frame for every skill.
    first_rows = skill_det.drop_duplicates(subset='skill_id', keep='first')
    skill_ids = first_rows.skill_id.values

    challenge_map = dict(zip(skill_ids, first_rows.challenge_ids.values))
    name_map = dict(zip(skill_ids, first_rows.skill_name.values))

    return challenge_map, name_map

In [3]:
# Load the skill_id -> challenge ids and skill_id -> skill name lookups (runs one BigQuery query).
challenge_map, name_map = get_challenges()

### Active Jobs

### Skills of Active Jobs

In [4]:
# Must-have skills per forecasted job.
# Each job's pipe-separated Skill_Tuple is split into skill names (two known
# spelling variants are normalised), mapped to skill ids via base_all_skills_v4,
# and aggregated per job by `matchingmetrics`.structurize_skills into a JSON
# string holding a list of lists of skill ids.
forecasted_job_skills_sql = """


WITH forcasted_tuples AS 
(  
  SELECT 
    job_id, skill_name, id AS skill_id
  FROM
  (
    SELECT 
    job_id, 
    CASE 
      WHEN (skill_name = ' Laravel') THEN 'Laravel'
      WHEN (skill_name = 'Android: Kotlin') THEN 'Android/Kotlin'
      ELSE skill_name
    END AS skill_name
    FROM `turing-230020.product_ds_supply.forecasted_tuples`, UNNEST(SPLIT(Skill_Tuple, '|'))skill_name 
  ) LEFT JOIN `turing-230020.raw.base_all_skills_v4` USING(skill_name)
)


  SELECT
    job_id,
    `matchingmetrics`.structurize_skills(TO_JSON_STRING(ARRAY_AGG(DISTINCT skill_id
        ORDER BY
          skill_id))) AS skills
  FROM
    forcasted_tuples
  GROUP BY
    job_id 
"""

job_skills = pd.io.gbq.read_gbq(
    forecasted_job_skills_sql,
    project_id='turing-230020',
)
job_skills


Unnamed: 0,job_id,skills
0,13009,"[[93],[114],[165]]"
1,14030,"[[120],[125],[158]]"
2,13833,"[[65],[114],[557],[558]]"
3,14448,"[[120],[387],[2031]]"
4,11698,[[2117]]
...,...,...
192,11487,[[2031]]
193,13580,[[268]]
194,12885,[[1938]]
195,11007,[[433]]


In [5]:
# Skill-group expansion table: a job skill id that appears as a key is replaced
# by the listed group of ids; ids that are not keys stay as a one-element group.
# NOTE(review): 1938 is the only key whose group does not contain the key
# itself - confirm that dropping 1938 from its own group is intended.
skill_id_dict = {
    108: [108, 97, 448],
    567: [567, 2068],
    568: [568, 277],
    65: [65, 221],
    71: [1157, 71],
    257: [257, 443, 444, 467, 762, 189, 2140],
    70: [70, 60],
    351: [351, 394, 2084],
    541: [541, 204],
    1547: [1547, 460, 2114],
    1598: [1598, 483, 1571, 41],
    686: [686, 348, 347],
    25: [25, 2050],
    554: [554, 555],
    2096: [2096, 26],
    387: [387, 308],
    264: [264, 1408],
    358: [358, 1928, 2087],
    1821: [1821, 2061],
    114: [114, 86, 113, 2097],
    1938: [1696, 1123, 2150, 614],
    1025: [1025, 1389],
    127: [127, 162, 327],
    301: [301, 1991],
    1315: [1315, 464, 1675, 1676, 1258],
    2: [2, 3],
    2020: [2020, 401, 680],
    400: [400, 2032],
    223: [223, 1647],
    20: [20, 710, 1466],
    174: [174, 128],
    258: [258, 73, 1250],
    1286: [1286, 33, 1063, 812],
    1420: [1420, 1470],
    433: [433, 2133, 1827],
    449: [449, 1465],
    29: [29, 1688],
    166: [166, 173],
    107: [107, 309],
    125: [125, 1615, 111, 1059, 946, 328],
    2055: [2055, 425],
    93: [93, 598],
    2094: [2094, 256, 678, 2091, 2092, 1349, 1964, 397, 1300],
    2036: [2036, 1873, 1397, 2154, 1855],
    1939: [1939, 706, 1822, 153, 707, 1830],
}

# Keep the raw string returned by BigQuery before `skills` is overwritten.
job_skills['original_skills'] = job_skills['skills']


def _expand_skill_groups(raw_skills):
    """Turn a string such as "[[93],[114]]" into a list of expanded id groups.

    The string is cut on commas, brackets are stripped from every piece,
    'null' pieces are skipped, and each remaining id is swapped for its group
    from ``skill_id_dict`` (or wrapped in a one-element list when it has none).
    """
    groups = []
    for piece in raw_skills.split(','):
        token = piece.strip('][')
        if token == 'null':
            continue
        skill_id = int(token)
        groups.append(skill_id_dict.get(skill_id, [skill_id]))
    return groups


skills = [_expand_skill_groups(raw) for raw in job_skills['skills']]

job_skills['skills'] = skills

# Stored as text here; the following cell parses it back with ast.literal_eval.
job_skills['skills'] = job_skills['skills'].astype(str)
job_skills

Unnamed: 0,job_id,skills,original_skills
0,13009,"[[93, 598], [114, 86, 113, 2097], [165]]","[[93],[114],[165]]"
1,14030,"[[120], [125, 1615, 111, 1059, 946, 328], [158]]","[[120],[125],[158]]"
2,13833,"[[65, 221], [114, 86, 113, 2097], [557], [558]]","[[65],[114],[557],[558]]"
3,14448,"[[120], [387, 308], [2031]]","[[120],[387],[2031]]"
4,11698,[[2117]],[[2117]]
...,...,...,...
192,11487,[[2031]],[[2031]]
193,13580,[[268]],[[268]]
194,12885,"[[1696, 1123, 2150, 614]]",[[1938]]
195,11007,"[[433, 2133, 1827]]",[[433]]


In [6]:
def _to_skill_payload(groups):
    """Return the groups in sorted order, each id as a {"skillId", "keyword"} dict."""
    return [
        [{"skillId": skill_id, 'keyword': name_map[skill_id]} for skill_id in group]
        for group in sorted(groups)
    ]


def _flatten_keywords(groups):
    """Return the skill names of every id in every group, in the stored order."""
    return [name_map[skill_id] for group in groups for skill_id in group]


# Parse the stringified skill groups back into nested lists of ints.
job_skills['skills'] = job_skills['skills'].map(ast.literal_eval)
job_skills['num_skills'] = job_skills['skills'].map(len)
job_skills['g_skills'] = job_skills['skills'].apply(_to_skill_payload)
job_skills['keywords'] = job_skills['skills'].apply(_flatten_keywords)
# String form of the payload groups.
job_skills['skill_comb'] = job_skills['g_skills'].apply(str)

# job_id -> expanded skill groups.
job_skills_dict = dict(zip(job_skills['job_id'], job_skills['skills']))
job_skills



Unnamed: 0,job_id,skills,original_skills,num_skills,g_skills,keywords,skill_comb
0,13009,"[[93, 598], [114, 86, 113, 2097], [165]]","[[93],[114],[165]]",3,"[[{'skillId': 93, 'keyword': 'JavaScript'}, {'...","[JavaScript, Javascript ES6, SQL, PostgreSQL, ...","[[{'skillId': 93, 'keyword': 'JavaScript'}, {'..."
1,14030,"[[120], [125, 1615, 111, 1059, 946, 328], [158]]","[[120],[125],[158]]",3,"[[{'skillId': 120, 'keyword': 'Node.js'}], [{'...","[Node.js, API Design, API Integrations , REST...","[[{'skillId': 120, 'keyword': 'Node.js'}], [{'..."
2,13833,"[[65, 221], [114, 86, 113, 2097], [557], [558]]","[[65],[114],[557],[558]]",4,"[[{'skillId': 65, 'keyword': 'Apache Spark'}, ...","[Apache Spark, Spark, SQL, PostgreSQL, MySQL, ...","[[{'skillId': 65, 'keyword': 'Apache Spark'}, ..."
3,14448,"[[120], [387, 308], [2031]]","[[120],[387],[2031]]",3,"[[{'skillId': 120, 'keyword': 'Node.js'}], [{'...","[Node.js, HTML, HTML5, React]","[[{'skillId': 120, 'keyword': 'Node.js'}], [{'..."
4,11698,[[2117]],[[2117]],1,"[[{'skillId': 2117, 'keyword': 'Shopify'}]]",[Shopify],"[[{'skillId': 2117, 'keyword': 'Shopify'}]]"
...,...,...,...,...,...,...,...
192,11487,[[2031]],[[2031]],1,"[[{'skillId': 2031, 'keyword': 'React'}]]",[React],"[[{'skillId': 2031, 'keyword': 'React'}]]"
193,13580,[[268]],[[268]],1,"[[{'skillId': 268, 'keyword': 'Machine Learnin...",[Machine Learning],"[[{'skillId': 268, 'keyword': 'Machine Learnin..."
194,12885,"[[1696, 1123, 2150, 614]]",[[1938]],1,"[[{'skillId': 1696, 'keyword': 'User-Centered ...","[User-Centered Design, UI Design, webflow, Figma]","[[{'skillId': 1696, 'keyword': 'User-Centered ..."
195,11007,"[[433, 2133, 1827]]",[[433]],1,"[[{'skillId': 433, 'keyword': 'AWS'}, {'skillI...","[AWS, AWS Operations, AWS Administration]","[[{'skillId': 433, 'keyword': 'AWS'}, {'skillI..."


### Must-Have Skill Retrieval

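This section replicates the DE pipeline's must-have skill retrieval: for each job, the matching API is called with that job's must-have skill groups and keywords.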

In [7]:
# Query the matching API once per job and collect the returned developers.
#
# Produces `results`: one row per (job, developer) with at least the columns
# job_id, userId and score. Jobs whose response cannot be parsed (for example a
# 500 without a 'developers' key) are logged and skipped.
tqdm.monitor_interval = 0

RESULT_COLUMNS = ['job_id', 'userId', 'score']

# Read the request template once (and close the file) instead of re-opening it
# for every job; each job parses its own fresh copy below so that edits to the
# nested payload never leak from one request into the next.
with open('api_payload_current_retrieval.json', 'r') as payload_file:
    payload_template = payload_file.read()

# Content-Length is not set manually: requests computes it from the body.
headers = {
    'Content-Type': 'application/json',
    'authorization': config.bearer_token,
}

result_frames = []
resp = None  # last response, handy for inspection in the notebook after the loop
# One session for the whole run so the underlying connection can be reused.
with requests.Session() as s:
    for _, job in tqdm(job_skills.iterrows(), total=len(job_skills)):
        payload = json.loads(payload_template)
        payload['skillKeywordSearch']['mustHave'] = job['g_skills']
        payload['searchAnywhereParameters']['mustHaveWords'] = job['keywords']
        payload['jobId'] = job['job_id']

        resp = s.post(config.url, data=json.dumps(payload), headers=headers)
        try:
            resp_df = pd.DataFrame(resp.json()['developers'])
            resp_df['job_id'] = job['job_id']
        except Exception as e:
            # Best effort: report the failing job and carry on with the rest.
            print(e)
            print(f"job: {job['job_id']} : Error - {resp.status_code}: {resp.text}")
            continue
        result_frames.append(resp_df)
        print(f"job: {job['job_id']} : Done")

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call); concatenate all per-job frames once instead.
if result_frames:
    results = pd.concat(result_frames)
    extra_columns = [c for c in results.columns if c not in RESULT_COLUMNS]
    # Same leading column order as before; absent columns are filled with NaN.
    results = results.reindex(columns=RESULT_COLUMNS + extra_columns)
else:
    results = pd.DataFrame({column: [] for column in RESULT_COLUMNS})
results.shape


  0%|          | 0/197 [00:00<?, ?it/s]

job: 13009 : Done
job: 14030 : Done
job: 13833 : Done
job: 14448 : Done
job: 11698 : Done
job: 10507 : Done
job: 11373 : Done
'developers'
job: 10626 : Error - 500: {"statusCode":500,"timestamp":"2023-06-27T20:29:57.048Z","path":"/api/search/developer/matching/","method":"POST","message":"[{\"rankingUuid\":\"a1086879-5d00-4f09-a989-28f79cf55994\",\"message\":\"4 DEADLINE_EXCEEDED: Took too long to respond when processing endpoint_id: 222140931229351936, deployed_model_id: 6722363514687586304\",\"grpc_code\":4},{\"rankingUuid\":\"89c02189-d507-4ece-9108-fd7a6db195a4\",\"message\":\"13 INTERNAL: {\\\"detail\\\":\\\"Feature Service Error\\\"}\",\"grpc_code\":13},{\"rankingUuid\":\"79f84306-9bfb-4f0f-8255-79eccdd26142\",\"message\":\"4 DEADLINE_EXCEEDED: Took too long to respond when processing endpoint_id: 222140931229351936, deployed_model_id: 6722363514687586304\",\"grpc_code\":4},{\"rankingUuid\":\"c081732a-e160-4dfd-a001-ef687913ddee\",\"message\":\"13 INTERNAL: {\\\"detail\\\":\\\"Fe

(1127435, 3)

### Merge conditions, add miscellaneous conditions

In [8]:
# Stamp every row with today's local date (strftime defaults to localtime()).
results['Extraction_Date'] = time.strftime("%Y-%m-%d")

# Upload frame: selected columns only, renamed, integer ids, tagged as forecasted.
export_columns = ['Extraction_Date', 'job_id', 'userId', 'score']
results_f = (
    results.reset_index()[export_columns]
    .rename(columns={'userId': 'dev_id', 'score': 'v9b_score'})
    .astype({'job_id': int, 'dev_id': int})
)
results_f['type'] = 'Forecasted'



In [9]:
import pandas_gbq

# Destination of the upload: <project_id>.<dataset_id>.<table_id>.
project_id = 'turing-230020'
dataset_id = 'product_ds_supply'
table_id = 'forecasted_tuples_supply'
destination_table = f'{dataset_id}.{table_id}'

# if_exists='append' adds the rows to the existing table, so running this cell
# again uploads the same rows a second time.
pandas_gbq.to_gbq(
    results_f,
    destination_table,
    project_id=project_id,
    if_exists='append',
)

100%|██████████| 1/1 [00:00<00:00, 12520.31it/s]
