# Job List analysis

讀取 dataset 並輸出 submit.jsonl 檔案。

In [1]:
%matplotlib inline

import os, functools
import pandas as pd
import numpy as np

def getJobScore(jobno, df):
    """Look up the stored score of a single job.

    Args:
        jobno: Job number; any value accepted by ``int()`` (e.g. an int or a
            numeric string).
        df: DataFrame whose index holds integer job numbers and which has a
            ``score`` column.

    Returns:
        The value in the ``score`` column for the row labelled ``int(jobno)``.

    Raises:
        KeyError: If ``int(jobno)`` is not a label in the index of ``df``.
    """
    # Scalar access with .at reads the single cell directly. Going through
    # df.loc[label] would first materialise the whole row as a Series, which
    # is slower per lookup and upcasts the value when columns have mixed dtypes.
    return df.at[int(jobno), 'score']

def sortJobScore(job_list, score_func):
    """Return the jobs ordered from highest to lowest score.

    Args:
        job_list: Sequence of job numbers to rank.
        score_func: Callable that takes one item of ``job_list`` and returns
            its (comparable) score.

    Returns:
        A new list containing the items of ``job_list`` sorted by descending
        score. Items with equal scores keep their relative order from
        ``job_list``.
    """
    # sorted() is stable and stays stable with reverse=True, so tied jobs keep
    # the order they arrived in. Reversing an ascending argsort instead would
    # flip the order of every group of tied jobs.
    return sorted(job_list, key=score_func, reverse=True)

def convJobScore(df):
    """Compute a per-row score as the sum of the three click counters.

    Args:
        df: DataFrame with ``clickJob``, ``clickSave`` and ``clickApply``
            columns.

    Returns:
        A Series aligned with ``df`` holding
        ``clickJob + clickSave + clickApply`` for every row.
    """
    job_clicks = df['clickJob']
    save_clicks = df['clickSave']
    apply_clicks = df['clickApply']
    # Plain addition on purpose: a missing value in any counter propagates.
    return job_clicks + save_clicks + apply_clicks


#### 讀取 Meta Data

讀取先前已統計完成的各工作 action 次數。

In [2]:
# Load the per-job action statistics and key the table by job number.

df_stat = pd.read_csv('data/job-action-stat.csv').set_index('jobno')
df_stat.head()

Unnamed: 0_level_0,clickJob,clickSave,clickApply,viewJob,applyJob,saveJob,score
jobno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4371531,0,0,0,94,1,2,0
9634973,2,0,0,437,5,4,2
4863490,2,0,0,279,5,3,2
4736769,1,0,0,255,7,4,1
4736818,0,0,0,235,3,0,0


#### 計算 Score 數值

計算每個工作的 Score 值，在之後的排序使用。

> Score 值的計算方式可以再修改，以得到更好的結果。

In [10]:
# Recompute the score from the click counters and store it in the 'score'
# column of the stats table (assigned in place on the existing frame).
df_stat['score'] = convJobScore(df_stat)
df_stat.head()

Unnamed: 0_level_0,clickJob,clickSave,clickApply,viewJob,applyJob,saveJob,score
jobno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4371531,0,0,0,94,1,2,0
9634973,2,0,0,437,5,4,2
4863490,2,0,0,279,5,3,2
4736769,1,0,0,255,7,4,1
4736818,0,0,0,235,3,0,0


#### 讀取 test dataset 並使用 Job Score 計算新排序

In [4]:
# Load the test set (JSON Lines) and keep only the request id and its job list.
test_file = 'data/testset-click-small.jsonl'
df_job = pd.read_json(test_file, lines=True).filter(items=['id', 'joblist'])
df_job.head()

Unnamed: 0,id,joblist
0,1,"[10000096, 10425384, 5025420, 7874246, 1014796..."
1,2,"[10000096, 8741137, 10400774, 8606955, 1009176..."
2,3,"[10000169, 7262152, 8915983, 1054322, 1099425,..."
3,4,"[10000608, 10001090, 10092793, 10007902, 10113..."
4,5,"[10000611, 7035365, 8090325, 8492022, 5427407,..."


In [5]:
apply_func = functools.partial(sortJobScore,
                               score_func=functools.partial(getJobScore, df=df_stat))

%time df_job['joblist-score'] = df_job.joblist.apply(apply_func)


CPU times: user 2.76 s, sys: 55.7 ms, total: 2.81 s
Wall time: 2.85 s


#### 輸出資料為 submit 規定格式

In [6]:
# Keep the id and the re-ranked list, publish the list under the 'joblist'
# name, and write one JSON record per line to submit.jsonl.
df_job = (
    df_job
    .filter(items=['id', 'joblist-score'])
    .rename(columns={'joblist-score': 'joblist'})
)
df_job.to_json('submit.jsonl', orient='records', lines=True)