In [1]:
import os, sys

import numpy as np
import pandas as pd

from collections import Counter, defaultdict

# Step 1: Load annotated job data
Annotated by the O*Net-SOC AutoCoder from user_history.tsv

In [2]:
# Read the annotated title table; the file is semicolon-delimited.
df_title_annotated = pd.read_csv('../data/cb12/raw/title_annotated.csv', sep=';')

In [3]:
# Summarise the loaded table: overall shape, then distinct-value counts.
print(df_title_annotated.shape)
print("Unique JobTitle: ", len(df_title_annotated.JobTitle.unique()))
# This count is for the JobTitle_token_new column, so it gets its own label
# instead of repeating "Unique JobTitle" (which made the two lines ambiguous).
print("Unique JobTitle_token_new: ", len(df_title_annotated.JobTitle_token_new.unique()))
print("Unique Detailed Occupation: ", len(df_title_annotated.DetailedOccupation.unique()))

(30000, 14)
Unique JobTitle:  18616
Unique JobTitle:  17407
Unique Detailed Occupation:  682


### Remove Code='99-9999'

In [4]:
# Drop rows whose DetailedOccupation is the '99-9999' code.
# .copy() makes the result an independent frame, so adding or overwriting a
# column on it later does not raise pandas' SettingWithCopyWarning.
df_title_annotated = df_title_annotated[df_title_annotated['DetailedOccupation'] != '99-9999'].copy()

# Report what is left after the filter.
print(df_title_annotated.shape)
print("Unique JobTitle: ", len(df_title_annotated.JobTitle.unique()))
# Count for the JobTitle_token_new column, labelled as such (the label used to
# duplicate "Unique JobTitle").
print("Unique JobTitle_token_new: ", len(df_title_annotated.JobTitle_token_new.unique()))
print("Unique Detailed Occupation: ", len(df_title_annotated.DetailedOccupation.unique()))

(29127, 14)
Unique JobTitle:  18172
Unique JobTitle:  17023
Unique Detailed Occupation:  681


### Remove Score < 70

In [5]:
# Cast Score_new to float. assign() returns a new frame rather than writing
# into df_title_annotated in place; the frame is itself the result of an
# earlier boolean filter, and in-place column assignment on such a slice
# triggers pandas' SettingWithCopyWarning.
df_title_annotated = df_title_annotated.assign(
    Score_new=df_title_annotated['Score_new'].astype(float)
)

# Keep only rows with Score_new of at least 70.
df_title_annotated = df_title_annotated[df_title_annotated['Score_new'] >= 70]

# Report what is left after the filter.
print(df_title_annotated.shape)
print("Unique JobTitle: ", len(df_title_annotated.JobTitle.unique()))
# Count for the JobTitle_token_new column, labelled as such (the label used to
# duplicate "Unique JobTitle").
print("Unique JobTitle_token_new: ", len(df_title_annotated.JobTitle_token_new.unique()))
print("Unique Detailed Occupation: ", len(df_title_annotated.DetailedOccupation.unique()))

(22590, 14)
Unique JobTitle:  12908
Unique JobTitle:  11934
Unique Detailed Occupation:  563


### Remove rare MinorGroup

In [6]:
# Row count per MinorGroup value, as a plain dict {MinorGroup: count}.
dict_count_MinorGroup = df_title_annotated['MinorGroup'].value_counts().to_dict()

# Collect the MinorGroup values that occur in at least 200 rows.
MinorGroup_retain = []
for minor_group, n_rows in dict_count_MinorGroup.items():
    if n_rows >= 200:
        MinorGroup_retain.append(minor_group)

print(len(MinorGroup_retain))

32


In [8]:
# Keep only rows whose MinorGroup is in the retained list.
in_retained_minor_group = df_title_annotated['MinorGroup'].isin(MinorGroup_retain)
df_title_annotated = df_title_annotated.loc[in_retained_minor_group]

In [9]:
# Bare expression so the notebook displays the (rows, columns) tuple of the filtered table.
df_title_annotated.shape

(19312, 14)

# Step 3: Prepare transition data

### Combine all information

In [10]:
# Parallel accumulators: position i in each list belongs to the same UserID group.
all_UserID = []              # one UserID per group
all_JobTitle = []            # list of JobTitle_token_new values per group
all_DetailedOccupation = []  # list of DetailedOccupation values per group
all_MinorGroup = []          # list of MinorGroup values per group
all_MajorGroup = []          # list of MajorGroup values per group
all_items = []

# Walk the table one UserID at a time and copy each column of the group out
# as a plain Python list, keeping the rows in their in-group order.
for _, user_rows in df_title_annotated.groupby('UserID'):
    all_UserID.append(user_rows['UserID'].unique()[0])
    all_JobTitle.append(list(user_rows['JobTitle_token_new']))
    all_DetailedOccupation.append(list(user_rows['DetailedOccupation']))
    all_MinorGroup.append(list(user_rows['MinorGroup']))
    all_MajorGroup.append(list(user_rows['MajorGroup']))

In [11]:
# Pair each output column name with its per-user list, then build the frame
# (one row per UserID group).
transition_columns = ("UserID", "JobTitle", "DetailedOccupation", "MinorGroup", "MajorGroup")
transition_values = (all_UserID, all_JobTitle, all_DetailedOccupation, all_MinorGroup, all_MajorGroup)
transition = dict(zip(transition_columns, transition_values))

df_transition_raw = pd.DataFrame(transition)
print(df_transition_raw.shape)

(6131, 5)


In [12]:
def Combine_allInf(DetailedOccupations, MinorGroups, MajorGroups, JobTitles):
    """Zip four parallel sequences into a list of 4-tuples.

    Each tuple is ``(DetailedOccupation, MinorGroup, MajorGroup, JobTitle)``
    taken from the same position of the four inputs. If the inputs differ in
    length, the result stops at the shortest one.

    Returns:
        list of tuple: one tuple per position, in input order.
    """
    return list(zip(DetailedOccupations, MinorGroups, MajorGroups, JobTitles))
    
def _combine_row(row):
    """Build the combined employment-item list for one row of df_transition_raw."""
    return Combine_allInf(
        row['DetailedOccupation'],
        row['MinorGroup'],
        row['MajorGroup'],
        row['JobTitle'],
    )

# Row-wise apply: each row's four list columns become one list of tuples.
df_transition_raw['Combined_EmploymentItems'] = df_transition_raw.apply(_combine_row, axis=1)

In [13]:
# Print the (rows, columns) tuple of df_transition_raw.
print(df_transition_raw.shape)

(6131, 6)


### Remove instances with fewer than 2 valid employment items (Num_EmploymentItems < 2)

In [17]:
# Number of combined employment items per row.
employment_item_counts = df_transition_raw['Combined_EmploymentItems'].map(len)
df_transition_raw['Num_EmploymentItems'] = employment_item_counts

# Keep only rows that have more than one employment item.
has_multiple_items = df_transition_raw['Num_EmploymentItems'] > 1
df_transition_final = df_transition_raw.loc[has_multiple_items]
print(df_transition_final.shape)

(4907, 7)


In [18]:
# Write the filtered table as a semicolon-delimited CSV.
# No `index` argument is passed, so to_csv's default applies and the DataFrame
# index is written as an extra leading, unnamed column.
df_transition_final.to_csv('../data/cb12/processed/job_transition_MinorGroup200.csv', sep=';')

In [3]:
# Load the processed transition table (semicolon-delimited).
# NOTE(review): this reads from './data/...' while the save cell writes to
# '../data/...'; confirm both resolve to the same file from the notebook's
# working directory, otherwise a fresh Restart & Run All will read a different
# (or missing) file.
df_transition = pd.read_csv('./data/cb12/processed/job_transition_MinorGroup200.csv', sep=';')

In [5]:
# Bare expression so the notebook renders the reloaded table.
df_transition

Unnamed: 0,UserID,JobTitle,DetailedOccupation,MinorGroup,MajorGroup,Combined_EmploymentItems,filtered_EmploymentItems,Num_filtered_EmploymentItems
0,72,"['lecturer department anthropology', 'elderly ...","['25-1061', '31-1122', '43-6014', '27-3041', '...","['25-10', '31-10', '43-60', '27-30', '25-90', ...","['25', '31', '43', '27', '25', '39']","[('25-1061', '25-10', '25', 'lecturer departme...","[('31-1122', '31-10', '31', 'elderly caregiver...",2
1,131,"['data analyst', 'assistant business analyst']","['13-1111', '13-1161']","['13-10', '13-10']","['13', '13']","[('13-1111', '13-10', '13', 'data analyst'), (...","[('13-1111', '13-10', '13', 'data analyst'), (...",2
2,162,"['graduate assistant', 'treasurer', 'secretary...","['25-9044', '11-3031', '43-6014', '11-9179']","['25-90', '11-30', '43-60', '11-90']","['25', '11', '43', '11']","[('25-9044', '25-90', '25', 'graduate assistan...","[('11-3031', '11-30', '11', 'treasurer'), ('43...",3
3,178,"['materials manager', 'purchasing manager', 'p...","['11-3061', '11-3061', '13-1023', '13-1023']","['11-30', '11-30', '13-10', '13-10']","['11', '11', '13', '13']","[('11-3061', '11-30', '11', 'materials manager...","[('11-3061', '11-30', '11', 'materials manager...",4
4,344,"['restaurant server', 'cashier', 'cashier']","['35-3031', '41-2011', '41-2011']","['35-30', '41-20', '41-20']","['35', '41', '41']","[('35-3031', '35-30', '35', 'restaurant server...","[('35-3031', '35-30', '35', 'restaurant server...",3
...,...,...,...,...,...,...,...,...
4902,154827,"['customer service coordinator', 'customer ser...","['43-1011', '43-4051', '43-4051']","['43-10', '43-40', '43-40']","['43', '43', '43']","[('43-1011', '43-10', '43', 'customer service ...","[('43-1011', '43-10', '43', 'customer service ...",3
4903,154832,"['administrative assistant', 'inventory associ...","['43-6014', '53-7065', '41-2031']","['43-60', '53-70', '41-20']","['43', '53', '41']","[('43-6014', '43-60', '43', 'administrative as...","[('43-6014', '43-60', '43', 'administrative as...",3
4904,154862,"['accountant', 'account payable payroll specia...","['13-2011', '43-3031', '33-1012']","['13-20', '43-30', '33-10']","['13', '43', '33']","[('13-2011', '13-20', '13', 'accountant'), ('4...","[('13-2011', '13-20', '13', 'accountant'), ('4...",2
4905,154863,"['director technical operations', 'telecom net...","['27-2012', '15-1244', '13-1082']","['27-20', '15-12', '13-10']","['27', '15', '13']","[('27-2012', '27-20', '27', 'director technica...","[('15-1244', '15-12', '15', 'telecom network e...",2
