In [1]:
import ast
import os
import pickle
import sys
from collections import Counter, defaultdict
from itertools import chain

import numpy as np
import pandas as pd

# Step 1: Load job transition data

Each line of the file is one job transition record.

In [3]:
# Read the job transition table; the file is ';'-separated.
df_transition = pd.read_csv(
    '../data/cb12/processed/job_transition_MinorGroup200.csv',
    sep=';',
)

# Step 2: Create JobTitle file

In [4]:
# Flatten every user's employment history into one row per job title.
# Each employment item is indexed positionally:
#   item[0] -> DetailedOccupation, item[1] -> MinorGroup,
#   item[2] -> MajorGroup,         item[3] -> JobTitle
list_UserId = []
list_JobTitle = []
list_DetailedOccupation = []
list_MinorGroup = []
list_MajorGroup = []

# Iterate over just the two columns that are needed instead of materializing a
# Series per row with iterrows().
for UserID, raw_items in zip(df_transition['UserID'],
                             df_transition['Combined_EmploymentItems']):
    # The column holds the string form of a Python literal. Parse it with
    # ast.literal_eval rather than eval so that the contents of a CSV cell can
    # never be executed as code.
    for item in ast.literal_eval(raw_items):
        list_UserId.append(UserID)
        list_DetailedOccupation.append(item[0])
        list_MinorGroup.append(item[1])
        list_MajorGroup.append(item[2])
        list_JobTitle.append(item[3])

df_title = pd.DataFrame({"UserId": list_UserId,
                         "JobTitle": list_JobTitle,
                         "DetailedOccupation": list_DetailedOccupation,
                         "MinorGroup": list_MinorGroup,
                         "MajorGroup": list_MajorGroup})
# The index is named so that it is written out as the 'Index' column and can be
# used as a merge key later in the notebook.
df_title.index.name = 'Index'

print(df_title.shape)

# Persist the table both as ';'-separated CSV and as a pickle.
df_title.to_csv('../data/cb12/processed/job_title_raw.csv', sep=';')
# The context manager guarantees the pickle file is flushed and closed.
with open('../data/cb12/processed/job_title_raw.pkl', 'wb') as f_out:
    pickle.dump(df_title, f_out)

(18088, 5)


# Step 3: Tokenize job title

Use your own tokenizer, or directly load the pre-tokenized file `job_title_raw.nlp.token.pkl`.

In [5]:
# Load the already-tokenized job titles.
# NOTE(review): pickle.load can execute code embedded in the file -- only load
# pickles that come from a trusted source.
with open('../data/cb12/processed/job_title_raw.nlp.token.pkl', 'rb') as token_file:
    df_title_nlp = pickle.load(token_file)

In [6]:
# Every distinct whitespace-separated token that occurs in the tokenized titles.
vocab = set(chain.from_iterable(df_title_nlp['JobTitle_token'].str.split()))
print('Vocab size: {}'.format(len(vocab)))
# Number the tokens in sorted order: enumerating a set of strings directly gives
# an order that can differ between kernel sessions, which would make the ids
# irreproducible. This full-vocabulary mapping is provisional; the
# "Vocab dictionary" cell further down rebuilds word_to_id from the
# frequency-filtered tokens.
word_to_id = {token: idx for idx, token in enumerate(sorted(vocab))}

Vocab size: 3619


In [7]:
# Token -> occurrence count over all tokenized titles, ordered from most to
# least frequent (the order produced by Counter.most_common()).
freqs = dict(
    Counter(
        token
        for title_tokens in df_title_nlp['JobTitle_token'].str.split()
        for token in title_tokens
    ).most_common()
)
# Tokens that occur exactly once ...
freqs_1 = {token: count for token, count in freqs.items() if count == 1}
# ... and tokens that occur at least twice.
freqs_ = {token: count for token, count in freqs.items() if count > 1}

In [8]:
# Fraction of distinct tokens that occur exactly once.
len(freqs_1) / len(freqs)

0.5355070461453441

### Vocab dictionary

In [9]:
# Tokens kept in freqs_ are numbered from 1, in freqs_ iteration order.
word_to_id = {token: token_id for token_id, token in enumerate(freqs_, start=1)}
print(len(word_to_id))
# Id 0 is assigned to the "UNKNOWN" entry, which Word2Id uses for any token
# missing from the dictionary.
word_to_id["UNKNOWN"] = 0
print(len(word_to_id))

1681
1682


In [10]:
# Attach the tokenized titles to the raw title table, matching on 'Index'
# (the name given to df_title's index when it was built).
# NOTE(review): this rebinds df_title, so the cell is not idempotent -- re-run
# the notebook from the top rather than re-executing this cell on its own.
df_title = pd.merge(df_title, df_title_nlp, on='Index')

In [11]:
def Word2Id(title):
    """Convert a tokenized job title into a list of vocabulary ids.

    The title is split on any run of whitespace, which is the same rule used
    to build the vocabulary (``.str.split()``), so repeated spaces, tabs or
    leading/trailing blanks never produce empty tokens.

    Parameters
    ----------
    title : str
        Whitespace-separated tokens of one job title.

    Returns
    -------
    list of int
        One id per token, looked up in the module-level ``word_to_id``;
        tokens that are not in the dictionary get the id of ``"UNKNOWN"``.
        A title with no tokens yields an empty list.
    """
    unknown_id = word_to_id["UNKNOWN"]
    return [word_to_id.get(token, unknown_id) for token in title.split()]

In [12]:
# Encode every tokenized title as its list of vocabulary ids.
df_title['JobTitle_token_Id'] = df_title['JobTitle_token'].apply(Word2Id)
# Discard any row that still has a missing value in some column.
df_title = df_title.dropna(how='any')

In [13]:
# Distinct ids that actually occur in the encoded titles.
vocab_lemma = {
    token_id
    for id_list in df_title['JobTitle_token_Id']
    for token_id in id_list
}
print('Vocab size: {}'.format(len(vocab_lemma)))

Vocab size: 1682


In [14]:
# Write the tokenized and id-encoded title table as ';'-separated CSV.
df_title.to_csv(
    '../data/cb12/processed/df_title_MinorGroup200_tokenized.csv',
    sep=';',
)

# Step 4: Generate tags

In [19]:
df_gazetteer = pd.read_csv('../data/IPOD-master/gazetteer.csv')
print(df_gazetteer.shape)
# Column 'A' is the most frequent value among A1, A2 and A3 for each row.
# max() returns the first maximal element, so when all three values differ the
# value from A1 is used.
df_gazetteer['A'] = [
    max(votes, key=votes.count)
    for votes in (
        list(row[['A1', 'A2', 'A3']]) for _, row in df_gazetteer.iterrows()
    )
]
print(df_gazetteer.shape)

(1500, 4)
(1500, 5)


### Of the top-200 tokens, 197 are in IPOD

In [30]:
# The 203 most frequent title tokens, as (token, count) pairs.
token_lists = df_title_nlp['JobTitle_token'].str.split()
Tags = Counter(chain.from_iterable(token_lists)).most_common(203)

In [31]:
# Keep only the frequent tokens that also appear as a Title in the gazetteer.
# The titles are collected once into a set, so each membership test is a hash
# lookup instead of a rescan of the whole column for every candidate token.
# NOTE: this cell expects Tags to hold (token, count) pairs, i.e. it must run
# exactly once after the cell that builds Tags with most_common().
gazetteer_titles = set(df_gazetteer.Title)
Tags = [token for token, _count in Tags if token in gazetteer_titles]

In [35]:
# Write one tag per line; the number of tags is part of the file name.
with open('../data/cb12/processed/Tags_MinorGroup200' + '_' + str(len(Tags)) + '.txt', 'w') as tags_file:
    tags_file.writelines(tag + '\n' for tag in Tags)