In [1]:
import pandas as pd
import numpy as np
from pyfasttext import FastText
from pprint import pprint
import os
import json
from bidict import bidict
import random
from math import ceil, floor
import pickle
from bson.objectid import ObjectId
from collections import Counter, defaultdict
import re
from nltk.corpus import stopwords

In [6]:
# Load the job-experience DataFrame from a pickle. The "_transformed" pickle is
# kept as a commented-out alternative; the "_reduced" pickle is what gets loaded.
# NOTE(review): later cells read both `df.transformed` and `df.reduced` —
# confirm which pickle provides which column before running them.
#df = pd.read_pickle("/data/rali7/Tmp/solimanz/data/pickles/excerpt-2017-02-20_transformed.pkl")
df = pd.read_pickle("/data/rali7/Tmp/solimanz/data/pickles/excerpt-2017-02-20_reduced.pkl")

In [3]:
# Count occurrences of each `transformed` title and keep the first 550 entries
# of the resulting (frequency-ordered) Series.
# NOTE(review): the following cell reassigns both `func_counts` and `top` from
# the `reduced` column, so these values are overwritten when cells run in order.
func_counts = df.transformed.value_counts()
top = func_counts[:550]

In [7]:
# Second cleaning method: rank the `reduced` titles by how often they occur
# and keep the 7000 highest-ranked ones as the label set.
func_counts = df["reduced"].value_counts()
top = func_counts.iloc[:7000]

In [13]:
# Discard every profile (`_id`) that has at least one title outside the
# retained `top` titles, so that all remaining sequences are fully labelled.
bad_ids = df.loc[~df.reduced.isin(top.index), "_id"].unique()
all_ids = df["_id"].unique()
# Sort the surviving ids before using them. Iterating a set of strings has no
# stable order between interpreter sessions (string hashing is randomised), so
# without sorting the seeded shuffle further down would still produce a
# different train/test split on every fresh kernel.
dataset_ids = sorted(set(all_ids) - set(bad_ids))

In [14]:
print(f"Size of entire dataset: {len(dataset_ids)}")

Size of entire dataset: 837910


In [15]:
random.seed(1234)

In [16]:
# 80/20 split over profiles.
# A dedicated generator seeded with the notebook's seed (1234) shuffles a
# *copy* of the ids, so this cell gives the same split every time it is run.
# Relying on the module-level RNG and shuffling `dataset_ids` in place made the
# result depend on how many times (and in which order) the cells were executed.
train_size = ceil(0.8 * len(dataset_ids))
shuffled_ids = list(dataset_ids)
random.Random(1234).shuffle(shuffled_ids)
train_ids = shuffled_ids[:train_size]
test_ids = shuffled_ids[train_size:]

Create a mapping between job title id and string representation

In [17]:
# Give every retained title an integer id equal to its position in `top`.
job_titles = top.index.values
title_id = dict(zip(job_titles, range(len(job_titles))))

Keep a record of the training and testing IDs for later experiments

In [18]:
# Write the two id lists next to the top7000 datasets so that later
# experiments can reuse exactly the same split.
train_ids_path = "/data/rali7/Tmp/solimanz/data/datasets/top7000/train_ids.pkl"
test_ids_path = "/data/rali7/Tmp/solimanz/data/datasets/top7000/test_ids.pkl"

with open(train_ids_path, "wb") as f:
    pickle.dump(train_ids, f)

with open(test_ids_path, "wb") as f:
    pickle.dump(test_ids, f)

# DS1: Simple Job Titles Sequences

In [23]:
ds1_path = "/data/rali7/Tmp/solimanz/data/datasets/top7000/1/"
ds1_file_name = "title_sequences"

In [24]:
df = df[df._id.isin(dataset_ids)]

In [25]:
# Collect each profile's titles into a list and flip the row order.
# NOTE(review): presumably rows are stored newest-first, so the flipped list is
# chronological — confirm against the source data.
# (Swap 'reduced' for 'transformed' to build the sequences from the other column.)
func_series = df.groupby('_id')['reduced'].apply(lambda titles: list(titles)[::-1])

In [26]:
def _encode(profile_ids):
    """Return, for each profile id, its title sequence mapped to integer ids."""
    return [[title_id[t] for t in func_series[pid]] for pid in profile_ids]


train_data = _encode(train_ids)
test_data = _encode(test_ids)

In [27]:
# Longest sequence in each split (used later as the padding length).
max_train_seq = max(map(len, train_data))
max_test_seq = max(map(len, test_data))

### Length of Longest Sequence

In [28]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 53
Maximum length of test sequences: 65


Dump to JSON

In [29]:
len(top)

7000

In [31]:
# Bundle the title vocabulary, both encoded splits and summary numbers,
# then serialise everything to a single JSON file.
# NOTE(review): the key 'n_lables' is misspelled; it is left untouched because
# whatever reads this JSON may look it up under that exact name.
longest_seq = max(max_train_seq, max_test_seq)
data = {
    'title_to_id': title_id,
    'train_data': train_data,
    'test_data': test_data,
    'maximum_seq_len': longest_seq,
    'n_lables': len(top),
}

ds1_json = os.path.join(ds1_path, f"{ds1_file_name}.json")
with open(ds1_json, 'w') as f:
    json.dump(data, f)

# DS2: Add Start of Sequence tags to DS1 Sequences

In [32]:
ds2_path = "/data/rali7/Tmp/solimanz/data/datasets/top7000/2/"
ds2_file_name = "title_sequences"

In [33]:
with open(os.path.join(ds1_path, f"{ds1_file_name}.json"), 'r') as f:
    data = json.load(f)

In [34]:
title_id = data["title_to_id"]
train_data = data["train_data"] 
test_data = data["test_data"]

In [35]:
# Reserve the next free id for the start-of-sequence marker. `setdefault`
# returns the already-assigned id if this cell is executed more than once.
start_tag = title_id.setdefault('<START>', len(title_id))

# Prepend the marker to every sequence (in place). Plain loops replace the
# list comprehensions that were evaluated only for their side effect, and the
# guard keeps a re-run from stacking a second marker on the same sequence.
for split in (train_data, test_data):
    for seq in split:
        if not seq or seq[0] != start_tag:
            seq.insert(0, start_tag)

In [36]:
max_train_seq = max([len(seq) for seq in train_data])
max_test_seq = max([len(seq) for seq in test_data])

### Length of Longest Sequence

In [37]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 54
Maximum length of test sequences: 66


Dump JSON

In [38]:
data = {
        'title_to_id': title_id,
        'train_data': train_data,
        'test_data': test_data,
        'maximum_seq_len': max(max_train_seq, max_test_seq)
    }
with open(os.path.join(ds2_path, f"{ds2_file_name}.json"), 'w') as f:
    json.dump(data, f)

# DS3: Job Sequences as Sequences of Bag-of-Words

In this dataset, we represent our job experience sequences as sequences of bag-of-words vectors that we will feed to the LSTM

In [25]:
ds3_path = "/data/rali7/Tmp/solimanz/data/datasets/3/"
ds3_file_name = "title_sequences"

In [26]:
# Reload the train/test id lists from disk.
# NOTE(review): these paths point at datasets/, whereas this notebook wrote its
# split to datasets/top7000/ — confirm which split files are intended here.
# pickle.load can execute arbitrary code; only load files you trust.
train_ids_path = "/data/rali7/Tmp/solimanz/data/datasets/train_ids.pkl"
test_ids_path = "/data/rali7/Tmp/solimanz/data/datasets/test_ids.pkl"

with open(train_ids_path, "rb") as f:
    train_ids = pickle.load(f)

with open(test_ids_path, "rb") as f:
    test_ids = pickle.load(f)

In [27]:
# `top_550` is not defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Rebuild it the same way the first counting cell
# builds its 550-title selection: the first 550 entries of `func_counts`.
top_550 = func_counts[:550]

# Split all selected titles on dashes, slashes, commas, dots, backslashes and
# whitespace, and count how often each resulting token occurs.
joined = " ".join(top_550.index.values)
tokens = re.split(r"[-/,\.\\\s]", joined)
token_counts = Counter(tokens)

In [28]:
vocab = set(token_counts.keys())

In [29]:
# Remove English stop words from the vocabulary, as well as the empty token
# that re.split emits between two adjacent delimiters.
sw = stopwords.words('english')
vocab.difference_update(sw)
vocab.discard('')

In [30]:
# Number the vocabulary in sorted order. `vocab` is a set, and enumerating a
# set of strings directly would give every kernel session a different
# token -> id assignment.
vocab_id = {tok: i for i, tok in enumerate(sorted(vocab))}

In [31]:
voc_size = len(vocab_id)
bow = {}
print(f"Vocabulary Size: {voc_size}")

Vocabulary Size: 329


In [32]:
# Represent every title as the sorted list of vocabulary ids of the tokens it
# contains; tokens that are not in the vocabulary are ignored.
title_splitter = re.compile(r"[-/,\.\\\s]")
for title in func_counts.index:
    bow[title] = sorted(
        vocab_id[tok] for tok in title_splitter.split(title) if tok in vocab_id
    )

In [33]:
df = df[df._id.isin(dataset_ids)]

In [34]:
func_series = df.groupby('_id')['transformed'].apply(lambda x: list(reversed(list(x))))

In [35]:
train_seqs = [[title for title in func_series[i]] for i in train_ids]
test_seqs = [[title for title in func_series[i]] for i in test_ids]

In [36]:
# Inputs
train_inputs = [[bow[title] for title in seq[:-1]] for seq in train_seqs] 
test_inputs = [[bow[title] for title in seq[:-1]] for seq in test_seqs]

# Targets
train_targets = [[title_id[title] for title in seq[1:]] for seq in train_seqs]
test_targets = [[title_id[title] for title in seq[1:]] for seq in test_seqs]

In [37]:
max_train_seq = max([len(seq) for seq in train_seqs])
max_test_seq = max([len(seq) for seq in test_seqs])

In [38]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 32
Maximum length of test sequences: 19


Dump to JSON

In [39]:
data = {
        'title_to_id': title_id,
        'title_to_bow': bow,
        'vocab_id': vocab_id,
        'train_inputs': train_inputs,
        'test_inputs': test_inputs,
        'train_targets': train_targets,
        'test_targets': test_targets,
        'maximum_seq_len': max(max_train_seq, max_test_seq)
    }
with open(os.path.join(ds3_path, f"{ds3_file_name}.json"), 'w') as f:
    json.dump(data, f)

# DS4: Add Job Duration to DS3 Feature Vectors

In [46]:
ds4_path = "/data/rali7/Tmp/solimanz/data/datasets/4/"
ds4_file_name = "title_sequences_durations"

In [47]:
df = df[df._id.isin(dataset_ids)]

In [48]:
func_series = df.groupby('_id')['transformed'].apply(lambda x: list(reversed(list(x))))
duration_series = df.groupby('_id')['duration'].apply(lambda x: list(reversed(list(x))))

In [49]:
func_series['52b31c980b045119318b9d64']

['director', 'instructor', 'instructor', 'substitute teacher']

In [50]:
# Inspect the durations of the profile at position 324. The Series is indexed
# by `_id`, so a bare integer key only worked through pandas' deprecated
# positional fallback; `.iloc` makes the positional lookup explicit.
duration_series.iloc[324]

[15.17, 0.0, 5.83, 11.75]

In [51]:
train_seqs = [[title for title in func_series[i]] for i in train_ids]
test_seqs = [[title for title in func_series[i]] for i in test_ids]

In [52]:
# Inputs: every position except the last one of each profile. Bag-of-words and
# durations are stored side by side; the batcher concatenates them later.
train_inputs = [[bow[t] for t in titles[:-1]] for titles in train_seqs]
train_duration = [list(duration_series[pid][:-1]) for pid in train_ids]
test_inputs = [[bow[t] for t in titles[:-1]] for titles in test_seqs]
test_duration = [list(duration_series[pid][:-1]) for pid in test_ids]

# Targets: the same sequences shifted by one position, encoded as title ids.
train_targets = [[title_id[t] for t in titles[1:]] for titles in train_seqs]
test_targets = [[title_id[t] for t in titles[1:]] for titles in test_seqs]

In [53]:
max_train_seq = max([len(seq) for seq in train_seqs])
max_test_seq = max([len(seq) for seq in test_seqs])

In [54]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 32
Maximum length of test sequences: 22


Dump to JSON

In [55]:
# Gather the mappings, inputs, durations and targets and write them as JSON.
# NOTE(review): the duration keys are inconsistent ('train_durations' vs
# 'test_duration'); they are left untouched because readers of this file may
# depend on the exact names.
longest_seq = max(max_train_seq, max_test_seq)
data = {
    'title_to_id': title_id,
    'title_to_bow': bow,
    'vocab_id': vocab_id,
    'train_inputs': train_inputs,
    'train_durations': train_duration,
    'test_inputs': test_inputs,
    'test_duration': test_duration,
    'train_targets': train_targets,
    'test_targets': test_targets,
    'maximum_seq_len': longest_seq,
}

ds4_json = os.path.join(ds4_path, f"{ds4_file_name}.json")
with open(ds4_json, 'w') as f:
    json.dump(data, f)

# DS5: Input Vectors as Word Embeddings, Multi-Label Targets (fastText)

In [56]:
ds5_path = "/data/rali7/Tmp/solimanz/data/datasets/5/"
ds5_file_name = "title_embedding_sequences_multi_label"

In [57]:
with open(os.path.join(ds1_path, f"{ds1_file_name}.json"), 'r') as f:
    data = json.load(f)

In [58]:
model = FastText("/data/rali7/Tmp/solimanz/data/wikipedia/wiki.en.bin")

In [59]:
emb_dim = 300

In [60]:
title_id = bidict(data["title_to_id"])
train_seqs = data["train_data"]
test_seqs = data["test_data"]

In [61]:
# One embedding row per title, stored at the row given by the title's id.
embeddings = np.zeros((len(title_id), emb_dim), dtype=np.float32)

for title, row in title_id.items():
    # Single-word titles use the word vector, multi-word titles the sentence vector.
    is_single_word = len(title.split(" ")) == 1
    vec = model.get_numpy_vector(title) if is_single_word else model.get_sentence_vector(title)
    embeddings[row, :] = vec

Targets will be represented as multilabel vectors

In [62]:
# Inputs: title ids of every position except the last
train_inputs = [seq[:-1] for seq in train_seqs]
test_inputs = [seq[:-1] for seq in test_seqs]

# Targets: ids of the following positions, mapped back to their title strings
id_to_title = title_id.inv
train_targets = [[id_to_title[i] for i in seq[1:]] for seq in train_seqs]
test_targets = [[id_to_title[i] for i in seq[1:]] for seq in test_seqs]

In [63]:
titles = " ".join(title_id.keys())
tokens = Counter(re.split(r"[-/,\.\\\s_]", titles))

In [64]:
tokens = set(tokens.keys())

In [65]:
# Drop English stop words plus the empty and single-space tokens from the token set.
sw = stopwords.words('english') + ['', ' ']
tokens.difference_update(sw)

In [66]:
# Number the tokens in sorted order. `tokens` is a set, and enumerating a set
# of strings directly would assign different ids in every kernel session,
# making the label indices of separately generated datasets incomparable.
token_id = {tok: i for i, tok in enumerate(sorted(tokens))}

In [67]:
print(f"Number of Tokens: {len(token_id)}")
print(f"Number of Titles: {len(title_id)}")

Number of Tokens: 326
Number of Titles: 550


In [68]:
_TITLE_DELIMS = re.compile(r"[-/,\.\\\s_]")


def _title_to_token_ids(title):
    """Return the ids of the title's tokens that are present in `token_id`."""
    return [token_id[tok] for tok in _TITLE_DELIMS.split(title) if tok in token_id]


# Replace every target title by the list of its token ids (multi-label target).
train_targets = [[_title_to_token_ids(title) for title in seq] for seq in train_targets]
test_targets = [[_title_to_token_ids(title) for title in seq] for seq in test_targets]

In [69]:
max_train_seq = max([len(seq) for seq in train_seqs])
max_test_seq = max([len(seq) for seq in test_seqs])

In [70]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 32
Maximum length of test sequences: 22


In [71]:
data = {
        'title_to_id': dict(title_id),
        'token_id': token_id,
        'train_inputs': train_inputs,
        'test_inputs': test_inputs,
        'train_targets': train_targets,
        'test_targets': test_targets,
        'maximum_seq_len': max(max_train_seq, max_test_seq),
        'emb_dim': emb_dim
    
    }
with open(os.path.join(ds5_path, f"{ds5_file_name}.json"), 'w') as f:
    json.dump(data, f)

In [72]:
np.save(os.path.join(ds5_path, "embeddings_small.npy"), embeddings)

# DS6: Multi Label Data Representation (Larger Dataset)

In [5]:
ds6_path = "/data/rali7/Tmp/solimanz/data/datasets/6/"
ds6_file_name = "big_title_embedding_sequences_multi_label"

In [6]:
with open("/data/rali7/Tmp/solimanz/data/datasets/train_ids.pkl", "rb")as f:
    train_ids = pickle.load(file=f)
with open("/data/rali7/Tmp/solimanz/data/datasets/test_ids.pkl", "rb")as f:
    test_ids = pickle.load(file=f)

In [7]:
def reduce(s, vocab=None):
    """Reduce a job title to the tokens that belong to a known vocabulary.

    The title is split on dashes, slashes, commas, dots, backslashes,
    whitespace and underscores; tokens not found in ``vocab`` are dropped and
    the remaining ones are joined with single spaces in their original order.

    Parameters
    ----------
    s : str or missing value
        The title to reduce. Missing values (``None`` / NaN) are passed through
        as NaN.
    vocab : container of str, optional
        Tokens to keep. Defaults to the notebook-level ``token_id`` mapping,
        which is looked up at call time (so it may be defined after this cell).

    Returns
    -------
    str or float
        The reduced title, or ``np.nan`` when ``s`` is missing or none of its
        tokens is in the vocabulary.
    """
    # pd.isnull is an alias of pd.isna, so a single check is enough.
    if pd.isna(s):
        return np.nan
    if vocab is None:
        vocab = token_id
    kept = [tok for tok in re.split(r"[-/,\.\\\s_]", s) if tok in vocab]
    return " ".join(kept) if kept else np.nan

In [8]:
# `top_550` is not defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Rebuild it the same way the first counting cell
# builds its 550-title selection: the first 550 entries of `func_counts`.
top_550 = func_counts[:550]

# Unique tokens appearing in the selected titles.
tokens = set(re.split(r"[-/,\.\\\s_]", " ".join(top_550.index.values)))

In [9]:
sw = stopwords.words('english')
sw.append('')
sw.append(' ')
for word in sw:
    if word in tokens:
        tokens.remove(word)

In [10]:
# Number the tokens in sorted order. `tokens` is a set, and enumerating a set
# of strings directly would assign different ids in every kernel session.
token_id = {tok: i for i, tok in enumerate(sorted(tokens))}

In [11]:
df['reduced'] = df['transformed'].apply(reduce)

In [12]:
# Profiles with at least one title that could not be reduced are unusable;
# display how many profiles remain once those are excluded.
all_ids = df["_id"].unique()
bad_ids = df.loc[df["reduced"].isna(), "_id"].unique()
len(all_ids) - len(bad_ids)

2198993

In [13]:
df = df[~df._id.isin(bad_ids)]
titles = df.reduced.unique()
dataset_ids = df._id.unique()

In [20]:
title_id = {title: i for i, title in enumerate(titles)}

In [14]:
train_set_size = ceil(0.8 * len(dataset_ids))

In [15]:
random.seed(1234)

In [16]:
# Shuffle a copy of the ids with a dedicated generator seeded with the
# notebook's seed (1234), so this cell yields the same split on every run.
# Shuffling `dataset_ids` in place with the module-level RNG made the result
# depend on whether the seed cell had just been executed and on how often this
# cell had already been run.
shuffled_ids = dataset_ids.copy()
random.Random(1234).shuffle(shuffled_ids)
train_ids = shuffled_ids[:train_set_size]
test_ids = shuffled_ids[train_set_size:]

In [17]:
with open("/data/rali7/Tmp/solimanz/data/datasets/train_ids_big.pkl", "wb")as f:
    pickle.dump(file=f, obj=train_ids)
with open("/data/rali7/Tmp/solimanz/data/datasets/test_ids_big.pkl", "wb")as f:
    pickle.dump(file=f, obj=test_ids)

In [18]:
func_series = df.groupby('_id')['reduced'].apply(lambda x: list(reversed(list(x))))

In [21]:
train_data = [[title_id[title] for title in func_series[i]] for i in train_ids]
test_data = [[title_id[title] for title in func_series[i]] for i in test_ids]

In [22]:
max_train_seq = max([len(seq) for seq in train_data])
max_test_seq = max([len(seq) for seq in test_data])

### Length of Longest Sequence

In [23]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 89
Maximum length of test sequences: 56


In [24]:
print(f"Number of Tokens: {len(token_id)}")
print(f"Number of Titles: {len(title_id)}")

Number of Tokens: 326
Number of Titles: 731408


In [27]:
del df

In [28]:
model = FastText("/data/rali7/Tmp/solimanz/data/wikipedia/wiki.en.bin")

In [29]:
emb_dim = 300

In [30]:
# One sentence-vector embedding per reduced title, stored at the title's id.
embeddings = np.zeros((len(title_id), emb_dim), dtype=np.float32)

for title, row in title_id.items():
    embeddings[row, :] = model.get_sentence_vector(title)

Targets will be represented as multilabel vectors

In [32]:
title_id = bidict(title_id)

In [33]:
# Inputs
train_inputs = [[title for title in seq[:-1]] for seq in train_data] 
test_inputs = [[title for title in seq[:-1]] for seq in test_data]

# Targets
train_targets = [[title_id.inv[title] for title in seq[1:]] for seq in train_data]
test_targets = [[title_id.inv[title] for title in seq[1:]] for seq in test_data]

In [34]:
train_targets = [[[token_id[tok] for tok in re.split(r"[-/,\.\\\s_]", title) if tok in token_id] 
  for title in seq] for seq in train_targets]
test_targets = [[[token_id[tok] for tok in re.split(r"[-/,\.\\\s_]", title) if tok in token_id] 
  for title in seq] for seq in test_targets]

In [36]:
data = {
        'title_to_id': dict(title_id),
        'token_id': token_id,
        'train_inputs': train_inputs,
        'test_inputs': test_inputs,
        'train_targets': train_targets,
        'test_targets': test_targets,
        'maximum_seq_len': max(max_train_seq, max_test_seq),
        'emb_dim': emb_dim
    
    }
with open(os.path.join(ds6_path, f"{ds6_file_name}.json"), 'w') as f:
    json.dump(data, f)

In [37]:
np.save(os.path.join(ds6_path, "embeddings_big.npy"), embeddings)

In [43]:
# Show ten randomly chosen title sequences as a sanity check.
random.sample(func_series.tolist(), 10)

[['co owner', 'co owner', 'owner'],
 ['associate', 'partner', 'partner'],
 ['analyst',
  'information technology consultant',
  'project manager',
  'program manager',
  'director',
  'director',
  'director'],
 ['educator',
  'project coordinator',
  'associate project manager',
  'project manager'],
 ['investment representative',
  'operations associate',
  'financial service representative'],
 ['sales associate', 'sales associate', 'financial services associate'],
 ['field operator', 'field sales', 'field supervisor'],
 ['security guard',
  'sales associate',
  'receptionist',
  'security',
  'security officer',
  'administrative support assistant'],
 ['facilitator', 'counsellor', 'youth coordinator', 'addictions counsellor'],
 ['music educator', 'music educator', 'educator']]