In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import os
import json
from bidict import bidict
import random
from math import ceil
import pickle
from bson.objectid import ObjectId
from collections import Counter, defaultdict
import re
from nltk.corpus import stopwords

In [2]:
df = pd.read_pickle("/data/rali7/Tmp/solimanz/data/pickles/clean_2017_11_28.pkl")

In [3]:
func_counts = df.transformed.value_counts()

In [4]:
top_550 = func_counts[:550]

In [5]:
bad_ids = df[~df.transformed.isin(top_550.index)]["_id"].unique()
all_ids = df["_id"].unique()
dataset_ids = list(set(all_ids) - set(bad_ids))

In [6]:
print(f"Size of entire dataset: {len(dataset_ids)}")

Size of entire dataset: 120371


In [7]:
dataset_size = len(dataset_ids)
train_size = ceil(0.8 * dataset_size) 
test_size = ceil(0.2 * dataset_size)

In [8]:
random.seed(1234)

In [9]:
train_idx = random.sample(range(dataset_size), train_size)
test_idx = random.sample(range(dataset_size), test_size)

In [10]:
train_ids = [dataset_ids[i] for i in train_idx] 
test_ids = [dataset_ids[i] for i in test_idx]

Create a mapping between job title id and string representation

In [7]:
job_titles = top_550.index.values
title_id = {title: i for i, title in enumerate(job_titles)}

Keep a record of the training and testing IDs for later experiments

In [11]:
with open("/data/rali7/Tmp/solimanz/data/datasets/train_ids.pkl", "wb")as f:
    pickle.dump(file=f, obj=train_ids)
with open("/data/rali7/Tmp/solimanz/data/datasets/test_ids.pkl", "wb")as f:
    pickle.dump(file=f, obj=test_ids)

# DS1: Simple Job Titles Sequences

In [13]:
ds1_path = "/data/rali7/Tmp/solimanz/data/datasets/1/"
ds1_file_name = "title_sequences"

In [14]:
df = data[data._id.isin(dataset_ids)]

In [15]:
func_series = df.groupby('_id')['transformed'].apply(lambda x: list(reversed(list(x))))

In [16]:
train_data = [[title_id[title] for title in func_series[i]] for i in train_ids]
test_data = [[title_id[title] for title in func_series[i]] for i in test_ids]

In [17]:
max_train_seq = max([len(seq) for seq in train_data])
max_test_seq = max([len(seq) for seq in test_data])

### Length of Longest Sequence

In [18]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 24
Maximum length of test sequences: 32


Dump to JSON

In [19]:
data = {
        'title_to_id': title_id,
        'train_data': train_data,
        'test_data': test_data,
        'maximum_seq_len': max(max_train_seq, max_test_seq)
    }

with open(os.path.join(ds1_path, f"{ds1_file_name}.json"), 'w') as f:
    json.dump(data, f)

# DS2: Add Start of Sequence tags to DS1 Sequences

In [34]:
ds2_path = "/data/rali7/Tmp/solimanz/data/datasets/2/"
ds2_file_name = "title_sequences"

In [35]:
with open(os.path.join(ds1_path, f"{ds1_file_name}.json"), 'r') as f:
    data = json.load(f)

In [36]:
title_id = data["title_to_id"]
train_data = data["train_data"] 
test_data = data["test_data"]

In [37]:
title_id['<START>'] = len(title_id)
start_tag = title_id['<START>']
_ =[seq.insert(0, start_tag) for seq in train_data]
_ =[seq.insert(0, start_tag) for seq in test_data]

In [38]:
max_train_seq = max([len(seq) for seq in train_data])
max_test_seq = max([len(seq) for seq in test_data])

### Length of Longest Sequence

In [39]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 25
Maximum length of test sequences: 33


Dump JSON

In [41]:
data = {
        'title_to_id': title_id,
        'train_data': train_data,
        'test_data': test_data,
        'maximum_seq_len': max(max_train_seq, max_test_seq)
    }
with open(os.path.join(ds2_path, f"{ds2_file_name}.json"), 'w') as f:
    json.dump(data, f)

# DS3: Job Sequences as Sequences of Bag-of-Words

In this dataset, we represent our job experience sequences as sequences of bag-of-words vectors that we will feed to the LSTM

In [8]:
ds3_path = "/data/rali7/Tmp/solimanz/data/datasets/3/"
ds3_file_name = "title_sequences"

In [9]:
with open("/data/rali7/Tmp/solimanz/data/datasets/train_ids.pkl", "rb")as f:
    train_ids = pickle.load(file=f)
with open("/data/rali7/Tmp/solimanz/data/datasets/test_ids.pkl", "rb")as f:
    test_ids = pickle.load(file=f)

In [10]:
joined = " ".join(top_550.index.values)
tokens = re.split(r"[-/,\.\\\s]", joined)
token_counts = Counter(tokens)

In [11]:
vocab = set(token_counts.keys())

In [12]:
sw = stopwords.words('english')

In [14]:
for word in sw:
    if word in vocab:
        vocab.remove(word)

In [16]:
if '' in vocab:
    vocab.remove('')

In [18]:
vocab_id = {l: i for i, l in enumerate(vocab)}

In [19]:
voc_size = len(vocab_id)
bow = {}
print(f"Vocabulary Size: {voc_size}")

Vocabulary Size: 329


In [20]:
for title in func_counts.index:
    tokens = re.split(r"[-/,\.\\\s]", title)
    token_indices = [vocab_id[tok] for tok in tokens if tok in vocab_id]
    bow[title] = sorted(token_indices)

In [21]:
df = df[df._id.isin(dataset_ids)]

In [22]:
func_series = df.groupby('_id')['transformed'].apply(lambda x: list(reversed(list(x))))

In [23]:
train_seqs = [[title for title in func_series[i]] for i in train_ids]
test_seqs = [[title for title in func_series[i]] for i in test_ids]

In [24]:
# Inputs
train_inputs = [[bow[title] for title in seq[:-1]] for seq in train_seqs] 
test_inputs = [[bow[title] for title in seq[:-1]] for seq in test_seqs]

# Targets
train_targets = [[title_id[title] for title in seq[1:]] for seq in train_seqs]
test_targets = [[title_id[title] for title in seq[1:]] for seq in test_seqs]

In [25]:
max_train_seq = max([len(seq) for seq in train_seqs])
max_test_seq = max([len(seq) for seq in test_seqs])

In [26]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 24
Maximum length of test sequences: 32


Dump to JSON

In [27]:
data = {
        'title_to_id': title_id,
        'title_to_bow': bow,
        'vocab_id': vocab_id,
        'train_inputs': train_inputs,
        'test_inputs': test_inputs,
        'train_targets': train_targets,
        'test_targets': test_targets,
        'maximum_seq_len': max(max_train_seq, max_test_seq)
    }
with open(os.path.join(ds3_path, f"{ds3_file_name}.json"), 'w') as f:
    json.dump(data, f)

# DS4: Add Job Duration to DS3 Feature Vectors

In [28]:
ds4_path = "/data/rali7/Tmp/solimanz/data/datasets/4/"
ds4_file_name = "title_sequences_durations"

In [29]:
df = df[df._id.isin(dataset_ids)]

AttributeError: 'dict' object has no attribute '_id'

In [30]:
func_series = df.groupby('_id')['transformed'].apply(lambda x: list(reversed(list(x))))
duration_series = df.groupby('_id')['duration'].apply(lambda x: list(reversed(list(x))))

In [31]:
func_series['52b31c980b045119318b9d64']

['director', 'instructor', 'instructor', 'substitute teacher']

In [32]:
duration_series[324]

[15.17, 0.0, 5.83, 11.75]

In [33]:
train_seqs = [[title for title in func_series[i]] for i in train_ids]
test_seqs = [[title for title in func_series[i]] for i in test_ids]

In [34]:
# Inputs we will concatenate the duration value with the corresponding bow in the batcher
train_inputs = [[bow[title] for title in seq[:-1]] for seq in train_seqs]
train_duration = [[dur for dur in duration_series[i][:-1]] for i in train_ids]
test_inputs = [[bow[title] for title in seq[:-1]] for seq in test_seqs]
test_duration = [[dur for dur in duration_series[i][:-1]] for i in test_ids]

# Targets
train_targets = [[title_id[title] for title in seq[1:]] for seq in train_seqs]
test_targets = [[title_id[title] for title in seq[1:]] for seq in test_seqs]

In [35]:
max_train_seq = max([len(seq) for seq in train_seqs])
max_test_seq = max([len(seq) for seq in test_seqs])

In [36]:
print(f"Maximum length of training sequences : {max_train_seq}\nMaximum length of test sequences: {max_test_seq}")

Maximum length of training sequences : 24
Maximum length of test sequences: 32


Dump to JSON

In [37]:
data = {
        'title_to_id': title_id,
        'title_to_bow': bow,
        'vocab_id': vocab_id,
        'train_inputs': train_inputs,
        'train_durations': train_duration,
        'test_inputs': test_inputs,
        'test_duration': test_duration,
        'train_targets': train_targets,
        'test_targets': test_targets,
        'maximum_seq_len': max(max_train_seq, max_test_seq)
    
    }
with open(os.path.join(ds4_path, f"{ds4_file_name}.json"), 'w') as f:
    json.dump(data, f)

# DS5: Input Vectors as Word Embeddings (Fasttext)

In [84]:
ds5_path = "/data/rali7/Tmp/solimanz/data/datasets/5/"
ds5_file_name = "title_embedding_sequences"

# DS6: Multi Label Data Representation (Larger Dataset)

In [34]:
ds6_path = "/data/rali7/Tmp/solimanz/data/datasets/6/"
ds6_file_name = "multilabel_title_sequences"

In [71]:
def reduce(s):
    if pd.isna(s) or pd.isnull(s):
        return np.nan
    tokens = re.split(r"[-/,\.\\\s]", s)
    token_indices = [tok for tok in tokens if tok in label_id]
    if token_indices:
        return " ".join(token_indices)
    else:
        return np.nan

In [72]:
data['reduced'] = data["transformed"].apply(reduce)

In [77]:
good = set(all_ids) - set(bad)

In [79]:
good = list(good)

In [82]:
reduced_func = data[data._id.isin(good)].reduced.value_counts()

In [88]:
top_titles = reduced_func.index.values

In [64]:
input_seqs = np.zeros((5, 5, 331), dtype=np.int32)

In [80]:
for i, bow in enumerate(train_inputs[0]):
    input_seqs[0][i][bow] = 1

In [68]:
train_inputs[0]

[[53], [125], [125]]

In [84]:
train_targets[0]

[19, 19, 19]

In [40]:
a = np.zeros((4,5), dtype=np.int32)

In [44]:
x = [4,23,9]

In [45]:
a[2, :len(x)] = x

In [46]:
a

array([[ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 4, 23,  9,  0,  0],
       [ 0,  0,  0,  0,  0]], dtype=int32)

In [48]:
x[1] = 988

In [49]:
x

[4, 988, 9]

In [50]:
a

array([[ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 4, 23,  9,  0,  0],
       [ 0,  0,  0,  0,  0]], dtype=int32)