In [37]:
import pandas as pd
import numpy as np
import os
import json
from bidict import bidict
import random
from math import ceil
import pickle
from bson.objectid import ObjectId

In [2]:
# Load the DataFrame stored in the clean_2017_11_28 pickle snapshot.
# NOTE: unpickling can execute arbitrary code; only read pickles from a trusted source.
clean_pickle_path = "/data/rali7/Tmp/solimanz/data/pickles/clean_2017_11_28.pkl"
data = pd.read_pickle(clean_pickle_path)

In [3]:
# Occurrence count of every distinct value in the `transformed` column
# (value_counts orders the result from most to least frequent).
func_counts = data["transformed"].value_counts()

# DS1: Simple Job Title Sequences

In [4]:
# Base name (without extension) of the DS1 JSON file, and the directory
# that receives every DS1 artefact.
ds1_file_name = "title_sequences"
ds1_path = "/data/rali7/Tmp/solimanz/data/datasets/1/"

In [5]:
# The first 550 entries of func_counts, i.e. the 550 most frequent titles.
top_550 = func_counts.head(550)

In [6]:
# A sequence (_id) is kept only if every one of its rows has a title in top_550:
# any _id owning at least one out-of-vocabulary row is discarded entirely.
in_vocab = data.transformed.isin(top_550.index)
bad_ids = data.loc[~in_vocab, "_id"].unique()
all_ids = data["_id"].unique()

# Filter in first-appearance order instead of using a set difference: set
# iteration order is arbitrary and can differ between kernel sessions, which
# would make the seeded index-based train/test split pick different ids per run.
bad_id_lookup = set(bad_ids)
dataset_ids = [seq_id for seq_id in all_ids if seq_id not in bad_id_lookup]

In [7]:
# Report how many sequences survived the vocabulary filter.
n_sequences = len(dataset_ids)
print(f"Number of sequences: {n_sequences}")

Number of sequences: 120330


In [8]:
# Sizes of the 80/20 train/test split.
# The test size is defined as the remainder so the two sizes always sum to
# dataset_size; taking ceil() of both fractions can overshoot the total by one.
dataset_size = len(dataset_ids)
train_size = ceil(0.8 * dataset_size)
test_size = dataset_size - train_size

In [10]:
# Fix the seed of the stdlib RNG so the index sampling below is repeatable.
random.seed(a=1234)

In [11]:
# Draw ONE random permutation of all positions and cut it into two disjoint
# slices. Sampling train and test independently from the same range lets the
# same sequence land in both sets, leaking training data into the test set.
shuffled_idx = random.sample(range(dataset_size), dataset_size)
train_idx = shuffled_idx[:train_size]
test_idx = shuffled_idx[train_size:train_size + test_size]

In [12]:
# Translate the sampled positions back into the sequence ids they point at.
train_ids = list(map(dataset_ids.__getitem__, train_idx))
test_ids = list(map(dataset_ids.__getitem__, test_idx))

Keep a record of the training and testing IDs for later experiments

In [15]:
# Persist the split ids inside the DS1 directory. The paths are derived from
# ds1_path (as the JSON dump does) rather than repeating the directory literal,
# so relocating the dataset only requires changing ds1_path.
train_ids_path = os.path.join(ds1_path, "train_ids.pkl")
test_ids_path = os.path.join(ds1_path, "test_ids.pkl")

with open(train_ids_path, "wb") as f:
    pickle.dump(train_ids, f)
with open(test_ids_path, "wb") as f:
    pickle.dump(test_ids, f)

In [16]:
# Restrict the table to rows belonging to the retained sequences.
in_dataset = data["_id"].isin(dataset_ids)
data = data.loc[in_dataset]

Create a mapping from each job title string to an integer id

In [17]:
# Distinct titles in order of first appearance; each title's id is its
# position in that array.
job_titles = data["transformed"].unique()
title_id = dict(zip(job_titles, range(len(job_titles))))

In [18]:
# One entry per _id: the list of that sequence's `transformed` titles,
# in the row order they have in `data`.
func_series = data.groupby("_id")["transformed"].apply(list)

In [43]:
# Peek at the 12th sequence by POSITION. func_series is indexed by _id, so a
# bare integer key only works through pandas' positional fallback, which is
# deprecated in recent pandas releases; .iloc states the intent explicitly.
# NOTE(review): the saved output lists titles in the opposite order to the
# job_index column of the sample rows shown further down -- confirm it is not stale.
func_series.iloc[11]

['supervisor', 'tutor', 'store manager']

In [42]:
# Show the raw rows of one sequence for a manual sanity check.
sample_id = ObjectId('52b31a960b045119318b489e')
data.loc[data["_id"] == sample_id]

Unnamed: 0,_id,duration,function,industry,job_index,mission,place,transformed
3852,52b31a960b045119318b489e,5.0,store manager,,0,,,store manager
3853,52b31a960b045119318b489e,1.0,tutor,,1,,"London, Canada Area",tutor
3854,52b31a960b045119318b489e,1.25,supervisor,,2,,,supervisor


In [52]:
def _encode_sequences(seq_ids):
    """Return, for each id in seq_ids, its title list converted to integer ids.

    Reads the notebook globals `func_series` (titles per sequence id) and
    `title_id` (title string -> integer id).
    """
    encoded = []
    for seq_id in seq_ids:
        encoded.append([title_id[title] for title in func_series[seq_id]])
    return encoded


train_data = _encode_sequences(train_ids)
test_data = _encode_sequences(test_ids)

In [55]:
# Length of the longest encoded sequence in each split.
max_train_seq = max(map(len, train_data))
max_test_seq = max(map(len, test_data))

In [56]:
# Report both maxima, one per line.
print(
    f"Maximum length of training sequences : {max_train_seq}",
    f"Maximum length of test sequences: {max_test_seq}",
    sep="\n",
)

Maximum length of training sequences : 32
Maximum length of test sequences: 19


Dump to JSON

In [57]:
# Bundle the vocabulary and both encoded splits, then write them as one JSON file.
# NOTE(review): this rebinds `data` from the DataFrame to a plain dict; any
# later cell that still expects the DataFrame under this name will break.
data = dict(
    title_to_id=title_id,
    train_data=train_data,
    test_data=test_data,
)
ds1_json_path = os.path.join(ds1_path, f"{ds1_file_name}.json")
with open(ds1_json_path, 'w') as f:
    json.dump(data, f)

# DS2: Add Start of Sequence tags to DS1 Sequences

# DS3: Job Sequences as Sequences of Bag-of-Words

In this dataset, we represent our job experience sequences as sequences of bag-of-words vectors that we will feed to the LSTM.