In [43]:
import pandas as pd
import numpy as np
from pyfasttext import FastText
from collections import Counter
from pprint import pprint
import os
import json
import string
from bidict import bidict
import random
from math import ceil, floor
import pickle
from bson.objectid import ObjectId
from collections import Counter, defaultdict
import re
import ftfy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

In [2]:
%matplotlib inline
plt.style.use('ggplot')

In [6]:
# Load the pickled DataFrame that the later cells work from (they read its
# `_id`, `company_name` and `reduced` columns).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle("/data/rali7/Tmp/solimanz/data/pickles/excerpt-2017-02-20_reduced.pkl")

In [4]:
# Number of rows per company name, most frequent first.
# NOTE(review): not referenced again in the cells visible here.
companies = df.company_name.value_counts()

In [7]:
import errno
def dump(path, serializer, obj):
    """
    Saves 'obj' to 'path' using 'serializer' (either pickle or json).

    Missing parent directories of 'path' are created first.

    Parameters
    ----------
    path : str
        Destination file; may be a bare filename in the current directory.
    serializer : module
        The `pickle` or `json` module (anything exposing `dump(obj, file)`).
        The file is opened in binary mode only when the module is `pickle`.
    obj : object
        The object to serialise.
    """
    directory = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") would raise.
    if directory:
        # exist_ok covers the race where another process creates the directory.
        os.makedirs(directory, exist_ok=True)

    mode = "wb" if serializer.__name__ == "pickle" else "w"
    with open(path, mode) as f:
        # pickle.dump and json.dump both take the object first, then the file.
        serializer.dump(obj, f)

In [8]:
def get_ids(n=550, col='transformed', data=None):
    """
    Returns the `_id` values whose rows ALL carry one of the `n` most
    frequent values of column `col`.

    An `_id` is rejected as soon as one of its rows has a value outside the
    top `n` (missing values are never in the top `n`).

    Parameters
    ----------
    n : int
        How many of the most frequent values of `col` to keep.
    col : str
        Column whose values are ranked by frequency.
    data : pandas.DataFrame, optional
        Frame to work on; defaults to the notebook-level `df`.

    Returns
    -------
    list
        Accepted ids in order of first appearance in the frame. The order is
        deterministic, so a seeded shuffle downstream is reproducible.
    """
    frame = df if data is None else data
    top_values = frame[col].value_counts().index[:n]
    in_top = frame[col].isin(top_values)
    bad_ids = set(frame.loc[~in_top, "_id"].unique())
    # Filter in frame order rather than via set difference, whose iteration
    # order is arbitrary and may change between interpreter runs.
    return [id_ for id_ in frame["_id"].unique() if id_ not in bad_ids]

In [9]:
def split_data(dataset_ids, seed=123, train_frac=0.8, valid_frac=0.2):
    """
    Randomly splits ids into train / validation / test lists.

    First `ceil(train_frac * N)` shuffled ids are set aside as the training
    pool and the rest become the test set; then `ceil(valid_frac * pool)` ids
    of the re-shuffled pool become the validation set.

    Parameters
    ----------
    dataset_ids : sequence
        Ids to split. It is NOT modified; a copy is shuffled instead, so
        calling the function twice with the same input gives the same split.
    seed : int
        Seed for the global `random` generator (re-seeded on each call).
    train_frac : float
        Fraction of all ids that goes to the train+validation pool.
    valid_frac : float
        Fraction of that pool that goes to the validation set.

    Returns
    -------
    tuple of list
        (train_ids, valid_ids, test_ids)
    """
    random.seed(seed)
    # Shuffle a copy: shuffling the caller's list in place made repeated
    # calls with the same seed produce different splits.
    shuffled = list(dataset_ids)
    train_size = ceil(train_frac * len(shuffled))
    random.shuffle(shuffled)
    train_ids = shuffled[:train_size]
    test_ids = shuffled[train_size:]

    valid_size = ceil(valid_frac * len(train_ids))
    random.shuffle(train_ids)
    valid_ids = train_ids[:valid_size]
    train_ids = train_ids[valid_size:]

    return train_ids, valid_ids, test_ids

In [8]:
def save_ids(train_ids, valid_ids, test_ids, path="/data/rali7/Tmp/solimanz/LBJ/dataset"):
    """
    Pickles the three id lists under `path`.

    Writes `<path>/train/train_ids.pkl`, `<path>/valid/valid_ids.pkl` and
    `<path>/test/test_ids.pkl`, creating the split directories if needed.

    Parameters
    ----------
    train_ids, valid_ids, test_ids : list
        Ids of each split, as returned by `split_data`.
    path : str
        Root directory of the dataset.
    """
    splits = (
        ('train', 'train_ids.pkl', train_ids),
        ('valid', 'valid_ids.pkl', valid_ids),
        ('test', 'test_ids.pkl', test_ids),
    )
    for split_dir, file_name, ids in splits:
        target_dir = os.path.join(path, split_dir)
        # Without this, a fresh dataset root fails with FileNotFoundError.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, file_name), "wb") as f:
            pickle.dump(file=f, obj=ids)

In [9]:
# Keep only ids whose every row has one of the 550 most frequent `reduced`
# values, then split them into train / validation / test.
dataset_ids = get_ids(n=550, col='reduced')
train_ids, valid_ids, test_ids = split_data(dataset_ids)

In [10]:
# Sanity check: the three split sizes should add up to the full dataset size.
print(f"Size of entire dataset: {len(dataset_ids)}")
print(f"Size of train dataset: {len(train_ids)}")
print(f"Size of valid dataset: {len(valid_ids)}")
print(f"Size of test dataset: {len(test_ids)}")

Size of entire dataset: 266285
Size of train dataset: 170422
Size of valid dataset: 42606
Size of test dataset: 53257


Keep a record of the training, validation and test IDs for later experiments

In [11]:
# Writes the three id lists as pickles under the default dataset directory.
save_ids(train_ids, valid_ids, test_ids)

Load IDs

In [2]:
def load_ids(path="/data/rali7/Tmp/solimanz/LBJ/dataset"):
    """
    Reads back the pickled id lists stored under `path`.

    Expects `<path>/train/train_ids.pkl`, `<path>/valid/valid_ids.pkl` and
    `<path>/test/test_ids.pkl`.

    Returns
    -------
    tuple
        (train_ids, valid_ids, test_ids), in that order.
    """
    pickles = (
        ('train', 'train_ids.pkl'),
        ('valid', 'valid_ids.pkl'),
        ('test', 'test_ids.pkl'),
    )
    loaded = []
    for split_dir, file_name in pickles:
        with open(os.path.join(path, split_dir, file_name), "rb") as handle:
            loaded.append(pickle.load(handle))

    train_ids, valid_ids, test_ids = loaded
    return train_ids, valid_ids, test_ids

In [3]:
# Restore the saved split and rebuild the full id list from its three parts.
train_ids, valid_ids, test_ids = load_ids()
dataset_ids = train_ids + valid_ids + test_ids

In [10]:
# Restrict the frame to rows belonging to ids that are part of the dataset.
df = df.loc[df["_id"].isin(dataset_ids)]

In [11]:
# Distinct values of the `reduced` column, in order of first appearance.
job_titles = df["reduced"].unique()

In [12]:
# One entry per `_id`: that id's `reduced` values as a list, in reverse row order.
func_series = df.groupby('_id')['reduced'].apply(lambda titles: list(titles)[::-1])

Create a mapping from each job title (string) to an integer id

In [13]:
# Number the distinct titles 0..N-1 in the order they appear in `job_titles`.
title_id = dict(zip(job_titles, range(len(job_titles))))

In [25]:
# import itertools

# train_data = set(itertools.chain.from_iterable([func_series[i] for i in train_ids]))
# valid_data = set(itertools.chain.from_iterable([func_series[i] for i in valid_ids]))
# test_data = set(itertools.chain.from_iterable([func_series[i] for i in test_ids]))

In [14]:
# Encode every id of each split as [id, [title ids ...]].
train_seq, valid_seq, test_seq = (
    [[pid, [title_id[title] for title in func_series[pid]]] for pid in split_ids]
    for split_ids in (train_ids, valid_ids, test_ids)
)

In [17]:
# train_data = [{"sequence": ">".join(func_series[i][:-1]), "labels": ">".join(func_series[i][1:])} for i in train_ids]
# valid_data = [{"sequence": ">".join(func_series[i][:-1]), "labels": ">".join(func_series[i][1:])} for i in valid_ids]
# test_data = [{"sequence": ">".join(func_series[i][:-1]), "labels": ">".join(func_series[i][1:])} for i in test_ids]

In [17]:
# Longest title sequence in each split.
max_train_seq, max_valid_seq, max_test_seq = (
    max(len(entry[1]) for entry in seqs)
    for seqs in (train_seq, valid_seq, test_seq)
)

In [18]:
# Report the longest sequence per split.
print(f"""Maximum length of training sequences : {max_train_seq}
Maximum length of valid sequences : {max_valid_seq}
Maximum length of test sequences: {max_test_seq}""")

Maximum length of training sequences : 32
Maximum length of valid sequences : 31
Maximum length of test sequences: 31


#### Dump JSON

In [19]:
data_path = "/data/rali7/Tmp/solimanz/LBJ/dataset/"

# One JSON-ready payload per split. Each carries its own sequences and maximum
# length, plus a copy of the shared title -> id mapping and the label count.
train_data, valid_data, test_data = (
    {
        'sequences': split_seqs,
        'maximum_seq_len': split_max_len,
        'title_id': dict(title_id),
        'n_labels': len(title_id),
    }
    for split_seqs, split_max_len in (
        (train_seq, max_train_seq),
        (valid_seq, max_valid_seq),
        (test_seq, max_test_seq),
    )
)

# data = {
#         'train_data': train_data,
#         'valid_data': valid_data,
#         'test_data': test_data,
#         'maximum_seq_len': max(max_train_seq, max_valid_seq, max_test_seq),
#         'title_id': title_id,
#         'n_labels': len(title_id)
#     }

# with open(os.path.join(data_path, 'train', 'train.json'), 'w') as f:
#     for d in train_data:
#         json_data = json.dumps(d)
#         f.write(json_data)
#         f.write('\n')
# with open(os.path.join(data_path, 'valid', 'valid.json'), 'w') as f:
#     for d in valid_data:
#         json_data = json.dumps(d)
#         f.write(json_data)
#         f.write('\n')
# with open(os.path.join(data_path, 'test', 'test.json'), 'w') as f:
#     for d in test_data:
#         json_data = json.dumps(d)
#         f.write(json_data)
#         f.write('\n')

In [20]:
# Write each split's payload to <data_path>/<split>/<split>.json.
for split_dir, file_name, payload in (
    ('train', 'train.json', train_data),
    ('valid', 'valid.json', valid_data),
    ('test', 'test.json', test_data),
):
    with open(os.path.join(data_path, split_dir, file_name), 'w') as f:
        json.dump(payload, f)
# with open(os.path.join(data_path, 'data.json'), 'w') as f:
#     json.dump(data, f)

## Input Vectors as Word Embeddings, Multi-Label Targets (fastText)

In [None]:
# Load the fastText binary model; later cells call `get_sentence_vector` on it.
# NOTE(review): the file name suggests 300-d subword vectors -- must match `emb_dim`.
model = FastText("/data/rali7/Tmp/solimanz/LBJ/crawl-300d-2M-subword.bin")

In [None]:
# Width of every embedding matrix built below.
emb_dim = 300

In [23]:
# Rebinds `title_id` to a bidict built from the plain dict.
# NOTE(review): presumably for id -> title lookups; no inverse lookup is visible here.
title_id = bidict(title_id)
# train_seqs = data["train_data"]
# valid_seqs = data["valid_data"]
# test_seqs = data["test_data"]

In [24]:
# Row i of the matrix holds the sentence vector of the title whose id is i.
embeddings = np.zeros((len(title_id), emb_dim), dtype=np.float32)

for title, row in title_id.items():
    embeddings[row, :] = model.get_sentence_vector(title)

In [25]:
# Persist the title embedding matrix next to the dataset files.
np.save(os.path.join("/data/rali7/Tmp/solimanz/LBJ/dataset/", "embeddings_reduced.npy"), embeddings)

# Skill Embeddings

In [26]:
# Load the pickled skills mapping (its values are iterated as skill lists below).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(data_path, 'skills.pkl'), 'rb') as f:
    skills = pickle.load(f)

In [27]:
# Flatten every non-empty skill list into one list (duplicates kept).
all_skills = [
    skill
    for skill_list in skills.values() if skill_list
    for skill in skill_list
]

In [52]:
# Normalise case so that the frequency count below merges case variants.
all_skills = [skill.lower() for skill in all_skills]

In [53]:
# Frequency of each lower-cased skill over the whole flattened list.
skill_counts = Counter(all_skills)

In [47]:
def get_token_ids(id_skills):
    """
    Returns dict that associates every token to an id.

    All non-empty skill lists in `id_skills` are joined and tokenised on
    whitespace / non-word characters. Tokens are lower-cased, de-duplicated,
    stripped of English stop-words and punctuation, and numbered 0..N-1 in
    sorted order.

    Parameters
    ----------
    id_skills : dict
        Mapping whose values are lists of skill strings (or empty / None).

    Returns
    -------
    dict
        token -> contiguous integer id.
    """
    sw = set(stopwords.words('english') + ['...'])
    tokenizer = RegexpTokenizer(r'\s+|\W', gaps=True)
    skills = []
    for s in id_skills.values():
        if s:
            skills += s

    # Lower-case BEFORE de-duplicating and filtering: otherwise capitalised
    # stop-words slip through and case variants of one token each consume an
    # id, leaving gaps in the numbering.
    lowered = {t.lower() for t in tokenizer.tokenize(" ".join(skills))}
    # Sort so the ids do not depend on set iteration order.
    tokens = sorted(t for t in lowered if t not in string.punctuation and t not in sw)

    return {token: id_ for id_, token in enumerate(tokens)}

In [48]:
def get_skill_tokens(id_skills):
    """
    Replaces, in place, every non-empty skill list in `id_skills` with a
    tuple of its distinct tokens, and returns the same dict.

    Skill strings are joined with spaces and split on whitespace / non-word
    characters. Empty or None entries are left untouched.
    """
    splitter = RegexpTokenizer(r'\s+|\W', gaps=True)
    for key in id_skills:
        skill_list = id_skills[key]
        if not skill_list:
            continue
        joined = " ".join(skill_list)
        id_skills[key] = tuple(set(splitter.tokenize(joined)))

    return id_skills

In [58]:
# For every key with a non-empty skill list: lower-case the skills, rank them
# by global frequency (most frequent first) and keep at most the first 10.
id_skills = {
    k: sorted([s.lower() for s in skillset], key=lambda s: skill_counts[s], reverse=True)[:10]
    for k, skillset in skills.items() if skillset
}

In [72]:
# Flatten the truncated per-id skill lists (duplicates kept).
all_skills = [skill for skill_list in id_skills.values() for skill in skill_list]

In [74]:
# Recount over the truncated lists; the keys become the skill vocabulary below.
skill_counts = Counter(all_skills)

In [76]:
# Number the skills 0..N-1 in the Counter's key order.
skill_id = dict(zip(skill_counts, range(len(skill_counts))))

In [99]:
# Reserve the next free id for "no skills". setdefault keeps the cell
# idempotent: re-running it must not move '<null>' to an id one past the end
# of the embedding matrix.
skill_id.setdefault('<null>', len(skill_id))

In [100]:
# Row i holds the sentence vector of the skill with id i; the '<null>' row
# stays all zeros.
embeddings = np.zeros((len(skill_id), emb_dim), dtype=np.float32)
for skill, row in skill_id.items():
    if skill == '<null>':
        continue
    embeddings[row, :] = model.get_sentence_vector(skill)

In [101]:
# Expect (number of skills incl. '<null>', emb_dim).
embeddings.shape

(42405, 300)

In [103]:
# Persist the skill embedding matrix; `augment_data` records this same path.
np.save("/data/rali7/Tmp/solimanz/LBJ/dataset/skill_emb.npy", embeddings)

In [107]:
# Number of skills kept for each id.
ls = list(map(len, id_skills.values()))

In [109]:
# Should not exceed the 10-skill cap applied when building `id_skills`.
max(ls)

10

In [106]:
# Reload the three split payloads from disk.
loaded_splits = []
for split_dir, file_name in (
    ('train', 'train.json'),
    ('valid', 'valid.json'),
    ('test', 'test.json'),
):
    with open(os.path.join(data_path, split_dir, file_name), 'r') as f:
        loaded_splits.append(json.load(f))
train_data, valid_data, test_data = loaded_splits

In [111]:
def augment_data(data, max_n_skills=10,
                 emb_path="/data/rali7/Tmp/solimanz/LBJ/dataset/skill_emb.npy",
                 skills_by_id=None, skill_ids=None):
    """
    Adds a list of skill ids to every sequence of a split payload, in place.

    Each element of `data['sequences']` is expected to look like
    `[id, title_ids]`; after the call it is `[id, title_ids, skill_ids]`.
    Ids without known skills receive `[<id of '<null>'>]`. Calling the
    function again overwrites the skill slot instead of appending a second
    list, so re-running the cell is safe.

    Parameters
    ----------
    data : dict
        Split payload with a 'sequences' key.
    max_n_skills : int
        Value stored under 'max_n_skills' (the cap used when the per-id
        skill lists were built).
    emb_path : str
        Value stored under 'emb_path'.
    skills_by_id : dict, optional
        id -> list of skill strings; defaults to the notebook-level `id_skills`.
    skill_ids : dict, optional
        skill string -> integer id, must contain '<null>'; defaults to the
        notebook-level `skill_id`.

    Returns
    -------
    dict
        The same `data` object, with 'max_n_skills', 'emb_path' and
        'skill_id' keys added.
    """
    skills_by_id = id_skills if skills_by_id is None else skills_by_id
    skill_ids = skill_id if skill_ids is None else skill_ids

    for seq in data['sequences']:
        if seq[0] in skills_by_id:
            encoded = [skill_ids[s] for s in skills_by_id[seq[0]]]
        else:
            encoded = [skill_ids['<null>']]

        if len(seq) > 2:
            # Already augmented (e.g. reloaded from a previously written JSON).
            seq[2] = encoded
        else:
            seq.append(encoded)

    data['max_n_skills'] = max_n_skills
    data['emb_path'] = emb_path
    data['skill_id'] = skill_ids

    return data

In [112]:
# Attach skill ids and skill metadata to all three split payloads.
train_data, valid_data, test_data = (
    augment_data(payload) for payload in (train_data, valid_data, test_data)
)

In [114]:
# Overwrite the split JSON files with the skill-augmented payloads.
for split_dir, file_name, payload in (
    ('train', 'train.json', train_data),
    ('valid', 'valid.json', valid_data),
    ('test', 'test.json', test_data),
):
    with open(os.path.join(data_path, split_dir, file_name), 'w') as f:
        json.dump(payload, f)

In [115]:
# Confirm the augmented payload carries the three skill-related keys.
train_data.keys()

dict_keys(['sequences', 'maximum_seq_len', 'title_id', 'n_labels', 'max_n_skills', 'emb_path', 'skill_id'])

# Education Embeddings

In [51]:
# Load the pickled education mapping (iterated below as key -> list of dicts).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(data_path, 'education.pkl'), 'rb') as f:
    education = pickle.load(f)

In [70]:
edu = defaultdict(list)
for key, edu_list in education.items():
    if not edu_list:
        edu[key] = None
    else:
        for e in edu_list:
            if 'schoolName' in e:
                edu[key].append(e['schoolName'])
            if 'name'in e:
                edu[key].append(e['name'])
            if 'sector' in e:
                edu[key].append(e['sector'])