In [39]:
import argparse
import ast
import os
import pickle
import random

import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Directory layout: <path><dataset>{raw,interim,processed}/. Every value keeps
# its trailing slash, so file names can be appended by plain concatenation.
path = "../data/"
dataset = "cb12/"
raw_path, interim_path, processed_path = (
    "{}{}{}/".format(path, dataset, stage)
    for stage in ("raw", "interim", "processed")
)

# Step 1: Load job data

In [42]:
# Build the path once so the log line names the exact file that is read
# (the message used to drop the '.csv' extension).
jobs_csv_path = processed_path + 'jobs_14d_30_consider_user_encoded_tokenized.csv'
print('Loading job from file: {}'.format(jobs_csv_path))
# The file is tab-separated despite its .csv extension; row 0 is the header.
job_df_30 = pd.read_csv(jobs_csv_path, header=0, sep='\t')
print('Job data shape: ', job_df_30.shape)
# Cardinality of the location columns (a NaN, if present, counts as one value).
for location_column in ('JobCity', 'JobState', 'JobCountry'):
    print('Unique {}: '.format(location_column), len(job_df_30[location_column].unique()))

Loading job from file: ../data/cb12/processed/jobs_14d_30_consider_user_encoded_tokenized
Job data shape:  (207972, 27)
Unique JobCity:  5744
Unique JobState:  54
Unique JobCountry:  3


# Step 2: Get pre-tokenized text

In [4]:
from nltk import FreqDist

In [5]:
def get_words_freq(tokenized_texts):
    """Count how often each token occurs across all documents.

    Args:
        tokenized_texts: iterable of documents, each an iterable of tokens.

    Returns:
        A ``FreqDist`` mapping every token to its total number of occurrences.
    """
    all_tokens = (token for document in tokenized_texts for token in document)
    return FreqDist(all_tokens)

### Title

In [23]:
# Each 'Title_tokenized' cell is text that is parsed back into a Python object.
# ast.literal_eval accepts literals only, so unlike eval() it cannot execute
# code that happens to be embedded in the CSV.
tokenized_texts = [ast.literal_eval(t) for t in job_df_30['Title_tokenized'].values.tolist()]
print('Computing word frequencies...')
# Token -> occurrence count over all titles.
words_freq = get_words_freq(tokenized_texts)
print('Number of vocabulary in {} (raw): {}'.format('Title', len(words_freq)))

Computing word frequencies...
Number of vocabulary in Title (raw): 19929


### All

In [24]:
# Each 'All_tokenized' cell is text that is parsed back into a Python object.
# ast.literal_eval accepts literals only, so unlike eval() it cannot execute
# code that happens to be embedded in the CSV.
tokenized_texts_all = [ast.literal_eval(t) for t in job_df_30['All_tokenized'].values.tolist()]
print('Computing word frequencies...')
# Dictionary-like mapping: token -> occurrence count over all documents.
words_freq_all = get_words_freq(tokenized_texts_all)
print('Number of vocabulary in {} (raw): {}'.format('All', len(words_freq_all)))

Computing word frequencies...
Number of vocabulary in All (raw): 820935


# Step 3: Train

In [25]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [26]:
print('Processing documents...')
# Wrap every token list in a TaggedDocument whose single tag is the document's
# 0-based position in tokenized_texts_all.
tagged_data = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(tokenized_texts_all)
]

Processing documents...


In [27]:
print('Training doc2vec')
max_epochs = 30
vec_size = 300
alpha = 0.025
# Hyper-parameters are collected in one place for readability.
# min_alpha is set equal to alpha, so a single train() call runs at a constant
# learning rate.
# NOTE(review): no seed is set and workers > 1, so runs are presumably not
# reproducible bit-for-bit -- confirm whether that matters downstream.
doc2vec_params = dict(
    dm=1,                   # see gensim docs: 1 selects the PV-DM algorithm
    dm_mean=1,              # see gensim docs: average (not sum) context vectors
    vector_size=vec_size,
    window=5,
    negative=5,
    min_count=2,
    max_vocab_size=100000,
    alpha=alpha,
    min_alpha=alpha,
    workers=6,
)
model = Doc2Vec(**doc2vec_params)

Training doc2vec


In [28]:
# Build the model's vocabulary from the tagged corpus before any training pass.
model.build_vocab(tagged_data)

In [29]:
# The learning-rate schedule is driven by hand: each train() call makes one
# pass at a constant rate, and the rate is lowered by a fixed step afterwards.
alpha_step = 0.0002
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data, epochs=1, total_examples=model.corpus_count)
    # Lower the rate and pin min_alpha to it so the next pass does not decay.
    model.alpha = model.min_alpha = model.alpha - alpha_step

# The title tokens are not used by anything below; release them.
del tokenized_texts

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29


In [31]:
print('Concatenating job content embeddings, making sure that they are sorted by the encoded JobID')
# Documents are tagged with their 0-based row position, not with JobID_encoded.
# Looking vectors up by "JobID_encoded - 1" is therefore only correct when the
# frame is already sorted by ID. Validate the ID range and order rows by ID
# explicitly, so row k of the matrix always belongs to JobID_encoded == k + 1.
job_ids_encoded = job_df_30['JobID_encoded'].values
num_jobs = len(job_ids_encoded)
assert np.array_equal(np.sort(job_ids_encoded), np.arange(1, num_jobs + 1)), \
    'JobID_encoded must contain exactly the IDs 1..N, one per row'
# row_of_job_id[k] is the row (== document tag) holding JobID_encoded k + 1;
# for an already sorted frame this is simply 0..N-1.
row_of_job_id = np.argsort(job_ids_encoded)
job_content_embeddings = np.vstack([model.docvecs[row] for row in row_of_job_id])
# Index 0 is reserved for padding; it gets the mean of all job embeddings.
embedding_for_padding_job = np.mean(job_content_embeddings, axis=0)
content_job_embeddings_with_padding = np.vstack([embedding_for_padding_job, job_content_embeddings])
del job_content_embeddings

Concatenating job content embeddings, making sure that they are sorted by the encoded JobID


  


In [33]:
def serialize(filename, obj):
    """Pickle ``obj`` and write it to ``filename`` through TensorFlow's GFile.

    Args:
        filename: destination path, opened in binary write mode.
        obj: any picklable Python object.
    """
    with tf.io.gfile.GFile(filename, 'wb') as sink:
        pickle.dump(obj, sink)

def export_job_content_embeddings(content_job_embeddings, output_job_content_embeddings_path):
    """Announce the destination, then pickle the job embeddings to it.

    Args:
        content_job_embeddings: object to persist; handed to ``serialize`` unchanged.
        output_job_content_embeddings_path: destination file path.
    """
    print('Exporting job embeddings to {}'.format(output_job_content_embeddings_path))
    serialize(output_job_content_embeddings_path, content_job_embeddings)

In [43]:
# Sanity check: the matrix holds the padding row at index 0 followed by the job
# rows, so its row count must equal the last JobID_encoded plus one.
last_job_id_encoded = job_df_30['JobID_encoded'].iloc[-1]
assert content_job_embeddings_with_padding.shape[0] == last_job_id_encoded + 1
print('Exporting job content embeddings')
# The frame is not needed for the export itself; drop it before pickling.
del job_df_30
export_job_content_embeddings(
    content_job_embeddings_with_padding,
    '../language_models/pickles/jobs_14d_30_consider_user_All_d2v.pickle',
)

Exporting job content embeddings
Exporting job embeddings to ../language_models/pickles/jobs_14d_30_consider_user_All_d2v.pickle
