In [1]:
import argparse
import ast
import os
import random

import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Root data folder and the sub-folder of the dataset being processed.
path = "../data/"
dataset = "cb12/"

# One folder per processing stage, each built as <path><dataset><stage>/.
raw_path, interim_path, processed_path = (
    "{}{}{}/".format(path, dataset, stage)
    for stage in ("raw", "interim", "processed")
)

# Step 1: Load pre-trained model

In [3]:
import gensim.downloader as api

def load_word_embeddings(path, binary=True):
    """Return pre-trained word2vec vectors.

    Args:
        path: Location of a word2vec-format file to load. When empty (``''``
            or ``None``) the "word2vec-google-news-300" vectors are fetched
            through gensim's downloader instead.
        binary: Whether the file at ``path`` is in the binary word2vec format.
            Only consulted when ``path`` is non-empty.

    Returns:
        The loaded word-vector model.
    """
    if path:
        # Local import: only needed when loading from disk.
        from gensim.models import KeyedVectors
        return KeyedVectors.load_word2vec_format(path, binary=binary)
    return api.load("word2vec-google-news-300")

# Load the pre-trained vectors; the path argument is empty, so no local file is read
# and the "word2vec-google-news-300" model comes from gensim's downloader.
w2v_model = load_word_embeddings('', binary=False)

# Step 2: Load job data

In [4]:
# The tokenized job table is a tab-separated file with a header row.
jobs_file_stem = processed_path + 'jobs_14d_30_consider_user_encoded_tokenized'
print('Loading job from file: {}'.format(jobs_file_stem))
job_df_30 = pd.read_csv(jobs_file_stem + '.csv', header=0, sep='\t')
print('Job data shape: ', job_df_30.shape)

# Report how many distinct values each location column holds.
for location_col in ('JobCity', 'JobState', 'JobCountry'):
    print('Unique {}: '.format(location_col), len(job_df_30[location_col].unique()))

Loading job from file: ../data/cb12/processed/jobs_14d_30_consider_user_encoded_tokenized
Job data shape:  (207972, 27)
Unique JobCity:  5744
Unique JobState:  54
Unique JobCountry:  3


# Step 3: Get word embeddings

In [5]:
from nltk import FreqDist

In [6]:
def get_words_freq(tokenized_texts):
    """Count how often each token occurs across all tokenized texts.

    Args:
        tokenized_texts: Iterable of token sequences (one sequence per text).

    Returns:
        A ``FreqDist`` mapping each token to its total number of occurrences.
    """
    # Flatten lazily: every token of every text, in reading order.
    flat_tokens = (token for tokens in tokenized_texts for token in tokens)
    return FreqDist(flat_tokens)

In [15]:
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'

def google_process_word_embedding_for_corpus_vocab(w2v_model, words_freq, keep_most_frequent_words=100000):
    """Build a corpus vocabulary and the matching matrix of pre-trained embeddings.

    Tokens are first filtered by frequency, then only those present in the
    pre-trained model are kept. Ids 0 and 1 are reserved for ``PAD_TOKEN`` and
    ``UNK_TOKEN``; kept tokens receive consecutive ids starting at 2.

    Args:
        w2v_model: Pre-trained model exposing ``vector_size``, ``index_to_key``
            and ``model[word]`` lookup.
        words_freq: Mapping token -> count supporting ``len()`` and ``.items()``.
        keep_most_frequent_words: Size threshold, not a cap. When the raw
            vocabulary is smaller than this, tokens with ``freq > 1`` are kept;
            otherwise only tokens with ``freq > 5`` are kept.

    Returns:
        Tuple ``(new_vocab, new_embeddings_matrix)`` where ``new_vocab`` maps
        token -> int id and row ``i`` of the matrix is the vector of the token
        with id ``i``.
    """
    print('Tokens vocab. from texts: {}'.format(len(words_freq)))

    large_vocab = len(words_freq) >= keep_most_frequent_words
    min_freq = 5 if large_vocab else 1
    most_freq_words = [word for word, freq in words_freq.items() if freq > min_freq]
    # Report the threshold that was actually applied.
    print('Tokens vocab. from texts with freq > {}: {}'.format(min_freq, len(most_freq_words)))
    if large_vocab:
        print('Most common tokens vocab. from texts: {}'.format(len(most_freq_words)))

    RESERVED_TOKENS_IN_VOCAB = 2
    embedding_size = w2v_model.vector_size
    new_embeddings_list = []
    new_vocab = {}

    last_token_id = RESERVED_TOKENS_IN_VOCAB
    # Build the lookup set once; membership tests are then O(1) per word.
    w2v_vocab = set(w2v_model.index_to_key)
    for word in most_freq_words:
        if word in w2v_vocab:
            new_vocab[word] = last_token_id
            last_token_id += 1
            new_embeddings_list.append(w2v_model[word])

    # Inserting the 2 reserved tokens
    new_vocab[PAD_TOKEN] = 0
    new_vocab[UNK_TOKEN] = 1

    # Fixed seed so the reserved-token vectors are reproducible across runs.
    # The first draw goes to row 0 (PAD) and the second to row 1 (UNK), so the
    # variable names line up with the ids assigned above.
    np.random.seed(10)
    pad_vector = np.random.uniform(low=-0.04, high=0.04, size=embedding_size)
    unk_vector = np.random.uniform(low=-0.04, high=0.04, size=embedding_size)

    new_embeddings_matrix = np.vstack([pad_vector, unk_vector] + new_embeddings_list)

    print('Most common tokens with word embeddings: {}'.format(new_embeddings_matrix.shape[0]))
    return new_vocab, new_embeddings_matrix

In [17]:
import pickle
import tensorflow as tf

def serialize(filename, obj):
    """Pickle ``obj`` and write it to ``filename`` via TensorFlow's GFile API.

    Args:
        filename: Destination path, opened in binary write mode.
        obj: Any picklable object.
    """
    with tf.io.gfile.GFile(filename, mode='wb') as out_file:
        pickle.dump(obj, out_file)
        
def save_word_vocab_embeddings(output_path, word_vocab, word_embeddings_matrix):
    """Persist the vocabulary and its embedding matrix as one pickled tuple.

    Args:
        output_path: Destination file for the pickle.
        word_vocab: Vocabulary object to store (first tuple element).
        word_embeddings_matrix: Embedding matrix to store (second tuple element).
    """
    serialize(output_path, (word_vocab, word_embeddings_matrix))

### Title

In [13]:
# Each 'Title_tokenized' cell is parsed from its string form back into a Python object.
# ast.literal_eval only accepts literals, so a crafted value in the CSV cannot run
# arbitrary code the way eval() would.
tokenized_texts_title = [ast.literal_eval(t) for t in job_df_30['Title_tokenized'].values.tolist()]
print('Computing word frequencies...')
# Token -> occurrence count over all titles (result of get_words_freq)
words_freq_title = get_words_freq(tokenized_texts_title)
print('Number of vocabulary in {} (raw): {}'.format('Title', len(words_freq_title)))

Computing word frequencies...
Number of vocabulary in Title (raw): 19929


In [None]:
# Build the Title vocabulary and embedding matrix, then store both in one pickle.
title_pickle_path = '../language_models/pickles/google_word_vocab_embeddings_14d_30_Title_consider_user.pickle'
word_vocab_title, word_embeddings_matrix_title = google_process_word_embedding_for_corpus_vocab(
    w2v_model, words_freq_title, keep_most_frequent_words=100000)
print('Saving word embeddings and vocab.: {}'.format(title_pickle_path))
save_word_vocab_embeddings(title_pickle_path, word_vocab_title, word_embeddings_matrix_title)

In [18]:
# Writes the Title vocab/embeddings pickle; same path and same arguments as the
# save call at the end of the preceding cell.
save_word_vocab_embeddings('../language_models/pickles/google_word_vocab_embeddings_14d_30_Title_consider_user.pickle', word_vocab_title, word_embeddings_matrix_title)

### All

In [19]:
# Each 'All_tokenized' cell is parsed from its string form back into a Python object.
# ast.literal_eval only accepts literals, so a crafted value in the CSV cannot run
# arbitrary code the way eval() would.
tokenized_texts_all = [ast.literal_eval(t) for t in job_df_30['All_tokenized'].values.tolist()]
print('Computing word frequencies...')
# Token -> occurrence count over all texts (result of get_words_freq)
words_freq_all = get_words_freq(tokenized_texts_all)
print('Number of vocabulary in {} (raw): {}'.format('All', len(words_freq_all)))

Computing word frequencies...
Number of vocabulary in All (raw): 820935


In [None]:
# Build the All-text vocabulary and embedding matrix, then store both in one pickle.
all_pickle_path = '../language_models/pickles/google_word_vocab_embeddings_14d_30_All_consider_user.pickle'
word_vocab_all, word_embeddings_matrix_all = google_process_word_embedding_for_corpus_vocab(
    w2v_model, words_freq_all, keep_most_frequent_words=100000)
print('Saving word embeddings and vocab.: {}'.format(all_pickle_path))
save_word_vocab_embeddings(all_pickle_path, word_vocab_all, word_embeddings_matrix_all)

Tokens vocab. from texts: 820935
Tokens vocab. from texts with freq > 1: 97696
Most common tokens vocab. from texts: 97696


# Step 4: Convert tokens into int numbers

In [None]:
print('Converting tokens to int numbers (according to the vocab.)...')
# NOTE(review): convert_tokens_to_int is not defined or imported in any cell visible
# in this notebook; it must be defined/imported before this cell for a fresh-kernel
# run to succeed. It is assumed to take (tokenized_texts, vocab) and return
# (texts_as_ids, text_lengths) -- TODO confirm against its definition.
texts_int_title, texts_lengths_title = convert_tokens_to_int(tokenized_texts_title, word_vocab_title)
job_df_30['Title_length'] = texts_lengths_title
job_df_30['Title_int'] = texts_int_title

# Step 5: Export to tf and df

In [None]:
# NOTE(review): `news_df` is not defined in any cell visible in this notebook, and the
# columns selected below (url, category0_encoded, author_encoded, ...) are not among
# the job columns referenced earlier (e.g. JobCity, Title_tokenized). Presumably this
# cell was carried over from a news-dataset notebook -- confirm the intended source
# frame and column list before running.
data_to_export_df = news_df[['id', 'url', #For debug
                                'id_encoded', 
                                'category0_encoded',
                                'category1_encoded',
                                'keywords_encoded',
                                'author_encoded',
                                'concepts_encoded',
                                'entities_encoded',
                                'locations_encoded',
                                'persons_encoded',
                                'created_at_ts',
                                'text_length', 
                                'text_int']]