Tokenize content and save

In [1]:
import argparse
import functools
import os
import pickle
import random

import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

In [2]:
path =  "../data/"
dataset = "cb12/"
raw_path = path + dataset + "raw/" 
interim_path = path + dataset + "interim/"
processed_path = path + dataset + "processed/"

# Step 1: Load job data

In [3]:
print('Loading job from file: {}'.format(processed_path + 'jobs_14d_30_consider_user_encoded.csv'))
job_df_30 = pd.read_csv(processed_path + 'jobs_14d_30_consider_user_encoded.csv', header=0, sep='\t')
print('Job data shape: ', job_df_30.shape)
print('Unique JobCity: ', len(job_df_30.JobCity.unique()))
print('Unique JobState: ', len(job_df_30.JobState.unique()))
print('Unique JobCountry: ', len(job_df_30.JobCountry.unique()))

Loading job from file: ../data/cb12/processed/jobs_14d_30_consider_user_encoded.csv
Job data shape:  (207972, 23)
Unique JobCity:  5744
Unique JobState:  54
Unique JobCountry:  3


# Step 2: Tokenize

In [4]:
print('Tokenizing texts and Converting tokens to int numbers...')

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from string import digits, punctuation
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors


def tokenize_text(text, clean_str_fn):
    text = clean_str_fn(text)
    tokenized_text = []
    if type(text) != float:
        text = text.strip(' ')
        remove_punctuations = str.maketrans('', '', punctuation)
        text = text.translate(remove_punctuations)
        remove_digits = str.maketrans('', '', digits)
        text = text.translate(remove_digits)
        tokens = text.split(' ')
        for token in tokens:
            if token and token not in stopwords.words('english'):
                tokenized_text.append(token)

    return tokenized_text


def tokenize_texts(texts, tokenization_fn=None, clean_str_fn=lambda x: x):
    if tokenization_fn == None:
        tokenized_texts = [tokenize_text(text, clean_str_fn) for text in texts]
    else:
        tokenized_texts = [tokenization_fn(text) for text in texts]

    return tokenized_texts


def get_words_freq(tokenized_texts):
    words_freq = FreqDist([word for text in tokenized_texts for word in text])
    return words_freq 

Tokenizing texts and Converting tokens to int numbers...


### Title

In [5]:
content_type = 'Title'
print('Tokenizing texts...')
tokenized_texts_title = tokenize_texts(job_df_30[content_type].values)
print('Computing word frequencies...')
# A dictionary 
words_freq_title = get_words_freq(tokenized_texts_title)
print('Number of vocabulary in {} (raw): {}'.format(content_type, len(words_freq_title)))

Tokenizing texts...
Computing word frequencies...
Number of vocabulary in Title (raw): 19929


In [6]:
job_df_30['Title_tokenized'] = tokenized_texts_title

### Description

In [8]:
content_type = 'Description'
print('Tokenizing texts...')
tokenized_texts_description = tokenize_texts(job_df_30[content_type].values)
print('Computing word frequencies...')
# A dictionary 
words_freq_description = get_words_freq(tokenized_texts_description)
print('Number of vocabulary in {} (raw): {}'.format(content_type, len(words_freq_description)))

Tokenizing texts...
Computing word frequencies...
Number of vocabulary in Description (raw): 567788


In [9]:
job_df_30['Description_tokenized'] = tokenized_texts_description

### Requirements

In [12]:
content_type = 'Requirements'
print('Tokenizing texts...')
tokenized_texts_requirements = tokenize_texts(job_df_30[content_type].values)
print('Computing word frequencies...')
# A dictionary 
words_freq_requirements = get_words_freq(tokenized_texts_requirements)
print('Number of vocabulary in {} (raw): {}'.format(content_type, len(words_freq_requirements)))

Tokenizing texts...
Computing word frequencies...
Number of vocabulary in Requirements (raw): 275779


In [13]:
job_df_30['Requirements_tokenized'] = tokenized_texts_requirements

### All

In [14]:
content_type = 'All'
print('Tokenizing texts...')
tokenized_texts_all = tokenize_texts(job_df_30[content_type].values)
print('Computing word frequencies...')
# A dictionary 
words_freq_all = get_words_freq(tokenized_texts_all)
print('Number of vocabulary in {} (raw): {}'.format(content_type, len(tokenized_texts_all)))

Tokenizing texts...
Computing word frequencies...
Number of vocabulary in All (raw): 207972


In [15]:
job_df_30['All_tokenized'] = tokenized_texts_all

# Step 3: Save

In [16]:
job_df_30.to_csv(processed_path + 'jobs_14d_30_consider_user_encoded_tokenized.csv', sep='\t', index=False)