Encode each attribute

In [1]:
import random
import pickle
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Special token strings (the names suggest padding / unknown-value markers).
# NOTE(review): UNK_TOKEN is not referenced by the cells visible here.
PAD_TOKEN, UNK_TOKEN = '<PAD>', '<UNK>'

In [3]:
# Root data folder and dataset sub-folder; both end with "/" so the pieces
# can be concatenated without inserting a separator.
path = "../data/"
dataset = "cb12/"

# Stage-specific directories located under <path><dataset>.
raw_path = "".join((path, dataset, "raw/"))
interim_path = "".join((path, dataset, "interim/"))
processed_path = "".join((path, dataset, "processed/"))

# Step 1: User data

In [7]:
# Load the tab-separated user table, then report its shape and the number of
# distinct values per categorical attribute. dropna=False keeps a missing
# value counted as one distinct entry, exactly as len(Series.unique()) does.
user_df_30 = pd.read_csv(
    processed_path + 'users_14d_30_consider_user.csv',
    sep='\t',
)
print(user_df_30.shape)
print('Unique UserCity: ', user_df_30['UserCity'].nunique(dropna=False))
print('Unique UserState: ', user_df_30['UserState'].nunique(dropna=False))
print('Unique UserCountry: ', user_df_30['UserCountry'].nunique(dropna=False))
print('Unique UserDegree: ', user_df_30['UserDegree'].nunique(dropna=False))
print('Unique UserMajor: ', user_df_30['UserMajor'].nunique(dropna=False))

(111785, 15)
Unique UserCity:  6701
Unique UserState:  121
Unique UserCountry:  33
Unique UserDegree:  7
Unique UserMajor:  21224


# Step 2: Job data

In [6]:
# Build the path once so the log line names exactly the file that is read
# (the file is 'jobs_...', with an "s").
job_csv_path = processed_path + 'jobs_14d_30_consider_user.csv'
print('Loading job from file: {}'.format(job_csv_path))

# The first row of the tab-separated file holds the column names.
job_df_30 = pd.read_csv(job_csv_path, header=0, sep='\t')
print('Job data shape: ', job_df_30.shape)

# Number of distinct values per location attribute (a missing value, if any,
# counts as one distinct entry because unique() keeps it).
print('Unique JobCity: ', len(job_df_30.JobCity.unique()))
print('Unique JobState: ', len(job_df_30.JobState.unique()))
print('Unique JobCountry: ', len(job_df_30.JobCountry.unique()))

Loading job from file: ../data/cb12/processed/job_14d_30_consider_user.csv
Job data shape:  (207972, 19)
Unique JobCity:  5744
Unique JobState:  54
Unique JobCountry:  3


In [8]:
# Distinct city values on the user side and on the job side.
list_UserCity_30 = user_df_30.UserCity.unique().tolist()
print('UserCity: ', len(list_UserCity_30))
list_JobCity_30 = job_df_30.JobCity.unique().tolist()
print('JobCity: ', len(list_JobCity_30))

# Merge both vocabularies. dict.fromkeys de-duplicates while keeping first-seen
# order (user values first, then job-only values). A set union would order the
# strings by their per-process randomized hash, so any ids later derived from
# this list would differ from one kernel session to the next.
list_City_30 = list(dict.fromkeys(list_UserCity_30 + list_JobCity_30))
print('TotalCity: ', len(list_City_30))

UserCity:  6701
JobCity:  5744
TotalCity:  8226


In [9]:
# Distinct state values on the user side and on the job side.
list_UserState_30 = user_df_30.UserState.unique().tolist()
print('UserState: ', len(list_UserState_30))
list_JobState_30 = job_df_30.JobState.unique().tolist()
print('JobState: ', len(list_JobState_30))

# Merge both vocabularies. dict.fromkeys de-duplicates while keeping first-seen
# order (user values first, then job-only values). A set union would order the
# strings by their per-process randomized hash, so any ids later derived from
# this list would differ from one kernel session to the next.
list_State_30 = list(dict.fromkeys(list_UserState_30 + list_JobState_30))
print('TotalState: ', len(list_State_30))

UserState:  121
JobState:  54
TotalState:  122


In [10]:
# Distinct country values on the user side and on the job side.
list_UserCountry_30 = user_df_30.UserCountry.unique().tolist()
print('UserCountry: ', len(list_UserCountry_30))
list_JobCountry_30 = job_df_30.JobCountry.unique().tolist()
print('JobCountry: ', len(list_JobCountry_30))

# Merge both vocabularies. dict.fromkeys de-duplicates while keeping first-seen
# order (user values first, then job-only values). A set union would order the
# strings by their per-process randomized hash, so any ids later derived from
# this list would differ from one kernel session to the next.
list_Country_30 = list(dict.fromkeys(list_UserCountry_30 + list_JobCountry_30))
print('TotalCountry: ', len(list_Country_30))

UserCountry:  33
JobCountry:  3
TotalCountry:  33


In [11]:
def encode_categ_feature(value, encoder_dict):
    """Return the integer id that ``encoder_dict`` assigns to ``value``.

    A value that is not a key of the mapping is encoded with the id stored
    under the unfrequent token (the string returned by
    ``get_unfrequent_token()``).

    Args:
        value: Raw categorical value; must be hashable.
        encoder_dict: Mapping from raw value to integer id.

    Returns:
        The id stored for ``value``, or the unfrequent-token id when
        ``value`` is not in the mapping.

    Raises:
        KeyError: ``value`` is not in the mapping and the mapping has no
            unfrequent-token entry either. The message names the offending
            value instead of only the missing token.
    """
    if value in encoder_dict:
        return encoder_dict[value]

    fallback_token = get_unfrequent_token()
    if fallback_token not in encoder_dict:
        # Encoders built without include_unfrequent_token=True have no
        # fallback slot; report which value could not be encoded.
        raise KeyError(
            'Value {!r} is not in the encoder and the encoder has no {!r} entry; '
            'build the encoder with include_unfrequent_token=True to encode '
            'unseen values.'.format(value, fallback_token)
        )
    return encoder_dict[fallback_token]


def transform_categorical_column(series, encoder):
    """Encode every element of ``series`` with ``encoder``.

    Args:
        series: pandas Series holding raw categorical values.
        encoder: Mapping from raw value to integer id, passed unchanged to
            ``encode_categ_feature`` for each element.

    Returns:
        The Series produced by ``series.apply`` with one encoded id per element.
    """
    def _encode_one(raw_value):
        return encode_categ_feature(raw_value, encoder)

    return series.apply(_encode_one)


def get_pad_token():
    """Return the padding token string, ``'<PAD>'``."""
    return '<PAD>'

def get_unfrequent_token():
    """Return the unfrequent-value token string, ``'<UNF>'``."""
    return '<UNF>'

def get_categ_encoder_from_values(values, include_pad_token=True, include_unfrequent_token=False):
    """Build a ``value -> integer id`` mapping with contiguous ids starting at 0.

    Special tokens come first (pad token, then unfrequent token, each only
    when requested), followed by the distinct entries of ``values`` in
    first-seen order.

    Args:
        values: Iterable of hashable raw values; repeated values are allowed.
        include_pad_token: When true, reserve the first id for the string
            returned by ``get_pad_token()``.
        include_unfrequent_token: When true, reserve the next id for the
            string returned by ``get_unfrequent_token()``.

    Returns:
        dict mapping each distinct key to an id in ``range(len(result))``.
    """
    special_tokens = []
    if include_pad_token:
        special_tokens.append(get_pad_token())
    if include_unfrequent_token:
        special_tokens.append(get_unfrequent_token())

    # De-duplicate before numbering. Numbering by raw position would let a
    # repeated value keep the id of its last occurrence, leaving gaps so that
    # the largest id could reach or exceed len(encoder).
    distinct_keys = dict.fromkeys(special_tokens + list(values))
    return {key: idx for idx, key in enumerate(distinct_keys)}

In [12]:
# Location encoders shared by the job and user tables: each one is built from
# the merged (user + job) value list of its attribute.
City_encoder_30, State_encoder_30, Country_encoder_30 = (
    get_categ_encoder_from_values(merged_values)
    for merged_values in (list_City_30, list_State_30, list_Country_30)
)

# Step 3: Process job features

In [13]:
# Progress message printed before the job-encoding code in this cell runs.
print('Encoding Job attributes ...')


def process_job_features(data_df, City_encoder, State_encoder, Country_encoder):
    """Add integer-encoded columns for the job attributes to ``data_df``.

    The JobID encoder is built here from the distinct ids in ``data_df``;
    the city, state and country encoders are supplied by the caller. Four
    columns are written into ``data_df`` in place: ``JobID_encoded``,
    ``JobCity_encoded``, ``JobState_encoded`` and ``JobCountry_encoded``.

    Args:
        data_df: DataFrame with ``JobID``, ``JobCity``, ``JobState`` and
            ``JobCountry`` columns. It is modified in place.
        City_encoder: Mapping from city value to integer id.
        State_encoder: Mapping from state value to integer id.
        Country_encoder: Mapping from country value to integer id.

    Returns:
        Tuple ``(job_features_encoders, data_df)``: a dict of the four
        encoders keyed by source column name, and the same (mutated) frame
        that was passed in.
    """
    # Build the vocabulary from distinct ids so a repeated JobID cannot
    # produce gaps in the id space. The printed size is len(encoder), which
    # also counts the pad entry added by default.
    JobID_encoder = get_categ_encoder_from_values(data_df['JobID'].unique())
    print('Unique Job {}'.format(len(JobID_encoder)))
    data_df['JobID_encoded'] = transform_categorical_column(data_df['JobID'], JobID_encoder)

    print('Unique City {}'.format(len(City_encoder)))
    data_df['JobCity_encoded'] = transform_categorical_column(data_df['JobCity'], City_encoder)

    print('Unique State {}'.format(len(State_encoder)))
    data_df['JobState_encoded'] = transform_categorical_column(data_df['JobState'], State_encoder)

    print('Unique Country {}'.format(len(Country_encoder)))
    data_df['JobCountry_encoded'] = transform_categorical_column(data_df['JobCountry'], Country_encoder)

    # Encoders are keyed by the raw column they were applied to.
    job_features_encoders = {
        'JobID': JobID_encoder,
        'JobCity': City_encoder,
        'JobState': State_encoder,
        'JobCountry': Country_encoder
    }
    return job_features_encoders, data_df
  
# Encode the job table with the shared city/state/country encoders.
job_features_encoders_30, job_df_encoded_30 = process_job_features(
    data_df=job_df_30,
    City_encoder=City_encoder_30,
    State_encoder=State_encoder_30,
    Country_encoder=Country_encoder_30
)

Encoding Job attributes ...
Unique Job 207973
Unique City 8227
Unique State 123
Unique Country 34


In [14]:
def serialize(filename, obj):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode through ``tf.io.gfile.GFile``
    and closed when the ``with`` block exits.

    Args:
        filename: Destination path handed to ``tf.io.gfile.GFile``.
        obj: Object to pickle.
    """
    with tf.io.gfile.GFile(filename, 'wb') as out_file:
        pickle.dump(obj, out_file)
        
def save_job_feature_encoders(output_path, job_features_encoders):
    """Write the job feature encoders to ``output_path`` via ``serialize``.

    Args:
        output_path: Destination file path.
        job_features_encoders: Object to store; passed to ``serialize`` as is.
    """
    serialize(output_path, job_features_encoders)

# Store the job encoders as a pickle file in the processed-data folder.
print('Saving job feature encoder...')
save_job_feature_encoders(
    output_path=processed_path + 'job_feature_encoders_14d_30_consider_user.pickle',
    job_features_encoders=job_features_encoders_30
)

Saving job feature encoder...


In [15]:
# Write the encoded job table as a tab-separated file, without the index column.
job_df_encoded_30.to_csv(
    processed_path + 'jobs_14d_30_consider_user_encoded.csv',
    index=False,
    sep='\t'
)

# Step 4: Process user features

In [16]:
def process_user_features(data_df, City_encoder, State_encoder, Country_encoder):
    """Add integer-encoded columns for the user attributes to ``data_df``.

    The UserID, UserDegree and UserMajor encoders are built here from the
    distinct values in ``data_df``; the city, state and country encoders are
    supplied by the caller. Six ``*_encoded`` columns are written into
    ``data_df`` in place.

    Args:
        data_df: DataFrame with ``UserID``, ``UserCity``, ``UserState``,
            ``UserCountry``, ``UserDegree`` and ``UserMajor`` columns. It is
            modified in place.
        City_encoder: Mapping from city value to integer id.
        State_encoder: Mapping from state value to integer id.
        Country_encoder: Mapping from country value to integer id.

    Returns:
        Tuple ``(user_features_encoders, data_df)``: a dict of the six
        encoders keyed by source column name, and the same (mutated) frame
        that was passed in.
    """
    # Build the vocabulary from distinct ids, as done for UserDegree and
    # UserMajor below, so a repeated UserID cannot produce gaps in the id
    # space. The printed size is len(encoder), which also counts the pad
    # entry added by default.
    UserID_encoder = get_categ_encoder_from_values(data_df['UserID'].unique())
    print('Unique User {}'.format(len(UserID_encoder)))
    data_df['UserID_encoded'] = transform_categorical_column(data_df['UserID'], UserID_encoder)

    print('Unique City {}'.format(len(City_encoder)))
    data_df['UserCity_encoded'] = transform_categorical_column(data_df['UserCity'], City_encoder)

    print('Unique State {}'.format(len(State_encoder)))
    data_df['UserState_encoded'] = transform_categorical_column(data_df['UserState'], State_encoder)

    print('Unique Country {}'.format(len(Country_encoder)))
    data_df['UserCountry_encoded'] = transform_categorical_column(data_df['UserCountry'], Country_encoder)

    UserDegree_encoder = get_categ_encoder_from_values(data_df['UserDegree'].unique())
    print('Unique UserDegree {}'.format(len(UserDegree_encoder)))
    data_df['UserDegree_encoded'] = transform_categorical_column(data_df['UserDegree'], UserDegree_encoder)

    UserMajor_encoder = get_categ_encoder_from_values(data_df['UserMajor'].unique())
    print('Unique UserMajor {}'.format(len(UserMajor_encoder )))
    data_df['UserMajor_encoded'] = transform_categorical_column(data_df['UserMajor'], UserMajor_encoder)

    # Encoders are keyed by the raw column they were applied to.
    user_features_encoders = {
        'UserID': UserID_encoder,
        'UserCity': City_encoder,
        'UserState': State_encoder,
        'UserCountry': Country_encoder,
        'UserDegree': UserDegree_encoder,
        'UserMajor': UserMajor_encoder
    }
    return user_features_encoders, data_df
  
# Encode the user table with the shared city/state/country encoders.
user_features_encoders_30, user_df_encoded_30 = process_user_features(
    data_df=user_df_30,
    City_encoder=City_encoder_30,
    State_encoder=State_encoder_30,
    Country_encoder=Country_encoder_30
)

Unique User 111786
Unique City 8227
Unique State 123
Unique Country 34
Unique UserDegree 8
Unique UserMajor 21225


In [17]:
def save_user_feature_encoders(output_path, user_features_encoders):
    """Write the user feature encoders to ``output_path`` via ``serialize``.

    Args:
        output_path: Destination file path.
        user_features_encoders: Object to store; passed to ``serialize`` as is.
    """
    serialize(output_path, user_features_encoders)

# Store the user encoders as a pickle file in the processed-data folder.
print('Saving user feature encoder...')
save_user_feature_encoders(
    output_path=processed_path + 'user_feature_encoders_14d_30_consider_user.pickle',
    user_features_encoders=user_features_encoders_30
)

Saving user feature encoder...


In [18]:
# Write the encoded user table as a tab-separated file, without the index column.
user_df_encoded_30.to_csv(
    processed_path + 'users_14d_30_consider_user_encoded.csv',
    index=False,
    sep='\t'
)