Transform applications into sessions

In [1]:
import os
import json
import random
import pickle
import argparse
import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Root of the data tree and the dataset sub-folder. Both keep a trailing
# slash so the folders below can be built by plain string concatenation.
path = "../data/"
dataset = "cb12/"

# Per-stage folders of the selected dataset.
dataset_dir = path + dataset
raw_path = dataset_dir + "raw/"
interim_path = dataset_dir + "interim/"
processed_path = dataset_dir + "processed/"

# Step 1: Load feature encoders

In [3]:
def serialize(filename, obj):
    """Pickle ``obj`` into ``filename``.

    The file is opened in binary write mode through ``tf.io.gfile.GFile``.
    """
    with tf.io.gfile.GFile(filename, 'wb') as out_file:
        pickle.dump(obj, out_file)

def deserialize(filename):
    """Return the object unpickled from ``filename``.

    The file is opened in binary read mode through ``tf.io.gfile.GFile``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files produced by this project's own pipeline.
    """
    with tf.io.gfile.GFile(filename, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored
    
def load_feature_encoders(path):
    """Load the pickled feature encoders stored at ``path``.

    Thin wrapper around ``deserialize``; whatever object was pickled in the
    file is returned unchanged.
    """
    return deserialize(path)

In [4]:
# File names of the pickled encoders inside the processed folder.
job_encoders_file = 'job_feature_encoders_14d_30_consider_user.pickle'
user_encoders_file = 'user_feature_encoders_14d_30_consider_user.pickle'

job_features_encoders_30 = load_feature_encoders(processed_path + job_encoders_file)
user_features_encoders_30 = load_feature_encoders(processed_path + user_encoders_file)

# Step 2: Load user metadata

In [5]:
# Build the path once so the log line and the actual read cannot drift apart.
users_csv = processed_path + 'users_14d_30_consider_user_encoded.csv'
print('Loading user from file: {}'.format(users_csv))

# Tab-separated file whose first row holds the column names.
user_df_30 = pd.read_csv(users_csv, header=0, sep='\t')
print(user_df_30.shape)

Loading user from file: ../data/cb12/processed/users_14d_30_consider_user_encoded.csv
(111785, 21)


# Step 3: Load application data

In [6]:
def encode_categ_feature(value, encoder_dict):
    """Map ``value`` to its code in ``encoder_dict``.

    Values missing from ``encoder_dict`` are mapped to the code stored under
    the key returned by ``get_unfrequent_token()``.

    NOTE(review): ``get_unfrequent_token`` is not defined in the visible part
    of this notebook; on a fresh kernel the fallback branch would raise
    ``NameError`` -- confirm where it is supposed to come from.
    """
    if value not in encoder_dict:
        # Unknown value: fall back to the code of the "unfrequent" token.
        return encoder_dict[get_unfrequent_token()]
    return encoder_dict[value]

def transform_categorical_column(series, encoder):
    """Encode every element of ``series`` with ``encode_categ_feature``.

    Returns the result of ``series.apply``, i.e. one encoded value per input
    element, looked up in ``encoder``.
    """
    def _encode(raw_value):
        return encode_categ_feature(raw_value, encoder)

    return series.apply(_encode)

In [7]:
def _load_encoded_applications(csv_file):
    """Read a tab-separated applications file and append encoded job columns.

    Prints the frame shape right after reading and again after the four
    ``*_encoded`` columns have been added.
    """
    applications = pd.read_csv(csv_file, sep='\t')
    print(applications.shape)

    # Prefix the location columns with "Job".
    applications = applications.rename(
        columns={"City": "JobCity", "State": "JobState", "Country": "JobCountry"}
    )

    # One encoded column per job feature, using the matching encoder dict.
    for feature in ('JobID', 'JobCity', 'JobState', 'JobCountry'):
        applications[feature + '_encoded'] = transform_categorical_column(
            applications[feature], job_features_encoders_30[feature]
        )
    print(applications.shape)
    return applications


train_df_30 = _load_encoded_applications(processed_path + 'train_14d_30_consider_user.csv')

test_df_30 = _load_encoded_applications(processed_path + 'test_14d_30_consider_user.csv')

(586434, 18)
(586434, 22)
(52035, 18)
(52035, 22)


# Step 4: Add user information

In [8]:
# Attach the user columns to every application row, matching on UserID.
# No `how` is given, so pandas' default join type applies.
train_df_new_30 = train_df_30.merge(user_df_30, on='UserID')
test_df_new_30 = test_df_30.merge(user_df_30, on='UserID')

# Step 5: Prepare sessions

In [9]:
def prepare_session(data_df):
    """Group application rows by ``SessionID`` and build one dict per session.

    Each returned dict holds the session id, the first unique ``WindowID_x``,
    the number of rows in the session, the first unique ``StartDate``, the
    first unique ``UserID_encoded`` and, under ``'Clicks'``, one dict per row
    with the job and user features of that application.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain ``SessionID``, ``StartDate`` and every source column
        listed in ``click_fields`` below.

    Returns
    -------
    list of dict
        Sessions in the order produced by ``groupby('SessionID')``.
    """
    # (key in the click dict, column in data_df).
    # NOTE(review): 'WindowID_x' presumably is the suffixed name produced by
    # the merge with the user frame -- confirm both frames carry WindowID.
    click_fields = (
        ('JobID_encoded', 'JobID_encoded'),
        ('ApplicationDate', 'ApplicationDate'),
        ('WindowID', 'WindowID_x'),
        ('JobCity_encoded', 'JobCity_encoded'),
        ('JobState_encoded', 'JobState_encoded'),
        ('JobCountry_encoded', 'JobCountry_encoded'),
        ('UserID_encoded', 'UserID_encoded'),
        ('UserCity_encoded', 'UserCity_encoded'),
        ('UserState_encoded', 'UserState_encoded'),
        ('UserCountry_encoded', 'UserCountry_encoded'),
        ('UserDegree_encoded', 'UserDegree_encoded'),
        ('UserMajor_encoded', 'UserMajor_encoded'),
    )

    def _first_unique(group, column):
        # Session-level attributes are read from the first distinct value.
        return group[column].unique()[0]

    session_list = []
    for current_id, group in data_df.groupby('SessionID'):
        click_list = [
            {click_key: record[source_col] for click_key, source_col in click_fields}
            for _, record in group.iterrows()
        ]
        session_list.append({
            'SessionID': current_id,
            'WindowID': _first_unique(group, 'WindowID_x'),
            'SessionSize': len(group),
            'SessionStart': _first_unique(group, 'StartDate'),
            'UserID_encoded': _first_unique(group, 'UserID_encoded'),
            'Clicks': click_list,
        })
    return session_list

In [10]:
# Turn the merged application frames into lists of session dicts.
sessions_train_30 = prepare_session(train_df_new_30)
sessions_test_30 = prepare_session(test_df_new_30)

# Report how many sessions each split produced (train first, then test).
for session_list in (sessions_train_30, sessions_test_30):
    print(len(session_list))

153268
11759


# Step 6: Export sessions to JSON files (each file holds a single JSON array of sessions)

In [11]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    ``json`` calls ``default`` for objects it cannot serialize itself; this
    override handles NumPy booleans, integers, floats and arrays, and defers
    to the base class (which raises ``TypeError``) for everything else.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Raises
        ------
        TypeError
            Raised by ``json.JSONEncoder.default`` for unsupported types.
        """
        # np.bool_ is neither np.integer nor np.floating, so without this
        # branch a NumPy boolean would fall through and raise TypeError.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def write_list_to_json(list_data, json_path):
    """Write ``list_data`` to ``json_path`` as a single JSON document.

    The whole object is serialized with one ``json.dump`` call using
    ``NpEncoder``. The parent directory of ``json_path`` is created first if
    it does not exist yet, so the export does not fail on a fresh checkout.

    Parameters
    ----------
    list_data : object
        Data to serialize (here: the list of session dicts).
    json_path : str
        Destination file; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(json_path)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(json_path, 'w') as f:
        json.dump(list_data, f, cls=NpEncoder)
    

In [12]:
# Export each split to its own file, train first and then test.
# NOTE(review): these targets are hard-coded instead of being derived from
# `path` + `dataset`; they would need a manual edit if the dataset changes.
for session_list, target_file in (
    (sessions_train_30, "../data/cb12/sessions_json/train_14d_30_sessions"),
    (sessions_test_30, "../data/cb12/sessions_json/test_14d_30_sessions"),
):
    write_list_to_json(session_list, target_file)