In [1]:
import os
import json
import random
import pickle
import argparse
import matplotlib
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import defaultdict, Counter

In [2]:
# Root of the data tree and the dataset sub-folder. Both keep their trailing
# slash because the stage folders below are assembled by plain string joining.
path = "../data/"
dataset = "cb12/"

# One folder per pipeline stage, each ending with a trailing slash.
raw_path = "{}{}raw/".format(path, dataset)
interim_path = "{}{}interim/".format(path, dataset)
processed_path = "{}{}processed/".format(path, dataset)

# Step 1: Load sessions from JSON file

In [3]:
def load_sessions_from_json(json_path):
    """Load and return the content of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere and would garble non-ASCII text.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [4]:
# Load the pre-built train and test session dumps.
sessions_train_30 = load_sessions_from_json(
    "../data/cb12/sessions_json/train_14d_30_sessions"
)
sessions_test_30 = load_sessions_from_json(
    "../data/cb12/sessions_json/test_14d_30_sessions"
)
# Sanity check: size of each loaded split, one value per line.
print(len(sessions_train_30), len(sessions_test_30), sep="\n")

153268
11759


# Step 2: Process session features

In [5]:
def process_session_clicks_features(list_sessions):
    """Flatten per-click attributes into per-session lists and compute click stats.

    Every session must be a dict with a 'Clicks' entry holding a list of click
    dicts, and every click must have a 'JobID_encoded' key. All click
    attributes except 'UserID_encoded' and 'WindowID' are gathered, in click
    order, into a list stored under the same key of the flattened session.

    The input sessions are not modified: a flattened (shallow) copy is built
    for each one, so the function can safely be re-run on the same data.

    Args:
        list_sessions: Iterable of session dicts.

    Returns:
        A tuple (sessions_df, stats, clicks_by_jobs_counter) where
        sessions_df is a DataFrame with one row per session, sorted by
        'WindowID' and without the 'Clicks' column; stats is a dict of summary
        figures (also printed); clicks_by_jobs_counter is a plain dict mapping
        each 'JobID_encoded' value to its number of clicks.

    Raises:
        KeyError: If a session has no 'Clicks' key or a click has no
            'JobID_encoded' key.
    """
    # Click attributes that are NOT turned into per-session lists.
    skipped_click_keys = ("UserID_encoded", "WindowID")

    sessions = []
    job_click_counts = Counter()

    for session in list_sessions:
        # Work on a copy without 'Clicks' so the caller's dicts stay intact.
        flat_session = {key: value for key, value in session.items() if key != 'Clicks'}

        for click in session['Clicks']:
            # Copying click attributes as lists in the session
            for key, value in click.items():
                if key not in skipped_click_keys:
                    flat_session.setdefault(key, []).append(value)

            job_click_counts[click['JobID_encoded']] += 1

        sessions.append(flat_session)

    # Ensuring sessions are sorted by WindowID (time). A stable sort keeps the
    # input order of sessions sharing a WindowID, making the result reproducible.
    sessions_df = pd.DataFrame(sessions).sort_values('WindowID', kind='mergesort')

    # Popularity stats: share of all clicks received by each distinct job.
    clicks_by_jobs_counter = dict(job_click_counts)
    clicks_by_jobs = np.array(list(clicks_by_jobs_counter.values()))
    total_clicks = np.sum(clicks_by_jobs)
    clicks_by_jobs_norm = clicks_by_jobs / total_clicks

    session_count = len(sessions)
    unique_jobs = len(clicks_by_jobs_counter)

    stats = {'session_count': session_count,
             'clicks': total_clicks,
             'clicks_by_session': total_clicks / session_count,
             'unique_jobs': unique_jobs,
             'clicks_by_job': float(total_clicks) / unique_jobs,
             'norm_pop_mean': np.mean(clicks_by_jobs_norm),
             'norm_pop_median': np.median(clicks_by_jobs_norm),
    }

    print("Stats :{}".format(stats))
    return sessions_df, stats, clicks_by_jobs_counter

In [6]:
# Flatten each split into a DataFrame and collect its click statistics
# (the stats dict of each call is printed by the function itself).
(sessions_train_df_30,
 train_stats_30,
 train_clicks_by_jobs_counter_30) = process_session_clicks_features(sessions_train_30)

(sessions_test_df_30,
 test_stats_30,
 test_clicks_by_jobs_counter_30) = process_session_clicks_features(sessions_test_30)

Stats :{'session_count': 153268, 'clicks': 586434, 'clicks_by_session': 3.8261998590703867, 'unique_jobs': 207972, 'clicks_by_job': 2.81977381570596, 'norm_pop_mean': 4.808339584174792e-06, 'norm_pop_median': 1.7052217299815496e-06}
Stats :{'session_count': 11759, 'clicks': 52035, 'clicks_by_session': 4.425121183774131, 'unique_jobs': 13166, 'clicks_by_job': 3.9522254291356522, 'norm_pop_mean': 7.595321282090232e-05, 'norm_pop_median': 3.843566830018257e-05}


# Step 3: Export sessions to tf records

In [7]:
import sys
from tensorflow.python.lib.io import tf_record

In [8]:
def make_sequential_feature(values, vtype=int):
    """Wrap a sequence of scalars into a tf.train.FeatureList, one Feature per value.

    Args:
        values: Iterable of scalars forming one sequence feature.
        vtype: Python type selecting the encoding: int -> Int64List,
            float -> FloatList, str -> BytesList (each string is converted
            with str.encode()).

    Returns:
        A tf.train.FeatureList holding one single-valued Feature per element
        of values.

    Raises:
        ValueError: If vtype is not int, float or str.
    """
    # Fail early with a clear message; otherwise an unsupported type would fall
    # through every branch and surface as an UnboundLocalError on `features`.
    if vtype not in (int, float, str):
        raise ValueError(
            "Unsupported vtype {!r}: expected int, float or str".format(vtype))

    if vtype == int:
        features = [tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) for value in values]
    elif vtype == float:
        features = [tf.train.Feature(float_list=tf.train.FloatList(value=[value])) for value in values]
    else:
        features = [tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()])) for value in values]
    return tf.train.FeatureList(feature=features)



def make_sequence_example(row):
    """Convert one session row into a tf.train.SequenceExample.

    Args:
        row: Mapping-like object indexable by column name (the export code
            in this notebook passes rows produced by DataFrame.iterrows()).
            It must provide the scalar columns 'SessionID', 'SessionSize',
            'SessionStart' and 'UserID_encoded' plus the per-click list
            columns named in the table below.

    Returns:
        A tf.train.SequenceExample whose context holds the four scalar
        columns as int64 features and whose feature lists hold every
        per-click column encoded as int64.
    """
    def int64_scalar(column):
        # Single-valued int64 feature read from the given row column.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[row[column]]))

    # (exported feature name, source column) pairs, in export order.
    context_columns = (
        ('SessionID', 'SessionID'),
        ('SessionSize', 'SessionSize'),
        ('SessionStart', 'SessionStart'),
        ('UserID', 'UserID_encoded'),
    )
    sequence_columns = (
        # Encoded as int64 like every other column here; presumably an
        # integer timestamp (TODO confirm against the session builder).
        ('ApplicationDate', 'ApplicationDate'),
        # Categorical features
        ('Job_clicked', 'JobID_encoded'),
        ('JobCity', 'JobCity_encoded'),
        ('JobState', 'JobState_encoded'),
        ('JobCountry', 'JobCountry_encoded'),
        ('UserCity', 'UserCity_encoded'),
        ('UserState', 'UserState_encoded'),
        ('UserCountry', 'UserCountry_encoded'),
        ('UserDegree', 'UserDegree_encoded'),
        ('UserMajor', 'UserMajor_encoded'),
    )

    context = tf.train.Features(
        feature={name: int64_scalar(column) for name, column in context_columns})

    feature_lists = tf.train.FeatureLists(
        feature_list={name: make_sequential_feature(row[column])
                      for name, column in sequence_columns})

    return tf.train.SequenceExample(feature_lists=feature_lists, context=context)


def save_rows_to_tf_record_file(rows, make_sequence_example_fn, export_filename):
    """Serialize rows into a single GZIP-compressed TFRecord file.

    Args:
        rows: Iterable of rows; each one is handed to make_sequence_example_fn.
        make_sequence_example_fn: Callable turning one row into an object
            exposing SerializeToString().
        export_filename: Path of the TFRecord file to write.
    """
    gzip_options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
    writer = tf_record.TFRecordWriter(export_filename, options=gzip_options)

    try:
        # Rows are converted and written one at a time, never all held in memory.
        serialized_examples = (make_sequence_example_fn(row).SerializeToString() for row in rows)
        for serialized in serialized_examples:
            writer.write(serialized)
    finally:
        # Always release the writer, even when a row fails to convert.
        writer.close()
        sys.stdout.flush()

        
def export_sessions_to_tf_records(sessions_df, output_path):
    """Write every row of sessions_df to a TFRecord file at output_path.

    Args:
        sessions_df: DataFrame with one session per row; each row is
            converted with make_sequence_example.
        output_path: Destination path of the TFRecord file.
    """
    # iterrows() yields (index, row) pairs; only the row is exported.
    session_rows = (row for _index, row in sessions_df.iterrows())
    save_rows_to_tf_record_file(session_rows, make_sequence_example, export_filename=output_path)

In [10]:
# Write each split to its own TFRecord file under sessions_tf/.
export_sessions_to_tf_records(
    sessions_train_df_30,
    "../data/cb12/sessions_tf/train_14d_30_sessions",
)
export_sessions_to_tf_records(
    sessions_test_df_30,
    "../data/cb12/sessions_tf/test_14d_30_sessions",
)
# Shown as the cell output: (rows, columns) of the exported test frame.
sessions_test_df_30.shape

(11759, 15)