In [1]:
import os
import sys
import subprocess
from dotenv import load_dotenv
# Actually invoke load_dotenv(): a bare `load_dotenv` only references the
# function object, so no .env file is read and the os.getenv() calls in this
# notebook would see only variables already set in the process environment.
load_dotenv()

# Only extend sys.path when UTILS_PATH is configured; os.getenv returns None
# for an unset variable and None is not a usable sys.path entry.
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing

import pandas as pd

In [2]:
# Directory settings read from the environment in one pass.
# Each name is bound to None when its variable is not set.
INPUT_DIR, OUTPUT_DIR, PREP_DIR = (
    os.getenv(var_name) for var_name in ("INPUT_DIR", "OUTPUT_DIR", "PREP_DIR")
)

In [3]:
def _chunk_process(chunk):
    event_dict = {
        "session": [],
        "aid": [],
        "ts": [],
        "type": []
    }
    for session, events in zip(chunk["session"].tolist(), chunk["events"].tolist()):
        for event in events:
            event_dict["session"].append(session)
            event_dict["aid"].append(event["aid"])
            event_dict["ts"].append(event["ts"])
            event_dict["type"].append(event["type"])
    chunk_session = pd.DataFrame(event_dict)
    return chunk_session

def _count_lines(filepath, bufsize=1 << 20):
    """Return the number of newline bytes in ``filepath`` (what ``wc -l`` reports)."""
    with open(filepath, "rb") as fh:
        return sum(buf.count(b"\n") for buf in iter(lambda: fh.read(bufsize), b""))


def jsonl2df(filepath, chunksize=100_000):
    """Load a JSON-lines session file into a flat, one-row-per-event DataFrame.

    The file is read in chunks of ``chunksize`` lines; each chunk is flattened
    by ``_chunk_process`` in a worker process and the results are concatenated.

    Parameters
    ----------
    filepath : str
        Path to the ``.jsonl`` file.
    chunksize : int, default 100_000
        Number of lines per chunk handed to a worker.

    Returns
    -------
    pandas.DataFrame
        Columns "session", "aid", "ts", "type", sorted by ("session", "ts")
        with a fresh 0..n-1 index. "session" and "aid" are converted to str.
        If no chunks are produced, an empty frame with those columns is
        returned.
    """
    # Count lines in pure Python instead of shelling out to `wc -l`: parsing
    # its output with split(' ')[0] breaks when the count is left-padded with
    # spaces, and the external binary is not available on every platform.
    line_count = _count_lines(filepath)
    # Ceiling division; used only as the progress-bar total.
    itr_n = -(-line_count // chunksize)

    chunks = pd.read_json(filepath, lines=True, chunksize=chunksize)
    processes = multiprocessing.cpu_count()

    with multiprocessing.Pool(processes=processes) as pool:
        # Chunks may finish in any order; the sort below restores a defined order.
        dfs = pool.imap_unordered(_chunk_process, chunks)
        dfs = list(tqdm(dfs, total=itr_n))

    # pd.concat raises on an empty list, so handle "no chunks" explicitly.
    if not dfs:
        return pd.DataFrame(columns=["session", "aid", "ts", "type"])

    sessions = pd.concat(dfs)
    sessions = sessions.sort_values(["session", "ts"])
    sessions = sessions.reset_index(drop=True)
    sessions["session"] = sessions["session"].astype(str)
    # Fix: the original assigned the stringified aid to the "session" column,
    # overwriting every session id; the conversion belongs on "aid".
    sessions["aid"] = sessions["aid"].astype(str)
    return sessions

In [4]:
# Build paths with os.path.join so they are correct whether or not
# INPUT_DIR / PREP_DIR end with a path separator (plain string concatenation
# silently produces e.g. "datatrain.jsonl" when the separator is missing).
train_sessions = jsonl2df(os.path.join(INPUT_DIR, "train.jsonl"))
train_sessions.to_pickle(os.path.join(PREP_DIR, "train_sessions.pkl"))

100%|██████████| 129/129 [14:33<00:00,  6.77s/it]


In [5]:
# Build paths with os.path.join so they are correct whether or not
# INPUT_DIR / PREP_DIR end with a path separator (plain string concatenation
# silently produces e.g. "datatest.jsonl" when the separator is missing).
test_sessions = jsonl2df(os.path.join(INPUT_DIR, "test.jsonl"))
test_sessions.to_pickle(os.path.join(PREP_DIR, "test_sessions.pkl"))

100%|██████████| 17/17 [00:46<00:00,  2.75s/it]
