# pkl2parquet
pklファイルをparquetファイルに変換

In [11]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from dotenv import load_dotenv
load_dotenv
sys.path.append(os.getenv('UTILS_PATH'))
from tqdm import tqdm
import multiprocessing
import inspect

import pandas as pd
import numpy as np
import polars as pl
import itertools
import cudf
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import line_notify
import my_logger
from noglobal import noglobal

# 設定

In [12]:
@dataclass
class Cfg:
    loglevel = "INFO"
    exp_name = ""
    seed = 42
    k = 20
    cand_n = 15
    negative_sample = 1
    train_chunk_n = 1
    test_chunk_n = 2
    type2id = {"clicks":0, "carts":1, "orders":2}
    id2type = {0:"clicks", 1:"carts", 2:"orders"}
    train_weeks = ["week3"]
    valid_week = "week4"
    valid_session_n = 100_000
    input_dir = os.getenv('INPUT_DIR')
    output_dir = os.getenv('OUTPUT_DIR')
    prep_dir = os.getenv("PREP_DIR")

    clicks_params = {'objective': 'binary', 'boosting': 'gbdt', 'learning_rate': 0.1, 'metric': 'binary_logloss', 'seed': 42, 'feature_pre_filter': False, 'lambda_l1': 5.485903737168179, 'lambda_l2': 0.005594683492536064, 'num_leaves': 79, 'feature_fraction': 0.552, 'bagging_fraction': 0.9295272232672004, 'bagging_freq': 2, 'min_child_samples': 10}
    carts_params = {'objective': 'binary', 'boosting': 'gbdt', 'learning_rate': 0.1, 'metric': 'binary_logloss', 'seed': 42, 'feature_pre_filter': False, 'lambda_l1': 8.709050252544463, 'lambda_l2': 0.06935262036337767, 'num_leaves': 252, 'feature_fraction': 0.4, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 5}
    orders_params = {'objective': 'binary', 'boosting': 'gbdt', 'learning_rate': 0.1, 'metric': 'binary_logloss', 'seed': 42, 'feature_pre_filter': False, 'lambda_l1': 9.356310279757256, 'lambda_l2': 1.3120983078968551e-08, 'num_leaves': 174, 'feature_fraction': 0.5, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20}

cfg = Cfg()
random.seed(cfg.seed)

In [13]:
def pkl2parquet(pkl_file_path, parquet_file_path):
    print(pkl_file_path, "start")
    df = pd.read_pickle(pkl_file_path)
    df.to_parquet(parquet_file_path)
    print(parquet_file_path, "end")

In [14]:
pkl_files = [
    "train_sessions",
    "test_sessions"
]

weeks = [
    None,
    "week1",
    "week2",
    "week3",
    "week4"
]

for pkl_file in pkl_files:
    for week in weeks:
        if week is None:
            pkl_file_path = cfg.prep_dir + f"{pkl_file}.pkl"
            parquet_file_path = cfg.prep_dir + f"{pkl_file}.parquet"
        else:
            pkl_file_path = cfg.prep_dir + f"{pkl_file}_{week}.pkl"
            parquet_file_path = cfg.prep_dir + f"{pkl_file}_{week}.parquet"
        pkl2parquet(pkl_file_path, pkl_file_path)

/mnt/otto-recommender-system/prep/train_sessions.pkl start
/mnt/otto-recommender-system/prep/train_sessions.pkl end
/mnt/otto-recommender-system/prep/train_sessions_week1.pkl start
/mnt/otto-recommender-system/prep/train_sessions_week1.pkl end
/mnt/otto-recommender-system/prep/train_sessions_week2.pkl start
/mnt/otto-recommender-system/prep/train_sessions_week2.pkl end
/mnt/otto-recommender-system/prep/train_sessions_week3.pkl start
/mnt/otto-recommender-system/prep/train_sessions_week3.pkl end
/mnt/otto-recommender-system/prep/train_sessions_week4.pkl start
/mnt/otto-recommender-system/prep/train_sessions_week4.pkl end
/mnt/otto-recommender-system/prep/test_sessions.pkl start
/mnt/otto-recommender-system/prep/test_sessions.pkl end
/mnt/otto-recommender-system/prep/test_sessions_week1.pkl start
/mnt/otto-recommender-system/prep/test_sessions_week1.pkl end
/mnt/otto-recommender-system/prep/test_sessions_week2.pkl start
/mnt/otto-recommender-system/prep/test_sessions_week2.pkl end
/mnt/o

In [21]:
pkl_files = [
    #"co_visitation_matrix",
    #"co_visitation_matrix_time_weighted",
    #"co_visitation_matrix_type_weighted",
    #"co_visitation_matrix_clicks2carts",
    #"co_visitation_matrix_clicks2orders",
    #"co_visitation_matrix_1w",
    #"co_visitation_matrix_time_weighted_1w",
    "co_visitation_matrix_type_weighted_1w",
    "co_visitation_matrix_clicks2carts_1w",
    "co_visitation_matrix_clicks2orders_1w",
    ]

weeks = [
    None,
    "week3",
    "week4"
]

for pkl_file in pkl_files:
    for week in weeks:
        if week is None:
            pkl_file_path = cfg.prep_dir + f"{pkl_file}.pkl"
            parquet_file_path = cfg.prep_dir + f"{pkl_file}.parquet"
        else:
            pkl_file_path = cfg.prep_dir + f"{pkl_file}_{week}.pkl"
            parquet_file_path = cfg.prep_dir + f"{pkl_file}_{week}.parquet"
        pkl2parquet(pkl_file_path, pkl_file_path)

/mnt/otto-recommender-system/prep/co_visitation_matrix_type_weighted_1w.pkl start
/mnt/otto-recommender-system/prep/co_visitation_matrix_type_weighted_1w.pkl end
/mnt/otto-recommender-system/prep/co_visitation_matrix_type_weighted_1w_week3.pkl start
/mnt/otto-recommender-system/prep/co_visitation_matrix_type_weighted_1w_week3.pkl end
/mnt/otto-recommender-system/prep/co_visitation_matrix_type_weighted_1w_week4.pkl start
/mnt/otto-recommender-system/prep/co_visitation_matrix_type_weighted_1w_week4.pkl end
/mnt/otto-recommender-system/prep/co_visitation_matrix_clicks2carts_1w.pkl start
/mnt/otto-recommender-system/prep/co_visitation_matrix_clicks2carts_1w.pkl end
/mnt/otto-recommender-system/prep/co_visitation_matrix_clicks2carts_1w_week3.pkl start
/mnt/otto-recommender-system/prep/co_visitation_matrix_clicks2carts_1w_week3.pkl end
/mnt/otto-recommender-system/prep/co_visitation_matrix_clicks2carts_1w_week4.pkl start
/mnt/otto-recommender-system/prep/co_visitation_matrix_clicks2carts_1w_

In [None]:
pkl_files = [
    "word2vec_similar",
    "w2v_vector_n5",
    "w2v_vector_n50",
]

for pkl_file in pkl_files:
    if week is None:
        pkl_file_path = cfg.prep_dir + f"{pkl_file}.pkl"
        parquet_file_path = cfg.prep_dir + f"{pkl_file}.parquet"
    else:
        pkl_file_path = cfg.prep_dir + f"{pkl_file}_{week}.pkl"
        parquet_file_path = cfg.prep_dir + f"{pkl_file}_{week}.parquet"
    pkl2parquet(pkl_file_path, pkl_file_path)

In [22]:
pkl_files = [
    "labels"
]

weeks = [
    "week1",
    "week2",
    "week3",
    "week4"
]

for pkl_file in pkl_files:
    for week in weeks:
        if week is None:
            pkl_file_path = cfg.prep_dir + f"{pkl_file}.pkl"
            parquet_file_path = cfg.prep_dir + f"{pkl_file}.parquet"
        else:
            pkl_file_path = cfg.prep_dir + f"{pkl_file}_{week}.pkl"
            parquet_file_path = cfg.prep_dir + f"{pkl_file}_{week}.parquet"
        pkl2parquet(pkl_file_path, parquet_file_path)

/mnt/otto-recommender-system/prep/labels_week1.pkl start
/mnt/otto-recommender-system/prep/labels_week1.parquet end
/mnt/otto-recommender-system/prep/labels_week2.pkl start
/mnt/otto-recommender-system/prep/labels_week2.parquet end
/mnt/otto-recommender-system/prep/labels_week3.pkl start
/mnt/otto-recommender-system/prep/labels_week3.parquet end
/mnt/otto-recommender-system/prep/labels_week4.pkl start
/mnt/otto-recommender-system/prep/labels_week4.parquet end
