# co-visitation matrix

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Call load_dotenv (the original only referenced the function object, which is
# a no-op) so variables from the .env file are available to os.getenv below.
load_dotenv()
# Make the project's utility modules importable. Skip when UTILS_PATH is
# unset so that None is not pushed onto sys.path.
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import cudf

In [2]:
# Fixed seed for the standard-library random generator.
SEED = 42
random.seed(a=SEED)

In [3]:
# Directory locations come from environment variables; each is None when the
# corresponding variable is not set.
INPUT_DIR, OUTPUT_DIR, PREP_DIR = (
    os.environ.get(name) for name in ("INPUT_DIR", "OUTPUT_DIR", "PREP_DIR")
)

In [4]:
# Number of buckets the sessions are split into (session id modulo CHUNK_N).
CHUNK_N = 400
# Weight substituted for each event type before pair weights are summed.
type_weight = dict(clicks=1, carts=6, orders=3)

In [5]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Column handling:
      * ``object`` columns are converted to ``category``.
      * Signed / unsigned NumPy integer columns are cast to the smallest
        integer type of the same signedness whose range contains the column's
        min and max (bounds inclusive).
      * NumPy float columns are cast to float16 / float32 / float64 purely by
        value range. NOTE: this can lose precision, because only the range is
        checked and not whether the values are exactly representable.
      * Every other dtype (bool, datetime, categorical, pandas extension
        dtypes, ...) is left untouched.

    Memory usage before and after is printed.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns are downcast. Anything else
        # would either fail on min()/max() comparisons or be silently turned
        # into a float (e.g. bool or unsigned columns).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-sized frame cannot raise ZeroDivisionError.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

In [6]:
# Session sets to process: None selects the test sessions file, the other
# entries select the matching train_sessions_<week> file.
weeks = [None] + [f"week{n}" for n in (3, 4)]

In [7]:
# Tunables for the pair-building loop below.
PAIR_WINDOW = 24 * 60 * 60  # max |ts_x - ts_y| for two events to form a pair
TOP_PAIRS_PER_CHUNK = 30    # rows kept per aid_x inside a single chunk
FLUSH_EVERY = 100           # fold buffered chunk results every this many chunk ids


def _sum_pair_weights(frames):
    """Concatenate pair frames and sum ``wt`` per (aid_x, aid_y)."""
    merged = pd.concat(frames)
    return merged.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()


def _chunk_pairs(chunk_df):
    """Return the weighted item pairs of one chunk as a pandas DataFrame.

    The chunk is self-joined on ``session`` with cudf, keeping pairs of
    different aids whose timestamps differ by less than ``PAIR_WINDOW``. The
    weight of a pair is the sum of the ``type`` values of its right-hand
    events. Only the ``TOP_PAIRS_PER_CHUNK`` heaviest pairs per ``aid_x`` are
    kept.
    """
    df = cudf.from_pandas(chunk_df)
    df = df.merge(df, on="session")
    df = df.loc[((df.ts_x - df.ts_y).abs() < PAIR_WINDOW) & (df.aid_x != df.aid_y)]
    df = df.rename(columns={"type_y": "wt"})
    df = df.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()
    df = df.sort_values(["aid_x", "wt"], ascending=(True, False))
    df = df[df.groupby("aid_x").cumcount() < TOP_PAIRS_PER_CHUNK]
    return df.to_pandas()


for week in weeks:
    if week is not None:
        file_ = f"train_sessions_{week}.pkl"
        out_file = f"co_visitation_matrix_type_weighted_1w_{week}.pkl"
    else:
        file_ = "test_sessions.pkl"
        out_file = "co_visitation_matrix_type_weighted_1w.pkl"
    print(week, file_)
    # os.path.join keeps the path valid whether or not PREP_DIR ends with a
    # separator. NOTE: read_pickle unpickles arbitrary objects; only load
    # files produced by this project's own preprocessing.
    sessions = pd.read_pickle(os.path.join(PREP_DIR, file_))

    # Replace the event type by its numeric weight.
    sessions["type"] = sessions["type"].map(type_weight)
    # Scale timestamps by 1/1000 (presumably ms -> s, TODO confirm) so they
    # fit in int32 and are comparable with PAIR_WINDOW.
    sessions["ts"] = (sessions["ts"]/1000).astype("int32")

    # Assign every session to one of CHUNK_N chunk groups.
    sessions["chunk"] = sessions["session"] % CHUNK_N

    dfs_outer = []
    dfs_inner = []

    for i, chunk_df in tqdm(sessions.groupby("chunk"), total=CHUNK_N):
        dfs_inner.append(_chunk_pairs(chunk_df))

        # Periodically collapse the buffered chunk results to bound memory.
        if i % FLUSH_EVERY == FLUSH_EVERY - 1:
            dfs_outer.append(reduce_mem_usage(_sum_pair_weights(dfs_inner)))
            dfs_inner = []
            gc.collect()

    # Fold whatever is left after the last periodic flush.
    if dfs_inner:
        dfs_outer.append(reduce_mem_usage(_sum_pair_weights(dfs_inner)))

    del dfs_inner
    gc.collect()

    # Final totals. Because truncation happened per chunk, a pair contributes
    # only from chunks where it ranked within TOP_PAIRS_PER_CHUNK for its aid_x.
    pair_df = _sum_pair_weights(dfs_outer)
    pair_df = pair_df.sort_values(["aid_x", "wt"], ascending=(True, False))

    pair_df = reduce_mem_usage(pair_df)

    # NOTE(review): the file name says "1w" while PAIR_WINDOW is 24 hours;
    # confirm what "1w" is meant to denote.
    pair_df.to_pickle(os.path.join(PREP_DIR, out_file))

    del pair_df
    gc.collect()

None test_sessions.pkl


 25%|██▍       | 99/400 [00:07<00:23, 12.97it/s]

Memory usage of dataframe is 197.50 MB
Memory usage after optimization is: 82.29 MB
Decreased by 58.3%


 50%|█████     | 200/400 [00:20<01:36,  2.07it/s]

Memory usage of dataframe is 197.39 MB
Memory usage after optimization is: 82.25 MB
Decreased by 58.3%


 75%|███████▌  | 300/400 [00:29<00:49,  2.03it/s]

Memory usage of dataframe is 197.19 MB
Memory usage after optimization is: 82.16 MB
Decreased by 58.3%


100%|██████████| 400/400 [00:39<00:00, 10.13it/s]

Memory usage of dataframe is 196.80 MB
Memory usage after optimization is: 82.00 MB
Decreased by 58.3%





Memory usage of dataframe is 784.65 MB
Memory usage after optimization is: 543.22 MB
Decreased by 30.8%
week3 train_sessions_week3.pkl


 25%|██▍       | 99/400 [00:13<00:31,  9.66it/s]

Memory usage of dataframe is 854.90 MB


 25%|██▌       | 101/400 [00:27<13:09,  2.64s/it]

Memory usage after optimization is: 356.21 MB
Decreased by 58.3%


 50%|████▉     | 199/400 [00:37<00:22,  9.13it/s]

Memory usage of dataframe is 853.99 MB


 50%|█████     | 200/400 [00:51<11:19,  3.40s/it]

Memory usage after optimization is: 355.83 MB
Decreased by 58.3%


 75%|███████▍  | 299/400 [01:01<00:09, 10.21it/s]

Memory usage of dataframe is 849.02 MB
Memory usage after optimization is: 353.76 MB
Decreased by 58.3%


100%|█████████▉| 399/400 [01:26<00:00, 10.02it/s]

Memory usage of dataframe is 858.02 MB


100%|██████████| 400/400 [01:39<00:00,  4.02it/s]

Memory usage after optimization is: 357.51 MB
Decreased by 58.3%





Memory usage of dataframe is 3310.45 MB
Memory usage after optimization is: 2291.85 MB
Decreased by 30.8%
week4 train_sessions_week4.pkl


 24%|██▍       | 98/400 [00:12<00:34,  8.69it/s]

Memory usage of dataframe is 834.40 MB


 25%|██▌       | 100/400 [00:26<15:41,  3.14s/it]

Memory usage after optimization is: 417.20 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [00:37<00:21,  9.24it/s]

Memory usage of dataframe is 833.21 MB


 50%|█████     | 200/400 [00:50<11:50,  3.55s/it]

Memory usage after optimization is: 416.60 MB
Decreased by 50.0%


 75%|███████▍  | 299/400 [01:01<00:10,  9.33it/s]

Memory usage of dataframe is 832.50 MB


 75%|███████▌  | 300/400 [01:15<05:34,  3.35s/it]

Memory usage after optimization is: 416.25 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [01:26<00:00,  9.03it/s]

Memory usage of dataframe is 836.40 MB


100%|██████████| 400/400 [01:39<00:00,  4.01it/s]

Memory usage after optimization is: 418.20 MB
Decreased by 50.0%





Memory usage of dataframe is 3502.27 MB
Memory usage after optimization is: 2501.62 MB
Decreased by 28.6%
