# co-visitation matrix

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Actually call load_dotenv(): a bare `load_dotenv` expression is a no-op, so
# the .env file was never read. With its default arguments the call does not
# override variables that are already set in the process environment.
load_dotenv()
# Extend the import path with the directory named by UTILS_PATH, skipping the
# append when the variable is unset so that `None` never ends up in sys.path.
utils_path = os.getenv('UTILS_PATH')
if utils_path:
    sys.path.append(utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import cudf

In [2]:
# Fixed seed for reproducibility. Only the standard-library `random` module is
# seeded here; NumPy's generator is not.
SEED = 42
random.seed(SEED)

In [3]:
# Data locations, read from environment variables (os.getenv returns None when
# a variable is unset).
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
# PREP_DIR is concatenated directly with file names further down
# (`PREP_DIR + file`), so its value must end with a path separator.
PREP_DIR = os.getenv("PREP_DIR")

In [4]:
# Number of chunks the sessions are split into; a session is assigned to
# chunk `session % CHUNK_N`, and the chunks are processed one at a time.
CHUNK_N = 1000
# Integer codes substituted for the event-type names in the "type" column.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [5]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Per column:

    * signed / unsigned integers are cast to the narrowest integer type of the
      same signedness that can hold the column's min and max (bounds
      inclusive);
    * floats are cast to float16, else float32, else float64, based on the
      column's value range only -- NOTE: float16/float32 can lose precision;
    * ``object`` columns are converted to ``category``;
    * every other dtype (bool, datetime, timedelta, pandas extension dtypes
      such as ``category`` or nullable integers) is left untouched.

    Memory usage before and after, and the relative reduction, are printed.

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy dtypes are handled; extension dtypes are skipped
        # because min()/max() or the target casts may not be valid for them.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            # Keep signedness: unsigned columns must not fall through to the
            # float branch (which would turn e.g. uint8 into float16).
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # For an empty column min()/max() are NaN, both comparisons
                # are False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # Any other kind (bool, datetime64, timedelta64, ...) is left as is.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the percentage is always defined.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [6]:
# One co-visitation matrix is built per entry. None uses every session file;
# "weekN" leaves out test_sessions_weekN.pkl and adds "_weekN" to the output
# file name.
weeks = [None, "week3", "week4"]

In [7]:
def _sum_pair_counts(frames):
    """Concatenate pair-count frames and sum `cnt` per (aid_x, aid_y) pair."""
    combined = pd.concat(frames)
    return combined.groupby(["aid_x", "aid_y"])["cnt"].sum().reset_index()


# All session files that can be loaded; at most one test file is held out per run.
session_files = [
    "train_sessions_week1.pkl",
    "test_sessions_week1.pkl",
    "train_sessions_week2.pkl",
    "test_sessions_week2.pkl",
    "train_sessions_week3.pkl",
    "test_sessions_week3.pkl",
    "train_sessions_week4.pkl",
    "test_sessions_week4.pkl",
    "test_sessions.pkl",
]

# Build one clicks -> carts co-visitation matrix per entry of `weeks`: for each
# (aid_x, aid_y) pair, count how often a click on aid_x and a cart of aid_y
# occur in the same session with timestamps less than 24*60*60 apart.
for week in weeks:

    print(week)

    # Load the data, leaving out the held-out week's test sessions if any.
    # list.remove raises ValueError for an unknown week name.
    files = list(session_files)
    if week is not None:
        files.remove(f"test_sessions_{week}.pkl")
    print(files)

    sessions = pd.concat([pd.read_pickle(PREP_DIR + file) for file in files])

    # presumably converts milliseconds to seconds -- TODO confirm the input unit
    sessions["ts"] = (sessions["ts"]/1000).astype("int32")

    # Assign each session to a chunk so the self-join is done piecewise.
    sessions["chunk"] = sessions["session"] % CHUNK_N

    # Encode event types as integers and keep only clicks (0) and carts (1).
    sessions["type"] = sessions["type"].map(type_labels)
    sessions = sessions[sessions["type"].isin([0,1])].copy()
    gc.collect()

    dfs_outer = []  # partial aggregates, one per batch of 100 chunks
    dfs_inner = []  # per-chunk pair counts waiting to be aggregated

    for i, chunk_df in tqdm(sessions.groupby("chunk"), total=CHUNK_N):
        gdf = cudf.from_pandas(chunk_df)
        # Join clicks (left, *_x) to carts (right, *_y) within a session. This
        # yields the same rows as self-joining the whole chunk and then
        # filtering on type_x == 0 and type_y == 1, but never materialises the
        # click/click, cart/cart and cart/click combinations.
        clicks = gdf[gdf["type"] == 0]
        carts = gdf[gdf["type"] == 1]
        df = clicks.merge(carts, on="session")
        df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)]
        df = df.groupby(["aid_x", "aid_y"])["session"].count().reset_index()
        df = df.rename(columns = {"session": "cnt"})
        # NOTE(review): this threshold is applied per chunk, before counts are
        # summed across chunks, so a pair can be dropped even if its total
        # count over all chunks exceeds 2 -- confirm this is intended.
        df = df[df["cnt"]>2]
        dfs_inner.append(df.to_pandas())

        # Collapse every 100 chunks into one frame to bound host memory.
        if i % 100 == 99:
            dfs_outer.append(_sum_pair_counts(dfs_inner))
            dfs_inner = []
            gc.collect()

    # Aggregate whatever is left after the last full batch.
    if dfs_inner:
        dfs_outer.append(_sum_pair_counts(dfs_inner))

    # The raw sessions are no longer needed; release them before the final
    # aggregation and before the next iteration loads its own data.
    del dfs_inner, sessions
    gc.collect()

    pair_df = _sum_pair_counts(dfs_outer)
    pair_df = pair_df.sort_values(["aid_x", "cnt"], ascending=(True, False))

    pair_df = reduce_mem_usage(pair_df)

    if week is not None:
        pair_df.to_pickle(PREP_DIR + f"co_visitation_matrix_clicks2carts_{week}.pkl")
    else:
        pair_df.to_pickle(PREP_DIR + "co_visitation_matrix_clicks2carts.pkl")

    del pair_df, dfs_outer
    gc.collect()

None
['train_sessions_week1.pkl', 'test_sessions_week1.pkl', 'train_sessions_week2.pkl', 'test_sessions_week2.pkl', 'train_sessions_week3.pkl', 'test_sessions_week3.pkl', 'train_sessions_week4.pkl', 'test_sessions_week4.pkl', 'test_sessions.pkl']


100%|██████████| 1000/1000 [02:35<00:00,  6.44it/s]


Memory usage of dataframe is 702.29 MB
Memory usage after optimization is: 501.63 MB
Decreased by 28.6%
week3
['train_sessions_week1.pkl', 'test_sessions_week1.pkl', 'train_sessions_week2.pkl', 'test_sessions_week2.pkl', 'train_sessions_week3.pkl', 'train_sessions_week4.pkl', 'test_sessions_week4.pkl', 'test_sessions.pkl']


100%|██████████| 1000/1000 [02:14<00:00,  7.42it/s]


Memory usage of dataframe is 590.12 MB
Memory usage after optimization is: 421.51 MB
Decreased by 28.6%
week4
['train_sessions_week1.pkl', 'test_sessions_week1.pkl', 'train_sessions_week2.pkl', 'test_sessions_week2.pkl', 'train_sessions_week3.pkl', 'test_sessions_week3.pkl', 'train_sessions_week4.pkl', 'test_sessions.pkl']


100%|██████████| 1000/1000 [02:16<00:00,  7.32it/s]


Memory usage of dataframe is 592.66 MB
Memory usage after optimization is: 423.33 MB
Decreased by 28.6%
