# co-visitation matrix

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Actually invoke load_dotenv: a bare `load_dotenv` only references the
# function object and never reads the .env file into the environment.
load_dotenv()

# Make project utility modules importable; skip when UTILS_PATH is not set so
# that None is never pushed onto sys.path.
utils_path = os.getenv('UTILS_PATH')
if utils_path:
    sys.path.append(utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import cudf

In [2]:
# Fixed seed for the standard-library RNG so repeated runs draw the same numbers.
SEED = 42
random.seed(a=SEED)

In [3]:
# Directory locations come from environment variables; each is None when the
# corresponding variable is not set.
INPUT_DIR, OUTPUT_DIR, PREP_DIR = (
    os.getenv(var_name) for var_name in ("INPUT_DIR", "OUTPUT_DIR", "PREP_DIR")
)

In [4]:
# CHUNK_N: number of buckets sessions are split into (bucket = session % CHUNK_N)
#          so that each self-merge is processed separately.
# W:       upper bound of the time-decayed pair weight; the weight is computed
#          as (W - 1) * (1 - |dt| / one_day) + 1 in the processing loop.
CHUNK_N, W = 400, 3

In [5]:
def reduce_mem_usage(df, *, use_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place (columns are reassigned on ``df``) and the
    same object is returned. Memory usage before and after is printed.

    Conversion rules per column:

    * ``object``            -> ``category``
    * signed integers       -> smallest of int8/16/32/64 holding [min, max]
    * unsigned integers     -> smallest of uint8/16/32/64 holding [min, max]
    * floats                -> smallest *narrower* float type whose finite
      range strictly contains [min, max]; a column is never widened
    * anything else (bool, datetime, categorical, pandas extension dtypes)
      is left untouched

    Args:
        df: DataFrame to shrink.
        use_float16: When True (default, the historical behaviour) floats may
            be stored as float16. float16 keeps only about three significant
            decimal digits, so pass False when the values will be aggregated
            further or need more precision.

    Returns:
        The same DataFrame with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Bool,
        # datetime, categorical and extension dtypes would either fail on the
        # min/max comparison or be silently turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        # min()/max() of an empty column is NaN; nothing to decide on.
        if len(df[col]) == 0:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            lo, hi = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                info = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type limit still fits.
                if info.min <= lo and hi <= info.max:
                    if np.dtype(candidate) != col_type:
                        df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                # Never widen a float column.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    continue
                info = np.finfo(candidate)
                # NaN / inf extremes fail both comparisons and leave the
                # column at its current dtype.
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
# One run per entry: None uses every session file, while "weekN" leaves out
# test_sessions_weekN.pkl when the matrix is built.
weeks = [None, *(f"week{n}" for n in (3, 4))]

In [7]:
# Tunables for the pair-building loop below.
DAY_SECONDS = 24 * 60 * 60  # only events less than one day apart form a pair
TOP_K_PER_CHUNK = 30        # pairs kept per aid_x within a single chunk
FLUSH_EVERY = 100           # chunks accumulated before partial results are merged


def _merge_partials(frames):
    """Concatenate partial pair tables, sum ``wt`` per (aid_x, aid_y), downcast."""
    merged = pd.concat(frames)
    merged = merged.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()
    # NOTE(review): reduce_mem_usage may store wt as float16 here, so the
    # partial sums are rounded before they are summed again below.
    return reduce_mem_usage(merged)


for week in weeks:

    print(week)

    # Load data
    files = [
        "train_sessions_week1.pkl",
        "test_sessions_week1.pkl",
        "train_sessions_week2.pkl",
        "test_sessions_week2.pkl",
        "train_sessions_week3.pkl",
        "test_sessions_week3.pkl",
        "train_sessions_week4.pkl",
        "test_sessions_week4.pkl",
        "test_sessions.pkl"
    ]

    # For a specific week, leave that week's test sessions out of the matrix.
    if week is not None:
        files.remove(f"test_sessions_{week}.pkl")

    print(files)

    # os.path.join works whether or not PREP_DIR ends with a path separator.
    sessions = pd.concat(
        [pd.read_pickle(os.path.join(PREP_DIR, file)) for file in files]
    )

    sessions = sessions.drop(columns=["type"])
    # presumably ts is in milliseconds and becomes seconds here — TODO confirm
    sessions["ts"] = (sessions["ts"] / 1000).astype("int32")

    # Assign every session to one of CHUNK_N chunks
    sessions["chunk"] = sessions["session"] % CHUNK_N

    gc.collect()

    dfs_outer = []  # merged results, one entry per FLUSH_EVERY chunks
    dfs_inner = []  # per-chunk results waiting to be merged

    # total=CHUNK_N assumes every chunk id occurs at least once.
    for i, chunk_df in tqdm(sessions.groupby("chunk"), total=CHUNK_N):
        # Only these three columns are used after the merge; dropping the rest
        # keeps the GPU self-join small.
        df = cudf.from_pandas(chunk_df[["session", "aid", "ts"]])
        # Self-join: every ordered pair of events within the same session.
        df = df.merge(df, on="session")
        df = df.loc[((df.ts_x - df.ts_y).abs() < DAY_SECONDS) & (df.aid_x != df.aid_y)]
        # Linear time decay: weight is W for simultaneous events and
        # approaches 1 as the gap approaches one day.
        df["wt"] = (W - 1) * (1 - (df.ts_x - df.ts_y).abs() / DAY_SECONDS) + 1
        # Count each (session, aid_x, aid_y) once, keeping its largest weight.
        df = df.sort_values("wt").drop_duplicates(subset=["session", "aid_x", "aid_y"], keep="last")
        df = df.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()
        df = df.sort_values(["aid_x", "wt"], ascending=[True, False])
        # Keep only the strongest pairs per aid_x within this chunk.
        df = df[df.groupby("aid_x").cumcount() < TOP_K_PER_CHUNK]
        dfs_inner.append(df.to_pandas())

        # Merge periodically to bound the number of per-chunk frames held in memory.
        if i % FLUSH_EVERY == FLUSH_EVERY - 1:
            dfs_outer.append(_merge_partials(dfs_inner))
            dfs_inner = []
            gc.collect()

    # Merge whatever is left when the number of chunks is not a multiple of FLUSH_EVERY.
    if dfs_inner:
        dfs_outer.append(_merge_partials(dfs_inner))

    del dfs_inner, sessions
    gc.collect()

    pair_df = pd.concat(dfs_outer)
    del dfs_outer
    gc.collect()

    pair_df = pair_df.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()
    pair_df = pair_df.sort_values(["aid_x", "wt"], ascending=[True, False])

    pair_df = reduce_mem_usage(pair_df)

    if week is not None:
        out_name = f"co_visitation_matrix_time_weighted_{week}.pkl"
    else:
        out_name = "co_visitation_matrix_time_weighted.pkl"
    pair_df.to_pickle(os.path.join(PREP_DIR, out_name))

    del pair_df
    gc.collect()

None
['train_sessions_week1.pkl', 'test_sessions_week1.pkl', 'train_sessions_week2.pkl', 'test_sessions_week2.pkl', 'train_sessions_week3.pkl', 'test_sessions_week3.pkl', 'train_sessions_week4.pkl', 'test_sessions_week4.pkl', 'test_sessions.pkl']


 25%|██▍       | 99/400 [01:28<03:42,  1.36it/s] 

Memory usage of dataframe is 5405.24 MB


 25%|██▌       | 100/400 [03:24<2:56:14, 35.25s/it]

Memory usage after optimization is: 2252.18 MB
Decreased by 58.3%


 50%|████▉     | 199/400 [04:36<02:26,  1.37it/s]  

Memory usage of dataframe is 5394.08 MB


 50%|█████     | 200/400 [06:35<2:00:00, 36.00s/it]

Memory usage after optimization is: 2247.53 MB
Decreased by 58.3%


 75%|███████▍  | 299/400 [07:46<01:12,  1.40it/s]  

Memory usage of dataframe is 5385.55 MB


 75%|███████▌  | 300/400 [09:42<59:00, 35.40s/it]

Memory usage after optimization is: 2243.98 MB
Decreased by 58.3%


100%|█████████▉| 399/400 [10:53<00:00,  1.39it/s]

Memory usage of dataframe is 5399.59 MB
Memory usage after optimization is: 2249.83 MB
Decreased by 58.3%


100%|██████████| 400/400 [12:51<00:00,  1.93s/it]


Memory usage of dataframe is 19765.57 MB
Memory usage after optimization is: 13683.86 MB
Decreased by 30.8%
week3
['train_sessions_week1.pkl', 'test_sessions_week1.pkl', 'train_sessions_week2.pkl', 'test_sessions_week2.pkl', 'train_sessions_week3.pkl', 'train_sessions_week4.pkl', 'test_sessions_week4.pkl', 'test_sessions.pkl']


 25%|██▍       | 99/400 [01:17<03:02,  1.65it/s] 

Memory usage of dataframe is 4821.61 MB


 25%|██▌       | 100/400 [03:04<2:41:39, 32.33s/it]

Memory usage after optimization is: 2009.00 MB
Decreased by 58.3%


 50%|████▉     | 199/400 [04:05<02:03,  1.63it/s]  

Memory usage of dataframe is 4806.73 MB


 50%|█████     | 200/400 [05:50<1:46:39, 32.00s/it]

Memory usage after optimization is: 2002.80 MB
Decreased by 58.3%


 75%|███████▍  | 299/400 [06:52<01:01,  1.64it/s]  

Memory usage of dataframe is 4800.27 MB


 75%|███████▌  | 300/400 [08:37<53:33, 32.13s/it]

Memory usage after optimization is: 2000.11 MB
Decreased by 58.3%


100%|█████████▉| 399/400 [09:38<00:00,  1.61it/s]

Memory usage of dataframe is 4817.16 MB


100%|██████████| 400/400 [11:22<00:00,  1.71s/it]

Memory usage after optimization is: 2007.15 MB
Decreased by 58.3%





Memory usage of dataframe is 17679.10 MB
Memory usage after optimization is: 12239.38 MB
Decreased by 30.8%
week4
['train_sessions_week1.pkl', 'test_sessions_week1.pkl', 'train_sessions_week2.pkl', 'test_sessions_week2.pkl', 'train_sessions_week3.pkl', 'test_sessions_week3.pkl', 'train_sessions_week4.pkl', 'test_sessions.pkl']


 25%|██▍       | 99/400 [01:18<03:09,  1.58it/s] 

Memory usage of dataframe is 4810.28 MB


 25%|██▌       | 100/400 [03:06<2:45:03, 33.01s/it]

Memory usage after optimization is: 2004.28 MB
Decreased by 58.3%


 50%|████▉     | 199/400 [04:09<02:00,  1.67it/s]  

Memory usage of dataframe is 4807.40 MB


 50%|█████     | 200/400 [05:56<1:49:14, 32.77s/it]

Memory usage after optimization is: 2003.08 MB
Decreased by 58.3%


 75%|███████▍  | 299/400 [06:58<01:02,  1.62it/s]  

Memory usage of dataframe is 4798.44 MB


 75%|███████▌  | 300/400 [08:45<54:20, 32.60s/it]

Memory usage after optimization is: 1999.35 MB
Decreased by 58.3%


100%|█████████▉| 399/400 [09:47<00:00,  1.65it/s]

Memory usage of dataframe is 4809.31 MB


100%|██████████| 400/400 [11:33<00:00,  1.73s/it]

Memory usage after optimization is: 2003.88 MB
Decreased by 58.3%





Memory usage of dataframe is 17637.76 MB
Memory usage after optimization is: 12210.76 MB
Decreased by 30.8%
