# co-visitation matrix

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Read the .env file into os.environ. load_dotenv has to be *called*; a bare
# reference to the function is a no-op expression. Variables that are already
# set in the environment are left as they are (python-dotenv default).
load_dotenv()
# Make the project's utility modules importable. Skip the append when
# UTILS_PATH is unset so that None is never pushed onto sys.path.
utils_path = os.getenv('UTILS_PATH')
if utils_path:
    sys.path.append(utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import cudf

In [2]:
# Fixed seed for Python's built-in `random` module (only that generator is
# seeded here).
SEED = 42
random.seed(SEED)

In [3]:
# Directory settings taken from environment variables; each one is None when
# the corresponding variable is not set.
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
# PREP_DIR is joined to file names by plain string concatenation further down,
# so its value has to end with a path separator.
PREP_DIR = os.getenv("PREP_DIR")

In [4]:
# Number of buckets the sessions are split into (`session % CHUNK_N`); the
# buckets are processed one at a time.
CHUNK_N = 400
# Maximum weight of a co-visited pair: a pair with zero time gap gets weight W,
# and the weight falls linearly towards 1 as the gap approaches 24 hours.
W = 3

In [5]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * Signed / unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness that can hold the column's minimum
      and maximum (type limits included).
    * NumPy float columns become float32 when their range fits inside
      float32, otherwise float64. float16 is never chosen.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes, ...) is left untouched, so the function can safely be
      applied to a frame more than once.

    The memory usage before and after the conversion is printed.

    Args:
        df: pandas DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types per NumPy dtype kind, ordered from narrowest to widest.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer and float columns are downcast. Anything
        # else would either be corrupted by a float cast (bool, nullable
        # ints) or fail when min/max is compared with numeric limits
        # (datetime, category).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # min()/max() of an empty column is NaN: nothing to size against.
            if np.isnan(c_min) or np.isnan(c_max):
                continue
            # Compare as Python ints so that limits such as the uint64
            # maximum cannot overflow a NumPy scalar comparison.
            lowest, highest = int(c_min), int(c_max)
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN min/max (empty or all-NaN column) fails both comparisons
            # and therefore falls through to float64.
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-sized frame does not produce NaN.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [6]:
# Matrix variants to build. None uses every session file; a "weekN" entry
# leaves out that week's test-session file and adds the week name to the
# output file name.
weeks = [None, "week3", "week4"]

In [7]:
def _sum_pair_weights(frames):
    """Concatenate partial pair tables and sum ``wt`` per (aid_x, aid_y)."""
    merged = pd.concat(frames)
    return merged.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()


# Build one time-weighted co-visitation matrix per entry in `weeks` and write
# each of them to PREP_DIR as a parquet file.
for week in weeks:

    print(week)

    # Load the data.
    files = [
        "train_sessions_week1.parquet",
        "test_sessions_week1.parquet",
        "train_sessions_week2.parquet",
        "test_sessions_week2.parquet",
        "train_sessions_week3.parquet",
        "test_sessions_week3.parquet",
        "train_sessions_week4.parquet",
        "test_sessions_week4.parquet",
        "test_sessions.parquet",
    ]

    # Leave out the test sessions of the named week.
    # NOTE(review): presumably so that week can serve as a hold-out — confirm.
    if week is not None:
        files.remove(f"test_sessions_{week}.parquet")

    print(files)

    sessions = pd.concat([pd.read_parquet(PREP_DIR + file) for file in files])

    sessions = sessions.drop(columns=["type"])
    # NOTE(review): the division by 1000 suggests ts arrives in milliseconds
    # and is turned into seconds here — confirm against the input files.
    sessions["ts"] = (sessions["ts"]/1000).astype("int32")

    # Assign every session to one of CHUNK_N buckets.
    sessions["chunk"] = sessions["session"] % CHUNK_N

    gc.collect()

    dfs_outer = []  # partial sums, one entry per flushed batch of buckets
    dfs_inner = []  # per-bucket results waiting to be flushed

    for i, chunk_df in tqdm(sessions.groupby("chunk"), total=CHUNK_N):
        df = cudf.from_pandas(chunk_df)
        # Pair every event with every other event of the same session.
        df = df.merge(df, on="session")
        # Keep pairs of different aids whose events are less than 24h apart.
        df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)]
        # Weight is W for a zero time gap and decays linearly towards 1 at 24h.
        df["wt"] = (W - 1) * (1 - (df.ts_x - df.ts_y).abs() / (24*60*60)) + 1
        # Count each (session, aid_x, aid_y) once, keeping its largest weight.
        df = df.sort_values("wt").drop_duplicates(subset=["session", "aid_x", "aid_y"], keep="last")
        df = df.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()
        # Keep only the 30 heaviest partners of each aid_x within this bucket.
        df = df.sort_values(["aid_x", "wt"], ascending=(True, False))
        df = df[df.groupby("aid_x").cumcount()<30]
        dfs_inner.append(df.to_pandas())

        # `i` is the bucket key; collapse the pending results after buckets
        # 99, 199, ... reduce_mem_usage also downcasts the partial sums.
        if i % 100 == 99:
            pair_df_inner = reduce_mem_usage(_sum_pair_weights(dfs_inner))
            dfs_outer.append(pair_df_inner)
            dfs_inner = []
            del pair_df_inner
            gc.collect()

    # Flush whatever is left when the last bucket key did not trigger a flush.
    if dfs_inner:
        dfs_outer.append(_sum_pair_weights(dfs_inner))

    del dfs_inner
    gc.collect()

    # Combine the batch-level partial sums into the final matrix, ordered by
    # aid_x and then by descending weight.
    pair_df = _sum_pair_weights(dfs_outer)
    pair_df = pair_df.sort_values(["aid_x", "wt"], ascending=(True, False))

    pair_df = reduce_mem_usage(pair_df)

    # The file name carries the week suffix only for the per-week variants.
    suffix = "" if week is None else f"_{week}"
    pair_df.to_parquet(PREP_DIR + f"co_visitation_matrix_time_weighted{suffix}.parquet")

    del pair_df
    gc.collect()

None
['train_sessions_week1.parquet', 'test_sessions_week1.parquet', 'train_sessions_week2.parquet', 'test_sessions_week2.parquet', 'train_sessions_week3.parquet', 'test_sessions_week3.parquet', 'train_sessions_week4.parquet', 'test_sessions_week4.parquet', 'test_sessions.parquet']


 25%|██▍       | 99/400 [01:25<03:36,  1.39it/s] 

Memory usage of dataframe is 5405.24 MB
Memory usage after optimization is: 2702.62 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [04:40<02:18,  1.45it/s]  

Memory usage of dataframe is 5394.08 MB
Memory usage after optimization is: 2697.04 MB
Decreased by 50.0%


 75%|███████▍  | 299/400 [07:49<01:11,  1.42it/s]  

Memory usage of dataframe is 5385.55 MB
Memory usage after optimization is: 2692.77 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [10:56<00:00,  1.49it/s]

Memory usage of dataframe is 5399.59 MB
Memory usage after optimization is: 2699.80 MB
Decreased by 50.0%


100%|██████████| 400/400 [12:52<00:00,  1.93s/it]


Memory usage of dataframe is 21286.00 MB
Memory usage after optimization is: 15204.29 MB
Decreased by 28.6%
week3
['train_sessions_week1.parquet', 'test_sessions_week1.parquet', 'train_sessions_week2.parquet', 'test_sessions_week2.parquet', 'train_sessions_week3.parquet', 'train_sessions_week4.parquet', 'test_sessions_week4.parquet', 'test_sessions.parquet']


 25%|██▍       | 99/400 [01:20<03:03,  1.64it/s] 

Memory usage of dataframe is 4821.61 MB


 25%|██▌       | 100/400 [03:13<2:51:50, 34.37s/it]

Memory usage after optimization is: 2410.81 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [04:13<01:58,  1.70it/s]  

Memory usage of dataframe is 4806.73 MB


 50%|█████     | 200/400 [06:03<1:51:16, 33.38s/it]

Memory usage after optimization is: 2403.36 MB
Decreased by 50.0%


 75%|███████▍  | 299/400 [07:02<00:59,  1.70it/s]  

Memory usage of dataframe is 4800.27 MB


 75%|███████▌  | 300/400 [08:49<54:11, 32.51s/it]

Memory usage after optimization is: 2400.14 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [09:48<00:00,  1.69it/s]

Memory usage of dataframe is 4817.16 MB


100%|██████████| 400/400 [11:39<00:00,  1.75s/it]

Memory usage after optimization is: 2408.58 MB
Decreased by 50.0%





Memory usage of dataframe is 19039.03 MB
Memory usage after optimization is: 13599.31 MB
Decreased by 28.6%
week4
['train_sessions_week1.parquet', 'test_sessions_week1.parquet', 'train_sessions_week2.parquet', 'test_sessions_week2.parquet', 'train_sessions_week3.parquet', 'test_sessions_week3.parquet', 'train_sessions_week4.parquet', 'test_sessions.parquet']


 25%|██▍       | 99/400 [01:28<02:58,  1.68it/s] 

Memory usage of dataframe is 4810.28 MB


 25%|██▌       | 100/400 [03:15<2:42:14, 32.45s/it]

Memory usage after optimization is: 2405.14 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [04:15<02:00,  1.67it/s]  

Memory usage of dataframe is 4807.40 MB


 50%|█████     | 200/400 [06:08<1:53:51, 34.16s/it]

Memory usage after optimization is: 2403.70 MB
Decreased by 50.0%


 75%|███████▍  | 299/400 [07:07<00:59,  1.68it/s]  

Memory usage of dataframe is 4798.44 MB


 75%|███████▌  | 300/400 [08:56<54:44, 32.85s/it]

Memory usage after optimization is: 2399.22 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [09:55<00:00,  1.67it/s]

Memory usage of dataframe is 4809.31 MB


100%|██████████| 400/400 [11:44<00:00,  1.76s/it]

Memory usage after optimization is: 2404.65 MB
Decreased by 50.0%





Memory usage of dataframe is 18994.51 MB
Memory usage after optimization is: 13567.51 MB
Decreased by 28.6%
