# co-visitation matrix

In [17]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# The function must actually be *called*: a bare `load_dotenv` expression is a
# no-op, which would leave every os.getenv(...) lookup in this notebook
# dependent on whatever the launching shell happened to export.
load_dotenv()

# Make the project's helper modules importable, but only when a location is
# configured (os.getenv returns None for an unset variable).
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import cudf

In [18]:
# Fixed seed so that anything drawing from Python's built-in `random` module
# is repeatable between runs. Only the stdlib generator is seeded here.
SEED = 42
random.seed(SEED)

In [19]:
# Directory locations come from environment variables; os.getenv yields None
# for any variable that is not set.
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
# PREP_DIR is where the main loop reads the session parquet files from and
# writes the resulting co-visitation matrices to.
PREP_DIR = os.getenv("PREP_DIR")

In [20]:
# Sessions are bucketed by `session % CHUNK_N`; the main loop then processes
# one bucket at a time.
CHUNK_N = 400
# Largest time-decay weight: a pair visited with zero time gap gets weight W,
# falling linearly towards 1 as the gap approaches 24 hours.
W = 3

In [21]:
def reduce_mem_usage(df):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes.

    The frame is modified in place and also returned.

    Column handling:
      * signed integers   -> smallest of int8/16/32/64 that holds [min, max]
      * unsigned integers -> smallest of uint8/16/32/64 that holds [min, max]
      * floats            -> float32 when [min, max] fits, otherwise float64
                             (half precision is never used: it keeps only
                             about three significant digits)
      * object            -> category
      * anything else (bool, datetime, categorical, extension dtypes) is left
        untouched, since a numeric range check is meaningless or unsafe there.

    Range checks are inclusive, so a column whose extreme equals a type's
    limit (e.g. 127 for int8) still gets the narrow type. Columns with no
    comparable min/max (empty or all-NaN) keep their integer dtype, or fall
    back to float64 for floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are range-checked. Testing
        # dtype.kind (rather than the dtype's string prefix) keeps unsigned
        # and boolean columns from being mistaken for floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            f32 = np.finfo(np.float32)
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out of float32 range, or NaN/inf extremes.
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # A NaN extreme (empty column) fails every comparison, so the
                # column is simply left as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio: a frame can report zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [22]:
# One co-visitation matrix is built per entry: None selects
# test_sessions.parquet, a week name selects train_sessions_<week>.parquet.
weeks = [None, "week3", "week4"]

In [23]:
# Leftover inspection of `pair_df`. That frame is only created inside the
# per-week loop further down and is deleted again at the end of every
# iteration, so on a fresh kernel the name does not exist here. Guard the
# lookup so that Restart & Run All does not stop with a NameError.
if "pair_df" in globals():
    print(pair_df.dtypes)

aid_x      int32
aid_y      int32
wt       float16
dtype: object

In [24]:
SECONDS_PER_DAY = 24 * 60 * 60  # pairing window and weight-decay horizon
TOP_K_PER_CHUNK = 30            # neighbours kept per aid_x within one chunk
FLUSH_EVERY = 100               # collapse per-chunk tables after this many chunks


def _chunk_pair_weights(chunk_df):
    """Return the time-weighted (aid_x, aid_y, wt) table for one chunk.

    The chunk is self-joined on `session` with cuDF. Pairs of different aids
    less than 24 hours apart get a weight that decays linearly from W (zero
    gap) towards 1 (gap near 24 h). Each (session, aid_x, aid_y) contributes
    its highest weight once, weights are summed per (aid_x, aid_y), and only
    the TOP_K_PER_CHUNK heaviest partners per aid_x are kept. The result is
    handed back as a pandas DataFrame.
    """
    # The bucketing column is not used below; dropping it before the transfer
    # keeps it from being duplicated by the self-join.
    df = cudf.from_pandas(chunk_df.drop(columns=["chunk"], errors="ignore"))
    df = df.merge(df, on="session")
    df = df.loc[((df.ts_x - df.ts_y).abs() < SECONDS_PER_DAY) & (df.aid_x != df.aid_y)]
    df["wt"] = (W - 1) * (1 - (df.ts_x - df.ts_y).abs() / SECONDS_PER_DAY) + 1
    # Sorting by weight and keeping the last duplicate retains the largest
    # weight for every (session, aid_x, aid_y).
    df = df.sort_values("wt").drop_duplicates(subset=["session", "aid_x", "aid_y"], keep="last")
    df = df.groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()
    df = df.sort_values(["aid_x", "wt"], ascending=(True, False))
    df = df[df.groupby("aid_x").cumcount() < TOP_K_PER_CHUNK]
    return df.to_pandas()


def _sum_pair_weights(frames):
    """Concatenate partial pair tables and add up `wt` per (aid_x, aid_y)."""
    return pd.concat(frames).groupby(["aid_x", "aid_y"])["wt"].sum().reset_index()


# Build and save one co-visitation matrix per entry of `weeks`.
for week in weeks:
    if week is not None:
        file_ = f"train_sessions_{week}.parquet"
        out_name = f"co_visitation_matrix_time_weighted_1w_{week}.parquet"
    else:
        file_ = "test_sessions.parquet"
        out_name = "co_visitation_matrix_time_weighted_1w.parquet"
    print(week, file_)
    # os.path.join works whether or not PREP_DIR ends with a separator.
    sessions = pd.read_parquet(os.path.join(PREP_DIR, file_))

    sessions = sessions.drop(columns=["type"])
    # Scale ts down by 1000 (presumably ms -> s, matching the window that is
    # expressed in seconds -- confirm against the data preparation step).
    sessions["ts"] = (sessions["ts"] / 1000).astype("int32")

    # Assign every session to one of CHUNK_N buckets.
    sessions["chunk"] = sessions["session"] % CHUNK_N

    dfs_outer = []  # merged tables, one per FLUSH_EVERY chunks
    dfs_inner = []  # per-chunk tables waiting to be merged

    for i, chunk_df in tqdm(sessions.groupby("chunk"), total=CHUNK_N):
        dfs_inner.append(_chunk_pair_weights(chunk_df))

        # Periodically collapse the pending tables so host memory stays bounded.
        if i % FLUSH_EVERY == FLUSH_EVERY - 1:
            dfs_outer.append(reduce_mem_usage(_sum_pair_weights(dfs_inner)))
            dfs_inner = []
            gc.collect()

    # Chunks left over when the last bucket id did not trigger a flush.
    if len(dfs_inner) > 0:
        dfs_outer.append(_sum_pair_weights(dfs_inner))
        gc.collect()

    del dfs_inner
    gc.collect()

    pair_df = _sum_pair_weights(dfs_outer)
    pair_df = pair_df.sort_values(["aid_x", "wt"], ascending=(True, False))

    pair_df = reduce_mem_usage(pair_df)

    pair_df.to_parquet(os.path.join(PREP_DIR, out_name))

    del pair_df
    gc.collect()

None test_sessions.parquet


 24%|██▍       | 98/400 [00:09<00:29, 10.41it/s]

Memory usage of dataframe is 196.45 MB
Memory usage after optimization is: 98.23 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [00:21<00:19, 10.27it/s]

Memory usage of dataframe is 196.06 MB
Memory usage after optimization is: 98.03 MB
Decreased by 50.0%


 74%|███████▍  | 298/400 [00:32<00:09, 11.27it/s]

Memory usage of dataframe is 196.05 MB
Memory usage after optimization is: 98.03 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [00:44<00:00, 13.40it/s]

Memory usage of dataframe is 195.47 MB
Memory usage after optimization is: 97.74 MB
Decreased by 50.0%


100%|██████████| 400/400 [00:47<00:00,  8.41it/s]


Memory usage of dataframe is 835.77 MB
Memory usage after optimization is: 596.98 MB
Decreased by 28.6%
week3 train_sessions_week3.parquet


 25%|██▍       | 99/400 [00:15<00:40,  7.35it/s]

Memory usage of dataframe is 843.18 MB


 25%|██▌       | 100/400 [00:28<19:59,  4.00s/it]

Memory usage after optimization is: 421.59 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [00:42<00:30,  6.59it/s]

Memory usage of dataframe is 841.36 MB


 50%|█████     | 200/400 [00:55<13:15,  3.98s/it]

Memory usage after optimization is: 420.68 MB
Decreased by 50.0%


 75%|███████▍  | 299/400 [01:08<00:12,  8.22it/s]

Memory usage of dataframe is 836.85 MB


 75%|███████▌  | 300/400 [01:21<06:38,  3.99s/it]

Memory usage after optimization is: 418.42 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [01:35<00:00,  8.13it/s]

Memory usage of dataframe is 845.30 MB


100%|██████████| 400/400 [01:48<00:00,  3.68it/s]

Memory usage after optimization is: 422.65 MB
Decreased by 50.0%





Memory usage of dataframe is 3486.76 MB
Memory usage after optimization is: 2490.54 MB
Decreased by 28.6%
week4 train_sessions_week4.parquet


 25%|██▍       | 99/400 [00:15<00:40,  7.43it/s]

Memory usage of dataframe is 823.08 MB


 25%|██▌       | 100/400 [00:27<19:17,  3.86s/it]

Memory usage after optimization is: 411.54 MB
Decreased by 50.0%


 50%|████▉     | 199/400 [00:41<00:26,  7.63it/s]

Memory usage of dataframe is 821.83 MB


 50%|█████     | 200/400 [00:53<12:57,  3.89s/it]

Memory usage after optimization is: 410.92 MB
Decreased by 50.0%


 75%|███████▍  | 299/400 [01:07<00:13,  7.34it/s]

Memory usage of dataframe is 820.84 MB


 75%|███████▌  | 300/400 [01:19<06:22,  3.82s/it]

Memory usage after optimization is: 410.42 MB
Decreased by 50.0%


100%|█████████▉| 399/400 [01:33<00:00,  7.25it/s]

Memory usage of dataframe is 825.62 MB


100%|██████████| 400/400 [01:45<00:00,  3.78it/s]

Memory usage after optimization is: 412.81 MB
Decreased by 50.0%





Memory usage of dataframe is 3428.70 MB
Memory usage after optimization is: 2449.07 MB
Decreased by 28.6%
