Example user features

- how many items has user already clicked
- how many items has user already ordered
- what is average hour that user clicks
- what is average hour that user orders
- how many real sessions does user have (a real session is defined by the time gap between activities)
- what is average number of items in each user real session
- what is last day of week user made activity (i.e. monday, tuesday)
- what is first day of week user made activity
- what is average time between clicks

Example item features

- has this item already been clicked by user
- has this item already been added to cart by user
- if already clicked, what is its relative order? 1 means last clicked, 2 means second to last clicked etc
- has user clicked this item multiple times already? how many
- how many items (that user has already clicked) have recommended this item with their co-visitation matrix
- on what date was this item first seen in train
- how many times was this item clicked in train
- what is the average hour of day that this item is clicked
- what is the average hour of day that this item is ordered
- how popular is this item on monday (i.e. what percentage of monday clicks are this item)
- how popular is this item on tuesday
- what is the most common day of week this item is clicked
- count up all unique items that were clicked immediately before and after. How many unique items have been clicked immediately before and after. (For example, maybe item only has 10 unique items that get clicked before and after. Whereas another item has 1000 unique items clicked before and after)
- what percentage of users click this item more than once
- has this item ever been bought in train data

In [18]:
from datetime import timedelta
from os.path import join

import pandas as pd

In [19]:
# Path of the directory the feature parquet files are written to
OUTPUT_DIR = "."

In [20]:
# Load the dataset
df = pd.read_parquet("/kaggle/input/otto-train-and-test-data-for-local-validation/train.parquet")
# Keep only the first 1000 rows
# NOTE(review): presumably a small sample for fast experimentation - remove for a full run
df = df[:1000]
df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
995,14,183758,1659568161,0
996,14,634673,1659568184,0
997,14,634673,1659568240,0
998,14,634673,1659568247,1


# Feature creation

In [21]:
def check_features(df, feature_df):
    """
    Sanity-check a freshly built feature DataFrame.

    Raises AssertionError when `feature_df` contains a missing value,
    contains a fully duplicated row, or does not have exactly one row per
    unique (session, aid) pair found in `df`.
    """
    # No missing value anywhere in the frame
    has_missing = feature_df.isna().to_numpy().any()
    assert not has_missing, "NaN is included"

    # No row may appear twice
    has_repeated_row = feature_df.duplicated().any()
    assert not has_repeated_row, "There are duplicates"

    # Row count must equal the number of unique (session, aid) pairs in the source frame
    n_expected = len(df[["session", "aid"]].drop_duplicates())
    assert len(feature_df) == n_expected, "Length is incorrect"

In [22]:
# how many items has user already clicked
# (how many items has user already carted)
# how many items has user already ordered
def count_items_by_session(df, output_dir):
    """
    Export the number of distinct items clicked, carted and ordered per session.

    For each event type (0, 1, 2 in the order clicked, carted, ordered) the
    distinct `aid` count of every session is attached to each unique
    (session, aid) pair of `df` and written to `<output_dir>/<name>.parquet`.
    Sessions without any event of the given type get 0.
    """
    feature_names = (
        "n_items_by_session_clicked",
        "n_items_by_session_carted",
        "n_items_by_session_ordered",
    )
    session_items = df[["session", "aid"]]

    for event_type, feature_name in enumerate(feature_names):
        # Distinct items per session, restricted to the current event type
        events = df[df["type"] == event_type]
        per_session = (
            events.groupby("session")["aid"].nunique().rename(feature_name).reset_index()
        )

        # Broadcast the session-level value to every (session, aid) pair
        merged = session_items.merge(per_session, on="session", how="left")
        feature_df = merged.fillna(0).drop_duplicates(["session", "aid"])

        # export
        check_features(df, feature_df)
        target_path = join(output_dir, feature_name + ".parquet")
        feature_df.to_parquet(target_path, index=False)

In [23]:
# Export the per-session distinct item counts (clicked / carted / ordered) to OUTPUT_DIR
count_items_by_session(df, OUTPUT_DIR)

In [24]:
def count_past_items(df, output_dir):
    """
    Export running per-session item counters for clicks, carts and orders.

    For each event type (0, 1, 2 in the order clicked, carted, ordered) the
    first occurrence of every (session, aid, type) triple is numbered
    0, 1, 2, ... within its session, in the row order of `df`. That number is
    forward-filled over the following rows of the same session, and rows that
    precede the first such event get 0. The value kept for a (session, aid)
    pair is the one at the row where the pair first appears in `df`.

    One file `<output_dir>/<name>.parquet` is written per event type.

    NOTE(review): assumes `df` has a unique index and that rows are in
    chronological order within each session - confirm against the loader.
    """
    keys = ["session", "aid"]
    names = ["n_items_already_clicked", "n_items_already_carted", "n_items_already_ordered"]

    # Copying avoids a pandas warning when columns are added below
    # (original author's note: may this copy put pressure on memory?)
    base_df = df[keys].copy()
    # Only the first occurrence of each (session, aid, type) advances a counter
    first_events = df.drop_duplicates(subset=["session", "aid", "type"])

    for type_number, name in enumerate(names):
        typed_events = first_events[first_events["type"] == type_number]
        # Zero-based rank of each first-time event inside its session; keeps df's row index
        rank = typed_events.groupby("session").cumcount()

        # Align on the FULL-length frame. De-duplicating before this step would
        # discard the rows that carry the cart/order counters (an item is usually
        # clicked before it is carted, so the cart row is a repeated pair).
        base_df[name] = rank
        base_df[name] = base_df.groupby("session")[name].ffill().fillna(0)
        feature_df = base_df.drop_duplicates(keys)

        # export
        check_features(df, feature_df)
        feature_df.to_parquet(join(output_dir, name + ".parquet"), index=False)

        # Reset the working frame for the next event type
        base_df.drop(name, axis=1, inplace=True)

In [25]:
# Export the running "already clicked / carted / ordered" counters to OUTPUT_DIR
count_past_items(df, OUTPUT_DIR)

In [26]:
# what is average hour that user clicks
# (what is average hour that user carts)
# what is average hour that user orders
def average_hour(df, output_dir):
    """
    Export the mean hour of day of each action type per session.

    For each event type (0, 1, 2 in the order clicks, carts, orders) the hour
    of every event is averaged per session and written, once per unique
    (session, aid) pair, to `<output_dir>/<name>.parquet`. Sessions without
    any event of the given type get -1.

    Original author's note: is this feature meaningful? The mode might be a
    better choice than the mean.
    """
    names = ["average_hour_clicks", "average_hour_carts", "average_hour_orders"]
    session_items = df[["session", "aid"]]

    for event_type, name in enumerate(names):
        events = df[df["type"] == event_type]

        # `ts` is read as epoch seconds and shifted by two hours before taking the hour
        # NOTE(review): the +2h looks like a UTC -> local time correction - confirm
        shifted = pd.to_datetime(events["ts"], unit="s") + timedelta(hours=2)
        hours = pd.DataFrame({"session": events["session"], name: shifted.dt.hour})
        per_session = hours.groupby("session")[name].mean().reset_index()

        # -1 marks sessions in which the action never occurs
        merged = pd.merge(session_items, per_session, on="session", how="left")
        feature_df = merged.fillna(-1).drop_duplicates(["session", "aid"])

        # export
        check_features(df, feature_df)
        feature_df.to_parquet(join(output_dir, name + ".parquet"), index=False)

In [27]:
# Export the per-session average hour of clicks / carts / orders to OUTPUT_DIR
average_hour(df, OUTPUT_DIR)

In [28]:
# how many real sessions does user have (real session define by time gap between activity)
def count_real_sessions(df, output_dir, sec_threshold=1800):
    """
    Export the number of "real sessions" contained in each session.

    A real session is a run of consecutive actions whose time gaps do not
    exceed `sec_threshold`. The count is therefore the number of gaps larger
    than `sec_threshold` in the session, plus one. The value is written once
    per unique (session, aid) pair to `<output_dir>/<name>.parquet`.

    Original author's note: instead of a count, would it be better to assign
    an index to each split-off real session?
    """
    name = f"n_real_sessions_within_{sec_threshold}s"

    # Difference of `ts` between consecutive rows of the same session
    gaps = pd.DataFrame({
        "session": df["session"],
        name: df.groupby("session")["ts"].diff(),
    })
    # Each gap above the threshold starts a new real session
    long_gaps = gaps[gaps[name] > sec_threshold]
    per_session = (long_gaps.groupby("session")[name].count() + 1).reset_index()

    # Sessions without any long gap consist of exactly one real session
    merged = pd.merge(df[["session", "aid"]], per_session, on="session", how="left")
    feature_df = merged.fillna(1).drop_duplicates(["session", "aid"])

    # export
    check_features(df, feature_df)
    feature_df.to_parquet(join(output_dir, name + ".parquet"), index=False)

In [29]:
# Export the real-session counts (default threshold of 1800 seconds) to OUTPUT_DIR
count_real_sessions(df, OUTPUT_DIR)

In [30]:
# Read the exported feature file back to inspect it
# (the path is relative to the working directory, which is where OUTPUT_DIR = "." writes)
pd.read_parquet("n_real_sessions_within_1800s.parquet")

Unnamed: 0,session,aid,n_real_sessions_within_1800s
0,0,1517085,41.0
1,0,1563459,41.0
2,0,1309446,41.0
3,0,16246,41.0
4,0,1781822,41.0
...,...,...,...
592,14,130225,6.0
593,14,1522105,6.0
594,14,183758,6.0
595,14,634673,6.0


In [31]:
# # what is average number of items in each user real session
# def average_n_items_by_real_sessions(df, output_dir, sec_threshold=1800):
#     """
#     Average number of items per real session
    
#     NOTE: sec_threshold must match the value used for count_real_sessions
#     """
#     name = f"average_n_items_by_real_sessions_within_{sec_threshold}s"
    

In [32]:
# what is last day of week user made activity (i.e. monday, tuesday)
def last_day_of_week(df, output_dir):
    """
    Export the weekday of the last action of each session, as a number.

    The `ts` of the last row of every session is converted to a weekday
    (pandas convention: Monday=0 ... Sunday=6) and written once per unique
    (session, aid) pair to `<output_dir>/<name>.parquet`.
    """
    name = "day_of_week_made_last_activity"

    # `ts` of the last row of each session, read as epoch seconds
    last_ts = df.groupby("session")["ts"].last()
    # NOTE(review): the +2h looks like a UTC -> local time correction - confirm
    shifted = pd.to_datetime(last_ts, unit="s") + timedelta(hours=2)
    per_session = shifted.dt.weekday.rename(name).reset_index()

    session_items = df[["session", "aid"]]
    merged = session_items.merge(per_session, on="session", how="left")
    feature_df = merged.drop_duplicates(["session", "aid"])

    # export
    check_features(df, feature_df)
    feature_df.to_parquet(join(output_dir, name + ".parquet"), index=False)

In [33]:
# Export the weekday of each session's last action to OUTPUT_DIR
last_day_of_week(df, OUTPUT_DIR)

In [34]:
# what is first day of week user made activity
def first_day_of_week(df, output_dir):
    """
    Export the weekday of the first action of each session, as a number.

    The `ts` of the first row of every session is converted to a weekday
    (pandas convention: Monday=0 ... Sunday=6) and written once per unique
    (session, aid) pair to `<output_dir>/<name>.parquet`.

    NOTE(review): "first" means first in row order, so this assumes `df` is
    sorted chronologically within each session - confirm against the loader.
    """
    name = "day_of_week_made_first_activity"

    # Must be first(), not last(): last() would just reproduce the
    # "last activity" weekday under this feature's name.
    ts = df.groupby("session")["ts"].first()
    # NOTE(review): the +2h looks like a UTC -> local time correction - confirm
    ts = pd.to_datetime(ts, unit="s") + timedelta(hours=2)
    feature_df = pd.DataFrame(
        ts.dt.weekday
    ).rename(columns={"ts": name}).reset_index()
    feature_df = pd.merge(
        df[["session", "aid"]],
        feature_df,
        on="session",
        how="left"
    ).drop_duplicates(["session", "aid"])

    # export
    check_features(df, feature_df)
    feature_df.to_parquet(join(output_dir, name + ".parquet"), index=False)

In [35]:
# Export the weekday of each session's first action to OUTPUT_DIR
first_day_of_week(df, OUTPUT_DIR)

In [36]:
# what is average time between clicks
def average_time_between_clicks(df, output_dir):
    """
    Export the mean time between consecutive clicks of each session.

    Only rows with type 0 (clicks) are considered. The `ts` differences
    between consecutive clicks of a session are averaged and written once per
    unique (session, aid) pair to `<output_dir>/<name>.parquet`. Sessions with
    fewer than two clicks have no interval to average and get -1.

    Original author's notes:
    - Carts and orders between two clicks are ignored; whether that is
      acceptable still needs to be considered.
    - The same feature is not computed for carts and orders; maybe it is
      unnecessary because those events are few.
    """
    name = "average_time_between_clicks"

    # ts difference between consecutive clicks of the same session (NaN on the first click)
    clicks = df[df["type"] == 0]
    diff = clicks.groupby("session")["ts"].diff().rename("diff")

    # Non-click rows carry NaN after the concat, and mean() skips NaN values
    feature_df = pd.concat([df["session"], diff], axis=1)
    feature_df = feature_df.groupby(
        "session"
    ).agg({"diff": "mean"}).rename(columns={"diff": name}).reset_index()

    # Without the fill, any session with fewer than two clicks leaves a NaN
    # that fails the NaN assertion in check_features.
    feature_df = pd.merge(
        df[["session", "aid"]],
        feature_df,
        on="session",
        how="left"
    ).fillna(-1).drop_duplicates(["session", "aid"])

    # export
    check_features(df, feature_df)
    feature_df.to_parquet(join(output_dir, name + ".parquet"), index=False)

In [37]:
# Export the per-session average time between clicks to OUTPUT_DIR
average_time_between_clicks(df, OUTPUT_DIR)

In [38]:
# has this item already been clicked by user
