Example user features

- how many items has user already clicked
- how many items has user already ordered
- what is average hour that user clicks
- what is average hour that user orders
- how many real sessions does user have (real session define by time gap between activity)
- what is average number of items in each user real session
- what is last day of week user made activity (i.e. monday, tuesday)
- what is first day of week user made activity
- what is average time between clicks

Example item features

- has this item already been clicked by user
- has this item already been added to cart by user
- if already clicked, what is its relative order? 1 means last clicked, 2 means second to last clicked etc
- has user clicked this item multiple times already? how many
- how many items (that user has already clicked) have recommended this item with their co-visitation matrix
- when was date that this item was first seen in train
- how many times what this item clicked in train
- what is the average hour of day that this item is clicked
- what is the average hour of day that this item is ordered
- how popular is this item on monday (i.e. what percentage of monday clicks are this item)
- how popular is this item on tuesday
- what is the most common day of week this item is clicked
- count up all unique items that were clicked immediately before and after. How many unique items have been clicked immediately before and after. (For example, maybe item only has 10 unique items that get clicked before and after. Whereas another item has 1000 unique items clicked before and after)
- what percentage of users click this item more than once
- has this item ever been bought in train data

In [55]:
from datetime import timedelta

import pandas as pd

In [56]:
df = pd.read_parquet("/kaggle/input/otto-train-and-test-data-for-local-validation/train.parquet")
df = df[:1000]
df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
995,14,183758,1659568161,0
996,14,634673,1659568184,0
997,14,634673,1659568240,0
998,14,634673,1659568247,1


In [57]:
# how many items has user already clicked
# how many items has user already ordered
def count_type(df, type_number=0, column_name="n_clicks"):
    """
    user(session) x item(aid)に対するaction(type)の回数
    """
    feature_df = df[df["type"] == type_number].groupby(["session", "aid"]).agg({"type": "count"})
    feature_df.columns = [column_name]
    return feature_df.reset_index()

In [58]:
count_type(df)

Unnamed: 0,session,aid,n_clicks
0,0,16246,1
1,0,30373,1
2,0,97836,1
3,0,102416,1
4,0,154930,1
...,...,...,...
586,14,1747636,2
587,14,1764092,1
588,14,1790738,1
589,14,1815907,3


In [59]:
# what is average hour that user clicks
# what is average hour that user orders
def average_hour(df, type_number=0, column_name="clicks_average_hour"):
    """
    アクション毎の時間帯の平均
    """
    _df = df[df["type"] == type_number]
    ts = pd.to_datetime(_df["ts"], unit="s") + timedelta(hours=2)
    hour = ts.dt.hour
    feature_df = pd.concat([_df["session"], hour], axis=1)
    feature_df = feature_df.groupby("session").agg({"ts": "mean"}).rename(columns={"ts": column_name})
    return feature_df.reset_index()

In [60]:
average_hour(df)

Unnamed: 0,session,clicks_average_hour
0,0,14.042254
1,1,17.894737
2,2,1.769231
3,3,12.53
4,4,0.0
5,5,1.5
6,6,10.413793
7,7,4.652174
8,8,0.0
9,9,7.333333


In [61]:
# how many real sessions does user have (real session define by time gap between activity)
def count_real_sessions(df, sec_threshold=1800, column_name="n_real_sessions"):
    """
    real session数をカウントする
    アクション間の時間間隔が一定時間(sec_threshold)以内のまとまりをreal sessionとする
    """
    diff = df.groupby("session").agg({"ts": "diff"}).rename(columns={"ts": "diff"})
    feature_df = pd.concat([df["session"], diff], axis=1)
    feature_df = feature_df[feature_df["diff"] > sec_threshold]
    feature_df = (feature_df.groupby("session").count() + 1).rename(columns={"diff": column_name})
    feature_df = pd.merge(
        pd.DataFrame(df["session"].unique(), columns=["session"]),
        feature_df,
        on="session",
        how="outer"
    ).fillna(0).astype(int)
    return feature_df

In [62]:
count_real_sessions(df)

Unnamed: 0,session,n_real_sessions
0,0,41
1,1,7
2,2,3
3,3,26
4,4,0
5,5,3
6,6,11
7,7,6
8,8,0
9,9,3


In [63]:
# # what is average number of items in each user real session
# def count_items_by_real_sessions(df, column_name="n_items"):
#     """
#     real session毎でのアイテム数の平均値
#     """
    

In [64]:
# what is last day of week user made activity (i.e. monday, tuesday)
def last_day_of_week(df, column_name="last_day_of_week"):
    """
    session毎の最後の曜日を数値で取得する
    """
    ts = df.groupby("session").last()["ts"]
    ts = pd.to_datetime(ts, unit="s") + timedelta(hours=2)
    feature_df = pd.DataFrame(ts.dt.weekday).rename(columns={"ts": column_name})
    return feature_df.reset_index()

In [65]:
last_day_of_week(df)

Unnamed: 0,session,last_day_of_week
0,0,6
1,1,3
2,2,0
3,3,6
4,4,0
5,5,5
6,6,4
7,7,0
8,8,0
9,9,3


In [66]:
# what is first day of week user made activity
def first_day_of_week(df, column_name="first_day_of_week"):
    """
    session毎の最初の曜日を数値で取得する
    """
    ts = df.groupby("session").first()["ts"]
    ts = pd.to_datetime(ts, unit="s") + timedelta(hours=2)
    feature_df = pd.DataFrame(ts.dt.weekday).rename(columns={"ts": column_name})
    return feature_df.reset_index()

In [67]:
first_day_of_week(df)

Unnamed: 0,session,first_day_of_week
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [68]:
# what is average time between clicks
def average_time_between_clicks(df, column_name="average_time_between_clicks"):
    """
    session毎のクリック間の平均時間
    途中にcartやorderが入っていても無視してクリックだけ見てるので、それでいいかは要検討
    """
    diff = df[df["type"] == 0].groupby("session").agg({"ts": "diff"}).rename(columns={"ts": "diff"})
    feature_df = pd.concat([df["session"], diff], axis=1)
    feature_df = feature_df.groupby("session").agg({"diff": "mean"}).rename(columns={"diff": column_name})
    return feature_df.reset_index()

In [69]:
average_time_between_clicks(df)

Unnamed: 0,session,average_time_between_clicks
0,0,12758.347518
1,1,86234.166667
2,2,106048.25
3,3,9069.562814
4,4,36.0
5,5,80306.692308
6,6,9462.248555
7,7,56078.090909
8,8,13.0
9,9,68666.4


In [70]:
# has this item already been clicked by user
