In [1]:
import pandas as pd

In [2]:
# Load the processed item -> category mapping and discard the 'features'
# column that Shuaige added to it earlier.
category_item_filter_df = pd.read_csv('./category_item_filter.csv').drop(columns='features')
# The two remaining columns are renamed by position.
category_item_filter_df.columns = ['itemid', 'category_id']
category_item_filter_df.head()

Unnamed: 0,itemid,category_id
0,43511,1179
1,54408,209
2,10006,1694
3,49432,1280
4,39563,1196


In [3]:
# Read in the two item_properties csv parts and stack them into a single frame
df1 = pd.read_csv('./item_properties_part1.csv')
df2 = pd.read_csv('./item_properties_part2.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# part keeps its own 0-based labels, so labels are duplicated and label-based
# lookups or index alignment on the result become ambiguous.
item_properties_df = pd.concat([df1, df2], ignore_index=True)
# Sanity check on the total number of rows read from both parts
assert len(item_properties_df) == 20275902, (
    'expected 20275902 item property rows, got %d' % len(item_properties_df)
)
item_properties_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [4]:
# Load the events log (columns: timestamp, visitorid, event, itemid, transactionid)
events_df = pd.read_csv('./events.csv')
events_df.head(5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


#### Item Feature 1: Popularity of items based on events

In [5]:
# Weight of each event type in the popularity score
event_weights = {'view': 1, 'addToCart': 10, 'transaction': 100}

# Score every event row in one vectorized pass; event types that are not in
# event_weights (or missing values) map to NaN and therefore contribute 0.
event_scores = events_df['event'].map(event_weights).fillna(0).astype('int64')

# Weighted popularity of each item = sum of its event scores.
# Result columns: itemid, popularity (one row per itemid, sorted by itemid).
item_popularity = (
    event_scores
    .groupby(events_df['itemid'])
    .sum()
    .reset_index(name='popularity')
)

print(item_popularity)

        itemid  popularity
0            3           2
1            4           3
2            6          29
3            9           2
4           15         118
...        ...         ...
235056  466861         189
235057  466862           4
235058  466863           1
235059  466864          89
235060  466867           4

[235061 rows x 2 columns]


In [7]:
# Left join on itemid so every filtered item is kept, including items that
# never appear in the events-based popularity table.
category_item_filter_w_popularity_df = category_item_filter_df.merge(
    item_popularity, on='itemid', how='left'
)

# Items without a popularity row get a popularity of 0. Only the popularity
# column is filled, so a missing value in any other column is not silently
# replaced by 0. The column stays float because the join introduced NaN.
category_item_filter_w_popularity_df = category_item_filter_w_popularity_df.fillna(
    {'popularity': 0}
)

print(category_item_filter_w_popularity_df)

       itemid  category_id  popularity
0       43511         1179         0.0
1       54408          209         5.0
2       10006         1694         7.0
3       49432         1280         1.0
4       39563         1196         6.0
...       ...          ...         ...
70847   12116         1255         0.0
70848   43011         1397         0.0
70849   14255          479         0.0
70850   29598         1287         0.0
70851   46971         1483       202.0

[70852 rows x 3 columns]


#### Item Feature 2: Most frequent property among each item's most recent property records

In [9]:
# Convert timestamp (epoch milliseconds) to datetime format
item_properties_df['timestamp'] = pd.to_datetime(item_properties_df['timestamp'], unit='ms')

# Keep only the rows recorded at each item's most recent timestamp.
# The comparison is done on raw arrays so it does not depend on the frame's
# index labels being unique.
latest_ts = item_properties_df.groupby('itemid')['timestamp'].transform('max')
is_latest = item_properties_df['timestamp'].to_numpy() == latest_ts.to_numpy()
latest_properties = item_properties_df.loc[is_latest, ['itemid', 'property']]

# Count how often each property occurs among those latest rows. sort=False
# keeps the (itemid, property) pairs in order of first appearance, which is
# what the tie-break below relies on.
property_counts = (
    latest_properties
    .groupby(['itemid', 'property'], sort=False)
    .size()
    .reset_index(name='n_rows')
)

# For every item pick the property with the highest count. idxmax returns the
# first maximum, so ties go to the property that appears first in the data.
# NOTE(review): the earlier per-group value_counts() version left tie order to
# value_counts' internal sort; confirm first-appearance is the wanted rule.
top_rows = property_counts.groupby('itemid')['n_rows'].idxmax()
most_frequent_properties = (
    property_counts.loc[top_rows, ['itemid', 'property']]
    .rename(columns={'property': 'most_frequent_property'})
    .reset_index(drop=True)
)

print(most_frequent_properties)

        itemid most_frequent_property
0            0                      6
1            1                    790
2            2                   1063
3            3                    283
4            4                    888
...        ...                    ...
417048  466862                    810
417049  466863                    227
417050  466864                    790
417051  466865                    888
417052  466866                    756

[417053 rows x 2 columns]


In [14]:
# Attach the most_frequent_property feature with a left join on itemid (all
# rows of the popularity table are kept), then keep only the first row for
# each itemid.
category_item_filter_features_df = (
    category_item_filter_w_popularity_df
    .merge(most_frequent_properties, how='left', on='itemid')
    .drop_duplicates(subset=['itemid'], keep='first')
)

print(category_item_filter_features_df)

       itemid  category_id  popularity most_frequent_property
0       43511         1179         0.0                    888
1       54408          209         5.0                    189
2       10006         1694         7.0                    765
3       49432         1280         1.0                    888
4       39563         1196         6.0                    348
...       ...          ...         ...                    ...
70847   12116         1255         0.0                    839
70848   43011         1397         0.0                    914
70849   14255          479         0.0                    213
70850   29598         1287         0.0                    243
70851   46971         1483       202.0                   1032

[70852 rows x 4 columns]


In [15]:
# Write the item feature table to disk. index=False keeps the positional row
# index out of the file, so reading it back does not produce an extra
# 'Unnamed: 0' column.
category_item_filter_features_df.to_csv('category_item_filter_2.csv', index=False)