In [85]:
import datetime
import pandas as pd
import numpy as np
import pickle
import os
import csv

# Dataset information

## Original Dataset
https://tianchi.aliyun.com/dataset/dataDetail?dataId=649

Random select about 1 million users who have behaviors including click, purchase, adding item to shopping cart and item favoring during November 25 to December 03, 2017. The dataset is organized in a very similar form to MovieLens-20M, i.e. Each line represents a specific user-item interaction, which consists of user ID, item ID, item's category ID, behavior type and timestamp, separated by commas.

### Features
The detailed descriptions of each field are as follows:

|Field     | Explanation |
| ----------- | ----------- |
|User ID | An integer, the serialized ID that represents a user|
|Item ID | An integer, the serialized ID that represents an item|
|Category ID | An integer, the serialized ID that represents the category which the corresponding item belongs to|
|Behavior type | A string, enum-type from ('pv', 'buy', 'cart', 'fav')|
|Timestamp | An integer, the timestamp of the behavior|

### Behavior
The dataset contains 4 different types of behaviors:

|Behavior     | Explanation |
| ----------- | ----------- |
|pv | Page view of an item's detail page, equivalent to an item click|
|buy | Purchase an item|
|cart | Add an item to shopping cart|
|fav | Favor an item|

### Size
Dimensions of the dataset are:

|Dimension    |  Number  |
| ----------- | ----------- |
|# of users | 987,994|
|# of items | 4,162,024|
|# of categories | 9,439|
|# of interactions | 100,150,807|

## Processed Dataset
1. Extract November 25, 2017 interactions information from original dataset.
2. Treat all interactions as positive interactions
3. Reserve the earlier interaction record of each user-item pair.
4. Drop top 0.5% items 
5. Iteratively drop sparse users and items until all users and items have 10 more interactions
6. Valid and Test dataset only contains warm users and warm items of Train dataset

### Size
Dimensions of the dataset are:

|Type|# of users|# of items|# of interactions|
| ----------- | ----------- | ----------- | ----------- |
|Train|51635|44238|687600|
|Valid|22792|39958|131125|
|Test|19529|38382|111701|

In [97]:
def df_to_uipair_dict(df):
    df_temp = df.copy()[['user_id','item_id']]
    df_temp = df_temp.groupby('user_id')['item_id'].apply(list).to_dict()
    return df_temp


def count_data_info(df):
    statistic_dict = {}
    for i in ['user_id', 'item_id']:
        data_feature_num = df[i].value_counts(normalize = False, dropna = True).to_dict()        
        statistic_dict[i] = data_feature_num
    statistic_dict['interactions'] = np.sum(np.asarray(np.array(list(statistic_dict['user_id'].items()))[:,1], dtype = int))
    return statistic_dict


def warm_user_warm_item(df,train_statistic_dict):
    df_out = df[(df['user_id'].isin(train_statistic_dict['user_id'])) & (df['item_id'].isin(train_statistic_dict['item_id']))].copy()
    df_out.reset_index(drop = True,inplace = True)
    return df_out


def df_to_uipair_txt(df,ui_to_id_dict, traget_path):
    df_temp = df.copy()

    df_temp['user_id'] = df_temp['user_id'].apply(lambda x:ui_to_id_dict['user_id'][x])
    df_temp['item_id'] = df_temp['item_id'].apply(lambda x:ui_to_id_dict['item_id'][x])

    df_temp = df_temp.astype(str)[['user_id','item_id']]
    df_temp = df_temp.groupby(['user_id'], as_index=False).apply(lambda x: '\t'.join(x['item_id'])).reset_index(drop=True)
    df_temp.to_csv(traget_path, header=None, index=False, sep='\t', quoting=csv.QUOTE_NONE, escapechar=' ')


def remove_top_value(df_uni,top):
    topK = top
    item_num = df_uni.groupby('item_id',as_index=False).count().shape[0]
    pop_item = df_uni.groupby('item_id',as_index=False).count().sort_values('user_id',ascending=False).head(int(item_num*topK/100))['item_id'].tolist()
    print('popular item num:',len(pop_item))

    df_rmpop = df_uni[~df_uni['item_id'].isin(pop_item)]
    return df_rmpop


def select_kcore(interaction_dict,K=3):
    flag = 0
    while flag==0:
        item_cnt_dict = {}
        item_drop_dict = {}
        # create item_drop_dict, item_cnt_dict
        for user_id in interaction_dict:
            for item_id in interaction_dict[user_id]:
                if item_id not in item_cnt_dict:
                    item_cnt_dict[item_id] = 0
                    item_drop_dict[item_id] = 0
                item_cnt_dict[item_id] += 1

        #print('user num:',len(interaction_dict))
        assert len(item_drop_dict)==len(item_cnt_dict)

        # delete items < K
        del_iid_list = []
        for item_id in item_cnt_dict:
            if item_cnt_dict[item_id] < K:
                del_iid_list.append(item_id)

        for item_id in del_iid_list:
            item_drop_dict[item_id] = 1
        for user_id in interaction_dict:
            del_id_list = []
            for item_id in interaction_dict[user_id]:
                if item_drop_dict[item_id]:
                    del_id_list.append(item_id)
            for del_id in del_id_list:
                if del_id in interaction_dict[user_id]:
                    interaction_dict[user_id].remove(del_id)

        item_drop_num = 0
        for item_id in item_drop_dict:
            item_drop_num += item_drop_dict[item_id]
        item_num = len(item_drop_dict) - item_drop_num
        #print(f'item num after item-{K}core:',item_num)

        new_item_cnt = {}
        min_cnt=999
        for user_id in interaction_dict:
            min_cnt = min(min_cnt, len(interaction_dict[user_id]))
            for item_id in interaction_dict[user_id]:
                if item_id not in new_item_cnt:
                    new_item_cnt[item_id] = 0
                new_item_cnt[item_id] += 1
        #print('min user interaction:',min_cnt)
        min_cnt_item = 999
        for item_id in new_item_cnt:
            min_cnt_item = min(min_cnt_item, new_item_cnt[item_id])
        #print('min item num:',min_cnt_item)
        if min_cnt>=K and min_cnt_item>=K:
            return interaction_dict, len(interaction_dict), item_num
        
        # delete users interactions<K
        del_uid_list = []
        for user_id in interaction_dict:
            if len(interaction_dict[user_id])<K:
                del_uid_list.append(user_id)
        for user_id in del_uid_list:
            del interaction_dict[user_id]
        
        # count min user-interaction and item appearance
        new_item_cnt = {}
        min_cnt=999
        for user_id in interaction_dict:
            min_cnt = min(min_cnt, len(interaction_dict[user_id]))
            for item_id in interaction_dict[user_id]:
                if item_id not in new_item_cnt:
                    new_item_cnt[item_id] = 0
                new_item_cnt[item_id] += 1
        min_cnt_item = 999
        for item_id in new_item_cnt:
            min_cnt_item = min(min_cnt_item, new_item_cnt[item_id])
            
        if min_cnt>=K and min_cnt_item>=K:
            return interaction_dict, len(interaction_dict), item_num


def remove_sparse_interaction(df,uipair_dict,K=3):
    interaction_dict, not_sparse_user, not_sparse_item = select_kcore(uipair_dict,K)
    
    not_sparse_dict = {'user_id':set([]),'item_id':set([])}
    
    for k,v in interaction_dict.items():
        not_sparse_dict['user_id'].add(k)
        for i in v:
            not_sparse_dict['item_id'].add(i)

    df_remove_sparse = df[(df['user_id'].isin(not_sparse_dict['user_id'])) & (df['item_id'].isin(not_sparse_dict['item_id']))].copy()
    return df_remove_sparse, not_sparse_dict


def save_ui_dict(not_sparse_dict,target_path):
    ui_to_id_dict = {'user_id':{},'item_id':{}}

    for user in list(not_sparse_dict['user_id']):
        if user not in ui_to_id_dict['user_id'].keys():
            ui_to_id_dict['user_id'][user] = len(ui_to_id_dict['user_id'])

    for item in list(not_sparse_dict['item_id']):
        if item not in ui_to_id_dict['item_id'].keys():
            ui_to_id_dict['item_id'][item] = len(ui_to_id_dict['item_id'])
            
    dict_file = open(target_path + 'ui_to_id.dict', 'wb') 
    pickle.dump(ui_to_id_dict,dict_file)
    dict_file.close()

    id_to_ui_dict = {'user_id':{v:k for k,v in ui_to_id_dict['user_id'].items()},'item_id':{v:k for k,v in ui_to_id_dict['item_id'].items()}}
    
    dict_file = open(target_path + 'id_to_ui.dict', 'wb') 
    pickle.dump(id_to_ui_dict,dict_file)
    dict_file.close()
    
    dict_file = open(target_path + 'dataset_size.txt', 'w') 
    dict_file.write("{} {}".format(str(len(ui_to_id_dict['user_id'])),str(len(ui_to_id_dict['item_id']))))
    dict_file.close()
    
    return ui_to_id_dict,id_to_ui_dict

In [3]:
base_path = './'

In [4]:
data=pd.read_csv(base_path + 'data/UserBehavior.csv',encoding='utf')
data.columns=['user_id','item_id','category_id','behavior','time']

def num_to_time(x):
    return datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')

data['time'] = data['time'].apply(num_to_time)
data['time'] = pd.to_datetime(data['time'],format='%Y-%m-%d %H:%M:%S')

In [5]:
data.head()

Unnamed: 0,user_id,item_id,category_id,behavior,time
0,1,2333346,2520771,pv,2017-11-25 06:15:33
1,1,2576651,149192,pv,2017-11-25 09:21:25
2,1,3830808,4181361,pv,2017-11-25 15:04:53
3,1,4365585,2520377,pv,2017-11-25 15:49:06
4,1,4606018,2735466,pv,2017-11-25 21:28:01


In [10]:
start_time = pd.Timestamp('2017-11-25 00:00:00')
end_time = pd.Timestamp('2017-11-26 00:00:00')

In [16]:
sampledata = data[(data['time']>=start_time) & (data['time']<end_time)][['user_id','item_id','time']]

In [12]:
sampledata.shape

(10420014, 5)

In [13]:
sample_user_id_value_counts = sampledata['user_id'].value_counts()
sample_item_id_value_counts = sampledata['item_id'].value_counts()
print(sample_user_id_value_counts,sample_item_id_value_counts)

In [27]:
unique_data = sampledata.groupby(['user_id', 'item_id'], as_index=False)['time'].min()
unique_data = unique_data.reset_index(drop=True)

In [28]:
unique_data.shape

(8786839, 3)

In [None]:
# unique_data.to_csv(base_path + 'data/1days_unique_data.csv') 

In [153]:
unique_data=pd.read_csv(base_path + 'data/1days_unique_data.csv',encoding='utf',index_col=[0])

In [154]:
unique_data.shape

(8786839, 3)

In [None]:
sample_user_id_value_counts = unique_data['user_id'].value_counts()
sample_item_id_value_counts = unique_data['item_id'].value_counts()
print(sample_user_id_value_counts,sample_item_id_value_counts)

In [166]:
unique_data.shape

(8786839, 3)

In [216]:
top = 0.3
sparse = 11

In [217]:
remove_top_data = remove_top_value(unique_data,top)
remove_top_uipair_dict = df_to_uipair_dict(remove_top_data)
df_remove_top_sparse, not_sparse_dict = remove_sparse_interaction(remove_top_data,remove_top_uipair_dict,sparse)
ui_to_id_dict_sparse,id_to_ui_dict_sparse = save_ui_dict(not_sparse_dict,base_path+'data/')

popular item num: 4834


In [218]:
df_remove_top_sparse.shape

(989989, 3)

In [184]:
# df_remove_top_sparse.to_csv(base_path + 'data/1day_unique_Remove_top{}_sparse{}.csv'.format(str(top),str(sparse))) 

In [119]:
df_remove_top_sparse=pd.read_csv(base_path + 'data/1day_unique_Remove_top{}_sparse{}.csv'.format(str(top),str(sparse)),encoding='utf',index_col=[0])

In [None]:
sample_user_id_value_counts = df_remove_top_sparse['user_id'].value_counts()
sample_item_id_value_counts = df_remove_top_sparse['item_id'].value_counts()
print(sample_user_id_value_counts,sample_item_id_value_counts)

In [220]:
df_remove_top_sparse['time'] = pd.to_datetime(df_remove_top_sparse['time'])

In [238]:
stage_info_dict = {}
train_stage = ['stage_'+str(i) for i in range(10)]
valid_stage = ['stage_'+str(i) for i in range(8,10)]
test_stage = ['stage_'+str(i) for i in range(9,11)]

stage_info_dict['train'] = train_stage
stage_info_dict['valid'] = valid_stage
stage_info_dict['test'] = test_stage

statistic_dict_file = open(base_path + 'data/stage_info_dict.dict', 'wb') 
pickle.dump(stage_info_dict,statistic_dict_file)
statistic_dict_file.close()

In [None]:
stage_info_dict

In [222]:
hour_to_stage_map_dict = {}
stage_cnt = 0
for hour in range(22):
    hour_to_stage_map_dict[hour] = 'stage_' + str(int(stage_cnt/2))
    stage_cnt += 1

In [None]:
hour_to_stage_map_dict

In [224]:
stage_time = [(pd.Timestamp('2017-11-25 00:00:00')+pd.Timedelta(minutes=120*i),pd.Timestamp('2017-11-25 1:59:59')+pd.Timedelta(minutes=120*i)) for i in range(11)]

if os.path.exists(base_path + 'data/stage_item_popularity_dict.dict'):
    print('loading from file')
    with open(base_path + 'data/stage_item_popularity_dict.dict','rb') as f:
        stage_item_popularity_dict = pickle.load(f)
else:
    print('generating stage_item_popularity_dict')
    stage_item_popularity_dict = {}
    for i in range(len(stage_time)):
        start_time = stage_time[i][0]
        end_time = stage_time[i][1]
        df_onestage = df_remove_top_sparse[['user_id','item_id']][(df_remove_top_sparse['time']>=start_time) & (df_remove_top_sparse['time'] <= end_time)].copy()
        iid_inter_count_onestage = df_onestage.groupby('item_id',as_index=False).count()
        iid_inter_count_onestage = iid_inter_count_onestage[['user_id','item_id']]
        iid_inter_count_onestage.rename(columns={'item_id': 'item_id', 'user_id': 'count_'+ str(i)}, inplace=True)
        interactions_onestage = iid_inter_count_onestage['count_'+ str(i)].sum()
        iid_inter_count_onestage['count_'+ str(i)] = iid_inter_count_onestage['count_'+ str(i)]/interactions_onestage
        iid_inter_count_onestage.set_index(['item_id'], inplace = True)
        stage_item_popularity_dict['stage_'+str(i)] = iid_inter_count_onestage.to_dict('dict')['count_'+ str(i)]
    
    for k,v in stage_item_popularity_dict.items():
        deno_normal = max(list(v.values())) - min(list(v.values()))
        for k1,v1 in v.items():
            stage_item_popularity_dict[k][k1] = v1/deno_normal
    
    statistic_dict_file = open(base_path + 'data/stage_item_popularity_dict.dict', 'wb') 
    pickle.dump(stage_item_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()

generating stage_item_popularity_dict


In [None]:
stage_item_popularity_dict

In [226]:
stage_time

[(Timestamp('2017-11-25 00:00:00'), Timestamp('2017-11-25 01:59:59')),
 (Timestamp('2017-11-25 02:00:00'), Timestamp('2017-11-25 03:59:59')),
 (Timestamp('2017-11-25 04:00:00'), Timestamp('2017-11-25 05:59:59')),
 (Timestamp('2017-11-25 06:00:00'), Timestamp('2017-11-25 07:59:59')),
 (Timestamp('2017-11-25 08:00:00'), Timestamp('2017-11-25 09:59:59')),
 (Timestamp('2017-11-25 10:00:00'), Timestamp('2017-11-25 11:59:59')),
 (Timestamp('2017-11-25 12:00:00'), Timestamp('2017-11-25 13:59:59')),
 (Timestamp('2017-11-25 14:00:00'), Timestamp('2017-11-25 15:59:59')),
 (Timestamp('2017-11-25 16:00:00'), Timestamp('2017-11-25 17:59:59')),
 (Timestamp('2017-11-25 18:00:00'), Timestamp('2017-11-25 19:59:59')),
 (Timestamp('2017-11-25 20:00:00'), Timestamp('2017-11-25 21:59:59'))]

In [227]:
print('generating stage_iid_popularity_dict')
stage_iid_popularity_dict = {}
for k,v in stage_item_popularity_dict.items():
    stage_iid_popularity_dict[k] = {}
    for k1,v1 in v.items():
        stage_iid_popularity_dict[k][ui_to_id_dict_sparse['item_id'][k1]] = v1

statistic_dict_file = open(base_path + 'data/stage_iid_popularity_dict.dict', 'wb') 
pickle.dump(stage_iid_popularity_dict,statistic_dict_file)
statistic_dict_file.close()

generating stage_iid_popularity_dict


In [None]:
stage_iid_popularity_dict

In [229]:
temp_df_uidate_popular = df_remove_top_sparse[['user_id','item_id','time']]
print(temp_df_uidate_popular.shape)

if os.path.exists(base_path + 'data/mappedui_to_popularity_dict.dict'):
    print('loading from file')
    with open(base_path + 'data/mappedui_to_popularity_dict.dict','rb') as f:
        mappedui_to_popularity_dict = pickle.load(f)
else:
    print('generating mappedui_to_popularity_dict')
    mappedui_to_popularity_dict = {}
    for idx, row in temp_df_uidate_popular.iterrows():
        if row['time'].hour in hour_to_stage_map_dict:
            ui_index = (ui_to_id_dict_sparse['user_id'][row['user_id']],ui_to_id_dict_sparse['item_id'][row['item_id']])
            mappedui_to_popularity_dict[ui_index] = stage_item_popularity_dict[hour_to_stage_map_dict[row['time'].hour]][row['item_id']]

    statistic_dict_file = open(base_path + 'data/mappedui_to_popularity_dict.dict', 'wb') 
    pickle.dump(mappedui_to_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()

(989989, 3)
generating stage_item_popularity_dict


In [None]:
mappedui_to_popularity_dict

In [235]:
temp_df_uidate_popular = df_remove_top_sparse[['user_id','item_id','time']]
print(temp_df_uidate_popular.shape)

if os.path.exists(base_path + 'data/mappedui_stage_popularity_dict.dict'):
    print('loading from file')
    with open(base_path + 'data/mappedui_stage_popularity_dict.dict','rb') as f:
        mappedui_stage_popularity_dict = pickle.load(f)
else:
    print('generating mappedui_stage_popularity_dict')
    mappedui_stage_popularity_dict = {}
    for idx, row in temp_df_uidate_popular.iterrows():
        if row['time'].hour in hour_to_stage_map_dict:
            ui_index = (ui_to_id_dict_sparse['user_id'][row['user_id']],ui_to_id_dict_sparse['item_id'][row['item_id']])
            mappedui_stage_popularity_dict[ui_index] = hour_to_stage_map_dict[row['time'].hour]

    statistic_dict_file = open(base_path + 'data/mappedui_stage_popularity_dict.dict', 'wb') 
    pickle.dump(mappedui_stage_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()

(989989, 3)
generating mappedui_stage_popularity_dict


In [None]:
mappedui_stage_popularity_dict

In [231]:
train_start_time =  pd.Timestamp('2017-11-25 00:00:00')
train_end_time = pd.Timestamp('2017-11-25 19:59:59')
val_start_time = pd.Timestamp("2017-11-25 20:00:00")
val_end_time = pd.Timestamp("2017-11-25 21:59:59")
test_start_time = pd.Timestamp("2017-11-25 22:00:00")
test_end_time = pd.Timestamp("2017-11-25 23:59:59")

In [232]:
# train
df_train = df_remove_top_sparse[(df_remove_top_sparse['time']>=train_start_time) & (df_remove_top_sparse['time']<=train_end_time)].copy()

df_train_statistic_dict = count_data_info(df_train)
print()
print("train:")
print("interactions:",df_train_statistic_dict["interactions"],"\nusers:",len(df_train_statistic_dict['user_id']),"\nitems:",len(df_train_statistic_dict['item_id']))

df_to_uipair_txt(df_train, ui_to_id_dict_sparse, base_path + 'data/train.txt')


# valid
# valid warm user warm item
df_val = df_remove_top_sparse[(df_remove_top_sparse['time']>=val_start_time) & (df_remove_top_sparse['time']<=val_end_time)].copy()

df_wuwi_val = warm_user_warm_item(df_val, df_train_statistic_dict)

df_val_statistic_dict = count_data_info(df_wuwi_val)
print()
print("val_wuwi:")
print("interactions:",df_val_statistic_dict["interactions"],"\nusers:",len(df_val_statistic_dict['user_id']),"\nitems:",len(df_val_statistic_dict['item_id']))

df_to_uipair_txt(df_wuwi_val, ui_to_id_dict_sparse, base_path + 'data/valid.txt')


# test
df_test = df_remove_top_sparse[(df_remove_top_sparse['time']>=test_start_time) & (df_remove_top_sparse['time']<=test_end_time)].copy()

# valid warm user warm item
df_wuwi_test = warm_user_warm_item(df_test, df_train_statistic_dict)

df_test_statistic_dict = count_data_info(df_wuwi_test)
print()
print("test_wuwi:")
print("interactions:",df_test_statistic_dict["interactions"],"\nusers:",len(df_test_statistic_dict['user_id']),"\nitems:",len(df_test_statistic_dict['item_id']))

df_to_uipair_txt(df_wuwi_test, ui_to_id_dict_sparse, base_path + 'data/test.txt')


train:
interactions: 687600 
users: 51635 
items: 44238

val_wuwi:
interactions: 131125 
users: 22792 
items: 39958

test_wuwi:
interactions: 111701 
users: 19529 
items: 38382


In [233]:
df_remove_top_sparse.shape

(989989, 3)

In [234]:
687600+131125+111701

930426