In [1]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('sk/1year.csv')

In [4]:
data.shape

(229244382, 3)

In [6]:
data.head()

Unnamed: 0,dwh_phone_id,dwh_product_id,quantity
0,2,23,2
1,2,285,1
2,2,344,1
3,2,443,3
4,2,754,1


In [7]:
data.dwh_phone_id.nunique(), data.dwh_product_id.nunique()

(5531840, 25713)

In [8]:
229244382 / (5531840 * 25713)

0.0016116708139161224

In [171]:
# sample users
rng = np.random.default_rng(1)
sampled_users = rng.choice(data.dwh_phone_id.unique(), size=2800) #25000 for 1m
sampled_data = data.loc[data.dwh_phone_id.isin(sampled_users), :]

In [172]:
sampled_data.shape[0], sampled_data.dwh_phone_id.nunique(), sampled_data.dwh_product_id.nunique()

(111936, 2800, 10354)

In [173]:
# filter users with >3 items
item_count = sampled_data.groupby('dwh_phone_id').count().reset_index()
good_users = item_count.loc[item_count['dwh_product_id'] > 3, 'dwh_phone_id']
# filter items with >3 users
user_count = sampled_data.groupby('dwh_product_id').count().reset_index()
good_items = user_count.loc[user_count['dwh_phone_id'] > 3, 'dwh_product_id']

good_data = sampled_data.loc[sampled_data.dwh_phone_id.isin(good_users) & sampled_data.dwh_product_id.isin(good_items)].copy()

In [174]:
good_data.shape[0], good_data.dwh_phone_id.nunique(), good_data.dwh_product_id.nunique()

(102536, 2447, 5057)

In [175]:
# relabel users and items
good_data['user_id'] = np.unique(good_data['dwh_phone_id'], return_inverse=True)[1]
good_data['item_id'] = np.unique(good_data['dwh_product_id'], return_inverse=True)[1]
good_data = good_data.reset_index(drop=True)

In [176]:
good_data[['user_id', 'item_id']].to_csv('data_100k.csv', index=False)

In [92]:
100 * good_data.shape[0] / (good_data.dwh_phone_id.nunique() * good_data.dwh_product_id.nunique())

0.28840979488388

In [124]:
def train_test_split_single(data, how='last', seed=1):
    cols = ['user_id', 'item_id']
    if how == 'last':
        data_test = data.groupby('user_id').nth(-1).reset_index() # last item as test set
        data_valid = data.groupby('user_id').nth(-2).reset_index() # 2nd last item as valid set
        data_train = pd.concat([data, data_test, data_valid]).drop_duplicates(keep=False).reset_index(drop=True)
        return data_train[cols], data_valid[cols], data_test[cols]
    elif how == 'random':
        data_test = data.groupby('user_id').sample(random_state=seed).reset_index()
        data_train = pd.concat([data, data_test]).drop_duplicates(keep=False).reset_index(drop=True)
        data_valid = data_train.groupby('user_id').sample(random_state=seed).reset_index()
        data_train = pd.concat([data_train, data_valid]).drop_duplicates(keep=False).reset_index(drop=True)
        return data_train[cols], data_valid[cols], data_test[cols]

In [122]:
data_train, data_valid, data_test = train_test_split_single(good_data, how='random')

In [178]:
data_train

Unnamed: 0,user_id,item_id
0,0,1
1,0,49
2,0,63
3,0,66
4,0,79
...,...,...
1087385,21902,10173
1087386,21903,13641
1087387,21904,2020
1087388,21905,9290


In [177]:
data_valid

Unnamed: 0,user_id,item_id
0,0,1718
1,1,9105
2,2,6558
3,3,15861
4,4,954
...,...,...
21902,21902,10173
21903,21903,13641
21904,21904,2020
21905,21905,9290


In [10]:
d = pd.read_csv('sk/data_100k.csv')

In [16]:
d.shape[0] / (d.user_id.max() * d.item_id.max())

0.00829111336514278