In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
df = pd.read_csv('interactions.csv')

df.columns = ['user', 'item']

In [11]:
# Remove non-pop items.
cnt = df.item.value_counts()
rare_item = cnt[cnt < 100].index

print(df.user.unique().size)
df = df[~df.item.isin(rare_item)]
print(df.user.unique().size)

In [4]:
# One-hot encoding, label from 1 to n
from sklearn.preprocessing import LabelEncoder

for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col]) + 1  # Zero for padding, so remove it.
del le

In [5]:
df.head(3)

Unnamed: 0,user,item
0,1,2359
1,1,10671
2,1,19751


In [6]:
users = df.user.unique()
items = df.item.unique()

In [43]:
# Train test split
NUM_TEST_USERS = 500

np.random.seed(123)
test_users = np.random.choice(users, size=NUM_TEST_USERS)  # Random select some users for test

train = df[~df.user.isin(test_users)]
train_items = train['item'].unique()  # All items in trainset

test = df[df.user.isin(test_users)]
test = test[test['item'].isin(train_items)]  # Remove items that not in trainset

del df
print(len(train), len(test))

# Get top 80% interactions for each user.
TOP_FRAC = 0.8
def _get_top(x, frac=0.8):
    n_top = math.ceil(len(x) * frac)
    return x.head(n_top)
test_first = test.groupby('user').apply(_get_top, frac=TOP_FRAC).reset_index(drop=True)

def _get_tail(x, frac=0.2):
    n_tail = math.floor(len(x)*frac)
    return x.tail(n_tail)
test_second = test.groupby('user').apply(_get_tail, frac=1-TOP_FRAC).reset_index(drop=True)

# Extended trainset, including the first part of testset.
train_plus = pd.concat([train, test_first])
test = test_first

4613258 50964


In [44]:
def gen_sequences(df):
    sequences = df.groupby('user').agg({'item': lambda x:list(x)})
    sequences.columns = ['item_sequence']
    return sequences.reset_index()

# Gen sequences for train and test
train = gen_sequences(train)
test  = gen_sequences(test)

# Remaining part for evaluation.
test_second = gen_sequences(test_second)
test['eval_sequence'] = test_second['item_sequence']
del test_second

test.head()

Unnamed: 0,user,item_sequence,eval_sequence
0,97,"[2196, 445, 1, 328, 10283, 238, 1014, 4340, 51...","[15514, 16614, 16067, 17976, 1432, 1127, 9151,..."
1,112,"[291, 492, 1962, 1014, 2196, 1692, 328, 49, 54...","[879, 9872, 9605, 1086, 15959, 932, 1769, 1044..."
2,127,"[291, 737, 238, 1014, 1038, 1012, 545, 1030, 1...","[19051, 7043, 8466, 2251]"
3,192,"[2196, 1085, 328, 1, 545, 291, 1016, 1106, 434...","[1692, 11117, 10652]"
4,195,"[737, 492, 1038, 1012, 291, 5241, 779, 2196, 1...","[791, 681, 558, 1017, 103]"
