# Section 1: interact_status

In [6]:
import pandas as pd
import numpy as np
import random

# Seed the stdlib RNG so the sampled negatives are the same on every
# Restart & Run All.
SEED = 42
random.seed(SEED)

# How many negatives to draw per user in this toy example.
NUM_NEGATIVE_SAMPLES = 3

# Toy interaction log; the column names are assigned right below.
data = [[1,11,2,13142],[1,12,5,24132],[2,21,3,35123],[2,22,4,22121],[2,23,1,23111],[3,31,2,11312],[3,32,1,13412]]
df = pd.DataFrame(data)
df.columns = ['userId', 'movieId', 'rating', 'timestamp']
print("[Data]")
print(df)
print('\n')

# Every distinct movieId that appears in the data.
item_pool = set(df['movieId'].unique())

#NCF-PT_ejlee
# A: one row per userId holding the set of movieIds found in that user's rows.
interact_status = (
    df.groupby('userId')['movieId']
    .apply(set)
    .reset_index()
    .rename(columns={'movieId': 'interacted_items'})
)
print("[Interact_status: A]")
print(interact_status)
print('\n')

# B: negatives are the items in the pool that are not in the user's interacted set.
print("[Interact_status: B]")
interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: item_pool - x)
print(interact_status)
print('\n')

# C: draw a fixed number of negatives per user.
# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# since 3.11), so sample from a sorted list; sorting also fixes the population
# order, which keeps the draw reproducible under the seed above.
print("[Interact_status: C]")
interact_status['negative_samples'] = interact_status['negative_items'].apply(
    lambda x: random.sample(sorted(x), NUM_NEGATIVE_SAMPLES)
)
print(interact_status)


[Data]
   userId  movieId  rating  timestamp
0       1       11       2      13142
1       1       12       5      24132
2       2       21       3      35123
3       2       22       4      22121
4       2       23       1      23111
5       3       31       2      11312
6       3       32       1      13412


[Interact_status: A]
   userId interacted_items
0       1         {11, 12}
1       2     {21, 22, 23}
2       3         {32, 31}


[Interact_status: B]
   userId interacted_items        negative_items
0       1         {11, 12}  {32, 21, 22, 23, 31}
1       2     {21, 22, 23}      {32, 11, 12, 31}
2       3         {32, 31}  {11, 12, 21, 22, 23}


[Interact_status: C]
   userId interacted_items        negative_items negative_samples
0       1         {11, 12}  {32, 21, 22, 23, 31}     [31, 23, 32]
1       2     {21, 22, 23}      {32, 11, 12, 31}     [32, 31, 12]
2       3         {32, 31}  {11, 12, 21, 22, 23}     [23, 12, 21]


interacted_items: 시청한 영화리스트

negative_items: 시청하지 않은 영화리스트

실제는 99개의 샘플링을 하지만, 여기서는 예시 데이터가 적기 때문에 3개만 샘플링 -> negative_samples


# Section 2: train


In [7]:
# Rank each user's rows by timestamp, newest first (rank 1.0 = latest);
# method='first' breaks ties by order of appearance.
latest_rank = df.groupby('userId')['timestamp'].rank(method='first', ascending=False)
df['rank_latest'] = latest_rank
print(df)
print('\n')

# Rows with rank > 1 go to train; this drops each user's rank-1 (latest) row.
print("[Train Data]")
is_train_row = df['rank_latest'] > 1
train = df[is_train_row]
print(train)

   userId  movieId  rating  timestamp  rank_latest
0       1       11       2      13142          2.0
1       1       12       5      24132          1.0
2       2       21       3      35123          1.0
3       2       22       4      22121          3.0
4       2       23       1      23111          2.0
5       3       31       2      11312          2.0
6       3       32       1      13412          1.0


[Train Data]
   userId  movieId  rating  timestamp  rank_latest
0       1       11       2      13142          2.0
3       2       22       4      22121          3.0
4       2       23       1      23111          2.0
5       3       31       2      11312          2.0


1. userId로 group화한 후 timestamp 기준으로 랭킹

2. rank_latest 값이 1보다 크면 train 데이터로 분류 (else, test data)
> 이 과정을 생략

# Section 3: train_rating

In [8]:
# Attach each user's negative-item set to that user's training rows.
# pd.merge defaults to an inner join; it is spelled out here for clarity.
negatives_per_user = interact_status[['userId', 'negative_items']]
train_rating = train.merge(negatives_per_user, on='userId', how='inner')
print("[Train + Interact_status]")
print(train_rating)

[Train + Interact_status]
   userId  movieId  rating  timestamp  rank_latest        negative_items
0       1       11       2      13142          2.0  {32, 21, 22, 23, 31}
1       2       22       4      22121          3.0      {32, 11, 12, 31}
2       2       23       1      23111          2.0      {32, 11, 12, 31}
3       3       31       2      11312          2.0  {11, 12, 21, 22, 23}


`train data`와 `interact_status`를 합침 = `train_rating`
> 이 과정도 생략. 

# Section 4: Sampling (num_negatives)

In [5]:
# Draw 4 negatives (num_negatives) for every training row.
# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# since 3.11), so the set is converted to a sorted list before sampling.
train_rating['negatives'] = train_rating['negative_items'].apply(
    lambda x: random.sample(sorted(x), 4)
)
print(train_rating)

   userId  movieId  rating  ...  rank_latest        negative_items         negatives
0       1       11       2  ...          2.0  {32, 21, 22, 23, 31}  [32, 21, 22, 23]
1       2       22       4  ...          3.0      {32, 11, 12, 31}  [31, 12, 32, 11]
2       2       23       1  ...          2.0      {32, 11, 12, 31}  [12, 32, 11, 31]
3       3       31       2  ...          2.0  {11, 12, 21, 22, 23}  [23, 22, 12, 21]

[4 rows x 7 columns]


train_rating에서 num_negatives(여기서는 4)만큼 negative sample을 뽑음