# Data Pre-processing

Read the MovieLens ratings dataset (.csv) and create the pre-processed train/validation/test datasets.

In [None]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd

import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer

import bottleneck as bn

In [None]:
# Location of the MovieLens dataset; generated files go to its 'output' subfolder.
DATA_DIR = './dataset/25m/'
OUTPUT_DIR = os.path.join(DATA_DIR, 'output')

# The first line of ratings.csv is the column header.
ratings_path = os.path.join(DATA_DIR, 'ratings.csv')
raw_data = pd.read_csv(ratings_path, header=0)

# Keep only rows rated above 3.5 (the original note describes this as
# "ratings of 4 or higher").
raw_data = raw_data.loc[raw_data['rating'] > 3.5]
raw_data.head()

## `get_count`

테이블을 특정 key 기준으로 묶어, 각 값을 가진 row 개수를 반환합니다

- `tp`: 테이블 (.csv를 pandas로 읽은 결과물)
- `key`: 테이블의 key (예: `userId`, `movieId`, `rating`, ...)

In [None]:
def get_count(tp, key):
    """Count the rows of ``tp`` for each distinct value of column ``key``.

    Args:
        tp: pandas DataFrame that contains a column named ``key``.
        key: name of the column to group by (e.g. ``'userId'``).

    Returns:
        pandas Series whose index holds the distinct values of ``key`` and
        whose values are the number of rows carrying that value.
    """
    # Group with the default as_index=True.  With as_index=False, newer
    # pandas releases make size() return a DataFrame with a fresh integer
    # index, whereas callers of this helper index the result by key value
    # (``count.index[count >= n]``).
    return tp.groupby(key).size()

## `filter_triplets`

평가가 너무 적은 User 또는 평가를 너무 적게 받은 Item은 Data에서 삭제합니다. (신뢰성이 떨어지는 데이터 삭제)

- `tp`: 테이블 (.csv를 pandas로 읽은 결과물)
- `min_uc`: User 한 명이 가져야 하는 최소 평가 수 (이보다 적게 평가한 User는 삭제)
- `min_sc`: Item 하나가 받아야 하는 최소 평가 수 (이보다 적게 평가받은 Item은 삭제)

In [None]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rows that belong to rarely rated movies or low-activity users.

    Args:
        tp: DataFrame with 'userId' and 'movieId' columns.
        min_uc: minimum number of rows a user must have to be kept; the user
            filter is skipped unless this is greater than 0.
        min_sc: minimum number of rows a movie must have to be kept; the
            movie filter is skipped unless this is greater than 0.

    Returns:
        Tuple ``(tp, usercount, itemcount)``: the filtered table and the
        per-user / per-movie row counts computed on it with ``get_count``.
    """
    # Movies are filtered first.
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        kept_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(kept_items)]

    # Users are filtered second.  Removing users can push a few movies back
    # below min_sc; no second pass is made to drop those.
    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        kept_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(kept_users)]

    # Recount on the filtered table so the returned counts match it.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [None]:
# raw_data is overwritten once more here, this time with its filtered version;
# the per-user and per-movie counts returned alongside it are kept as well.
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

## `sparsity`

실제 평가된 Row 수 / 가능한 전체 평가 수

> 가능한 전체 평가 수는 User 수 * Item 수

In [None]:
# Observed rows divided by the number of possible (user, movie) pairs.
n_events = raw_data.shape[0]
n_active_users = user_activity.shape[0]
n_rated_items = item_popularity.shape[0]
sparsity = 1. * n_events / (n_active_users * n_rated_items)

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
      (n_events, n_active_users, n_rated_items, sparsity * 100))

## `shuffle`

랜덤하게 Array를 섞음

In [None]:
def shuffle(list):
    """Return the elements of ``list`` in a fixed pseudo-random order.

    NumPy's global random generator is re-seeded with a constant on every
    call, so the same input length always yields the same permutation.

    Args:
        list: array-like that supports ``len()`` and indexing with an integer
            array (e.g. a NumPy array or a pandas Index).

    Returns:
        The result of indexing ``list`` with the random permutation.
    """
    np.random.seed(29581)
    order = np.random.permutation(len(list))
    return list[order]

# Shuffled index labels of user_activity (presumably the user ids, since
# user_activity comes from counting rows per 'userId' — verify).
unique_uid = shuffle(user_activity.index)

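### shuffle된 user를 train / validation / test 세 그룹으로 분리합니다.

- tr_users: train용 user (held-out 그룹 두 개를 제외한 나머지)
- vd_users: validation용 user (`n_heldout_users`명)
- te_users: test용 user (`n_heldout_users`명)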

In [None]:
# Partition the shuffled user ids into train / validation / test groups.
n_users = unique_uid.size
# Size of each held-out group: 7 users per full hundred users.  Original
# author's note: the 20M setup used 10000 held-out users, so the same 0.07
# ratio is applied here, which gives 42 for the small dataset.
n_heldout_users = (n_users // 100) * 7

# Positions where the validation and the test slices begin.
vd_start = n_users - n_heldout_users * 2
te_start = n_users - n_heldout_users

tr_users = unique_uid[:vd_start]
vd_users = unique_uid[vd_start:te_start]
te_users = unique_uid[te_start:]

# Report each group's share of all users.
print('tr_users: {}%'.format(vd_start / n_users * 100))
print('vd_users: {}%'.format((n_heldout_users / n_users) * 100))
print('te_users: {}%'.format((n_heldout_users / n_users) * 100))

In [None]:
# Rows of raw_data whose userId belongs to the training user group.
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]
train_plays.head()

## `show2id`
- sid를 넣으면 unique_sid의 index가 나온다

## `user2id`
- uid를 넣으면 unique_uid의 index가 나온다

In [None]:
# Distinct movie ids that occur in the training rows.
unique_sid = pd.unique(train_plays['movieId'])

# Lookup tables from a raw id to its position in unique_sid / unique_uid.
show2id = {sid: idx for idx, sid in enumerate(unique_sid)}
user2id = {pid: idx for idx, pid in enumerate(unique_uid)}

In [None]:
# Create the output directory on first use.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Persist the movie ids, one per line, in unique_sid order.
sid_path = os.path.join(OUTPUT_DIR, 'unique_sid.txt')
with open(sid_path, 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

# Persist the user ids, one per line, in unique_uid order.
uid_path = os.path.join(OUTPUT_DIR, 'unique_uid.txt')
with open(uid_path, 'w') as f:
    f.writelines('%s\n' % uid for uid in unique_uid)

## `split_train_test_proportion`

User별로 평가 중 20%(`test_prop`)를 랜덤으로 골라내서 data_te, 나머지는 data_tr로 return (평가가 5개 미만인 User의 평가는 전부 data_tr로 들어감)

In [None]:
def split_train_test_proportion(tp, test_prop=0.2, log_every=4):
    """Split every user's rows into a kept ("tr") and a held-out ("te") part.

    For each user with at least 5 rows, ``int(test_prop * n_rows)`` of that
    user's rows are drawn at random without replacement into the held-out
    part and the remaining rows go to the kept part.  Users with fewer than
    5 rows contribute all of their rows to the kept part.

    NumPy's global random generator is re-seeded with a constant on every
    call, so the split is reproducible for a given input.

    Args:
        tp: DataFrame with a 'userId' column.
        test_prop: fraction of each eligible user's rows to hold out.
        log_every: print a progress line every ``log_every`` users; pass 0
            or None to silence progress output.

    Returns:
        Tuple ``(data_tr, data_te)`` of DataFrames.  A part that receives no
        rows is returned as an empty frame with the columns of ``tp``.
    """
    np.random.seed(98765)

    tr_list, te_list = [], []
    for i, (_, user_rows) in enumerate(tp.groupby('userId')):
        n_items_u = len(user_rows)

        if n_items_u >= 5:
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(
                n_items_u, size=int(test_prop * n_items_u), replace=False)
            # Cast so that an empty draw is still a valid integer index.
            held_out[picked.astype('int64')] = True

            tr_list.append(user_rows[~held_out])
            te_list.append(user_rows[held_out])
        else:
            # Too few rows to hold anything out.
            tr_list.append(user_rows)

        if log_every and i % log_every == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises on an empty list, so fall back to an empty slice of
    # the input, which keeps its columns.
    empty = tp.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [None]:
# Rows of the validation users, restricted to movies listed in unique_sid.
vad_plays = (
    raw_data
    .loc[raw_data['userId'].isin(vd_users)]
    .loc[lambda plays: plays['movieId'].isin(unique_sid)]
)

In [None]:
# Split the validation rows into the two parts returned by
# split_train_test_proportion (default test_prop).
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

In [None]:
# Rows of the test users, restricted to movies listed in unique_sid.
test_plays = (
    raw_data
    .loc[raw_data['userId'].isin(te_users)]
    .loc[lambda plays: plays['movieId'].isin(unique_sid)]
)

In [None]:
# Split the test rows into the two parts returned by
# split_train_test_proportion (default test_prop).
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

In [None]:
def numerize(tp):
    """Replace raw user and movie ids by their integer indices.

    Each 'userId' is looked up in the module-level ``user2id`` mapping and
    each 'movieId' in ``show2id``; an id missing from its mapping raises
    ``KeyError``.

    Args:
        tp: DataFrame with 'userId' and 'movieId' columns.

    Returns:
        New DataFrame with columns 'uid' and 'sid' (in that order), one row
        per row of ``tp``, with a fresh default index.
    """
    raw_users, raw_movies = tp['userId'], tp['movieId']

    # Build concrete lists: the DataFrame constructor cannot consume lazy
    # map objects (the original code wrapped its maps in list() for this).
    uid_column = [user2id[raw_uid] for raw_uid in raw_users]
    sid_column = [show2id[raw_sid] for raw_sid in raw_movies]

    return pd.DataFrame(data={'uid': uid_column, 'sid': sid_column}, columns=['uid', 'sid'])

In [None]:
# Convert the training rows to (uid, sid) indices and write them to
# train.csv in OUTPUT_DIR, without the DataFrame index column.
train_data = numerize(train_plays)
train_data.to_csv(os.path.join(OUTPUT_DIR, 'train.csv'), index=False)

In [None]:
# Convert the kept part of the validation split to (uid, sid) indices and
# write it to validation_tr.csv in OUTPUT_DIR, without the index column.
vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(OUTPUT_DIR, 'validation_tr.csv'), index=False)

In [None]:
# Convert the held-out part of the validation split to (uid, sid) indices and
# write it to validation_te.csv in OUTPUT_DIR, without the index column.
vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(OUTPUT_DIR, 'validation_te.csv'), index=False)

In [None]:
# Convert the kept part of the test split to (uid, sid) indices and write it
# to test_tr.csv in OUTPUT_DIR, without the index column.
test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(OUTPUT_DIR, 'test_tr.csv'), index=False)

In [None]:
# Convert the held-out part of the test split to (uid, sid) indices and write
# it to test_te.csv in OUTPUT_DIR, without the index column.
test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(OUTPUT_DIR, 'test_te.csv'), index=False)