# Haddendataset preprocessing

In [66]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse
import pandas as pd
import math
import matplotlib.pyplot as plt

### Load Data

In [67]:
### Directory that contains the input file `data.csv` (read by the loading cell);
### change `DATA_DIR` if the file lives somewhere else.
DATA_DIR = './'

In [172]:
# Filtering thresholds handed to filter_triplets:
# min_user    -> min_uc: a user is kept only if it has at least this many rows.
# min_product -> min_sc: a product is kept only if it has at least this many rows.
min_user = 5
min_product = 100

In [173]:
# Read the raw log; the first CSV row supplies the column names.
raw_data_total = pd.read_csv(os.path.join(DATA_DIR, 'data.csv'), header=0)
print(raw_data_total.shape[0])

# Keep one row per distinct (users, products) pair: count the rows of every
# pair, turn the group keys back into columns, then discard the helper count.
raw_data = (
    raw_data_total
    .groupby(["users", "products"])
    .size()
    .reset_index(name='count')
    .drop(columns='count')
)
print(raw_data)
print(raw_data.shape)

539995
        users  products
0           0         0
1           0      1358
2           0      5506
3           0     17999
4           1         1
...       ...       ...
377917  61188     16774
377918  61189      1067
377919  61190       593
377920  61191        24
377921  61191        31

[377922 rows x 2 columns]
(377922, 2)


In [174]:
# Preview the first rows of the de-duplicated (users, products) pairs.
raw_data.head()

Unnamed: 0,users,products
0,0,0
1,0,1358
2,0,5506
3,0,17999
4,1,1


In [175]:
# (rows, columns) of the de-duplicated pairs.
raw_data.shape

(377922, 2)

In [176]:
def get_count(tp, id):
    """Count how many rows of ``tp`` carry each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Frame that contains the column named by ``id``.
    id : str
        Name of the column to group on. (The name shadows the builtin ``id``;
        it is kept so existing keyword callers keep working.)

    Returns
    -------
    pandas.Series
        Row counts indexed by the distinct values of ``tp[id]``; the index is
        named after the column.
    """
    # Group without ``as_index=False``: since pandas 1.1,
    # ``groupby(..., as_index=False).size()`` returns a DataFrame with a
    # positional index instead of a Series keyed by id. Callers here read the
    # ids from ``count.index`` and compare the counts to a threshold, which
    # only works with the Series form returned below on every pandas version.
    return tp.groupby(id).size()

In [177]:
def filter_triplets(tp, min_uc=0, min_sc=0):
    """Drop rows of rarely seen products, then rows of low-activity users.

    The product filter runs first and the user filter second. Removing users
    can push some products back below ``min_sc``; no second pass is made, so
    the returned data may still contain a few such products.

    Parameters
    ----------
    tp : pandas.DataFrame
        Frame with a ``'users'`` and a ``'products'`` column.
    min_uc : int, optional
        Keep a user only if it has at least this many rows (0 disables).
    min_sc : int, optional
        Keep a product only if it has at least this many rows (0 disables).

    Returns
    -------
    tuple
        ``(filtered_tp, usercount, itemcount)`` where both counts are
        recomputed with ``get_count`` on the filtered frame.
    """
    # Product filter: restrict the data to products that reach min_sc rows.
    if min_sc > 0:
        rows_per_product = get_count(tp, 'products')
        kept_products = rows_per_product.index[rows_per_product >= min_sc]
        tp = tp[tp['products'].isin(kept_products)]

    # User filter: restrict what is left to users that reach min_uc rows.
    if min_uc > 0:
        rows_per_user = get_count(tp, 'users')
        kept_users = rows_per_user.index[rows_per_user >= min_uc]
        tp = tp[tp['users'].isin(kept_users)]

    # Counts are refreshed so they describe the data actually returned.
    return tp, get_count(tp, 'users'), get_count(tp, 'products')

In [178]:
# Apply the thresholds; filter_triplets also returns the per-user and
# per-product row counts of the filtered data.
# NOTE: raw_data is rebound here, so re-running this cell filters the already
# filtered frame rather than the freshly loaded one.
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_user, min_product)

In [179]:
# Shuffle the user ids with a dedicated, seeded generator so the user order
# (and the profile2id mapping built from it) is identical on every
# Restart & Run All. The unseeded global RNG produced a different order, and
# therefore a different exported uid column, on each run.
shuffle_rng = np.random.RandomState(98765)
unique_uid = user_activity.index
idx_permperm = shuffle_rng.permutation(unique_uid.size)
unique_uid = unique_uid[idx_permperm]
print(unique_uid)

Int64Index([ 6837, 10225,  2707,  9058, 48499,  1947,  6701, 31282, 16809,
             5390,
            ...
             3412,  1014, 13571, 12075,  8271, 28495, 20021,  7018,  6218,
               16],
           dtype='int64', name='users', length=5150)


In [180]:
# Distinct product ids of the filtered data, in order of first appearance.
unique_sid = raw_data['products'].unique()

In [181]:
# Raw product id -> position of that id in unique_sid.
show2id = {sid: i for i, sid in enumerate(unique_sid)}
# Raw user id -> position of that id in the shuffled unique_uid.
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [182]:
def numerize(tp):
    """Translate the raw ids in ``tp`` into their integer indices.

    Each value of ``tp['users']`` is looked up in the module-level
    ``profile2id`` and each value of ``tp['products']`` in ``show2id``; an id
    missing from either mapping raises ``KeyError``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Frame with a ``'users'`` and a ``'products'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``'uid'`` and ``'sid'`` (in that order) and
        a fresh default index.
    """
    uid = [profile2id[user] for user in tp['users']]
    sid = [show2id[product] for product in tp['products']]
    # Prints the index assigned to the first row's user.
    print(uid[0])
    return pd.DataFrame({'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [183]:
# Directory that receives the processed output files.
pro_dir = "./"

# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check that could race with another process creating it.
os.makedirs(pro_dir, exist_ok=True)

# One product id per line, in unique_sid order, so line number i holds the
# raw id that show2id maps to i.
with open(os.path.join(pro_dir, 'unique_sid_test.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

In [184]:
def getSparsity(data):
    """Print the size of ``data`` and the fraction of user/product cells it fills.

    The number of users is the number of distinct values in ``data['users']``;
    the number of products is always ``len(unique_sid)`` (the module-level
    product list), not the products present in ``data``.

    The value labelled "sparsity" in the message is
    ``rows / (users * products)``, i.e. the filled fraction of the
    user x product matrix, reported as a percentage.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'users'`` column; one row per event.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    n_events = data.shape[0]
    n_users = get_count(data, 'users').shape[0]
    n_products = len(unique_sid)
    # The per-product count the original computed was never used, so the
    # extra groupby has been dropped.
    sparsity = 1. * n_events / (n_users * n_products)
    print("After filtering, there are %d watching events from %d users and %d products (sparsity: %.3f%%)" %
          (n_events, n_users, n_products, sparsity * 100))

In [185]:
# Report the event, user and product counts of the filtered data.
getSparsity(raw_data)

After filtering, there are 88386 watching events from 5150 users and 527 products (sparsity: 3.257%)


In [186]:
# Convert the raw ids to integer indices (numerize also prints the first uid)
# and save the uid/sid pairs without the DataFrame index column.
train_data = numerize(raw_data)
train_data.to_csv(os.path.join(pro_dir, 'train_test_h.csv'), index=False)

5018
