# Loss

In [68]:
# https://github.com/Parasgr7/Movie-Recommendation-System
# AutoEncoders

import numpy as np
import pandas as pd
import random

In [51]:
# UserID::Gender::Age::Occupation::Zip-code
# MovieID::Title::Genres
# UserID::MovieID::Rating::Timestamp (5-star scale)

# Importing the dataset
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [52]:
num_users = len(users)
num_movies= len(movies)

In [43]:
train_lst = []
val_lst   = []
test_lst  = []

In [44]:
for uid in range(num_users):
    watches = ratings.loc[ratings[0] == uid]
    
    train_lst.append(watches.iloc[:int(len(watches)*0.7)])
    val_lst.append(watches.iloc[int(len(watches)*0.7):int(len(watches)*0.8)])
    test_lst.append(watches.iloc[int(len(watches)*0.8):])

In [45]:
train = pd.concat(train_lst)
val   = pd.concat(val_lst)
test  = pd.concat(test_lst)
train.to_pickle('./data/ml/train.pkl')
val.to_pickle('./data/ml/val.pkl')
test.to_pickle('./data/ml/test.pkl')

In [47]:
num_users, num_movies

(6040, 3883)

In [58]:
num_users  = int(max(max(train.values[:,0]), max(val.values[:,0]), max(test.values[:,0]))) + 1
num_movies = int(max(max(train.values[:,1]), max(val.values[:,1]), max(test.values[:,1]))) + 1
num_users, num_movies

(6040, 3953)

# Hit ratio / NDCG

In [91]:
train_lst = []
val_lst   = []
test_lst  = []
neg_lst  = []

In [92]:
for uid in range(1, num_users+1):
    watches = ratings.loc[ratings[0] == uid]
    
    watched = watches[1].values.tolist()
    unwatch = set(range(1, num_movies+1)) - set(watched)
    
    ns_list = random.sample(unwatch, 100)
    
    train_lst.append(watches.iloc[:-2])
    val_lst.append(watches.iloc[-2])
    test_lst.append(watches.iloc[-1])
    neg_lst.append(list(ns_list))

In [93]:
train = pd.concat(train_lst)
val   = pd.concat(val_lst, 1).T
test  = pd.concat(test_lst, 1).T

train.to_pickle('./data/ml/train_score.pkl')
val.to_pickle('./data/ml/val_score.pkl')
test.to_pickle('./data/ml/test_score.pkl')
np.save('./data/ml/neg_score.npy', neg_lst)

# Yelp

In [44]:
import pandas as pd
col_names = ['user', 'item', 'rate', 'timestamp']
ratings = pd.read_csv('./yelp/yelp.rating', sep='\t', header=None, names = col_names)
num_item = 25815
num_user = 25677
less_user = []
less_item = []

In [45]:
flag = True
while flag:
    flag = False
    for user, count in ratings.groupby(['user']):
        if len(count) < 10:
            less_user.append(user)
            ratings = ratings[ratings.user != user]
            flag = True

    for item, count in ratings.groupby(['item']):
        if len(count) < 10:
            less_item.append(item)
            ratings = ratings[ratings.item != item]
            flag = True

In [46]:
len(less_user), len(less_item)

(2608, 10239)

In [60]:
train_lst = []
val_lst   = []
test_lst  = []
neg_lst  = []

In [61]:
import random

for uid in range(1, num_user+1):
    try:
        watches = ratings.loc[ratings['user'] == uid]
    except:
        continue
    
    watched = watches['item'].values.tolist()
    unwatch = set(range(1, num_item+1)) - set(watched) - set(less_item)
    
    ns_list = random.sample(unwatch, 100)
    
    train_lst.append(watches.iloc[:int(len(watches)*0.7)])
    val_lst.append(watches.iloc[int(len(watches)*0.7):int(len(watches)*0.8)])
    test_lst.append(watches.iloc[int(len(watches)*0.8):])

    neg_lst.append(list(ns_list))

In [63]:
import numpy as np
train = pd.concat(train_lst)
val   = pd.concat(val_lst)
test  = pd.concat(test_lst)
train.to_pickle('./data/yelp/train.pkl')
val.to_pickle('./data/yelp/val.pkl')
test.to_pickle('./data/yelp/test.pkl')
np.save('./data/yelp/neg_score.npy', neg_lst)

In [64]:
train

Unnamed: 0,user,item,rate,timestamp
69,1,66,4.0,1389974400
70,1,67,3.0,1409673600
71,1,68,3.0,1406908800
72,1,69,3.0,1388764800
74,1,71,5.0,1389801600
75,1,72,4.0,1409673600
76,1,73,4.0,1372694400
77,1,73,2.0,1394121600
78,1,74,2.0,1389456000
79,1,75,4.0,1389456000
