In [55]:
# Generate dataset: Preprocess the raw dataset to the desired format
# training example: user_vocab_id, movie_vocab_id, category_vocab_id(1,...,n) | ratings

In [16]:
import os
import shutil
import csv
import numpy as np
import collections
import random

In [17]:
# step 1: build the vocabularies from ../raw_data/movies.dat
#-> generate (category, category_vocab_id) pairs
#-> generate (movie_id, category_vocab_id) pairs
#-> generate (movie_id, movie_vocab_id) pairs
# Also write ../data/movies.dat (each raw row prefixed with its movie_vocab_id)
# and copy users.dat to ../data/.
category_to_category_vocab_id = {}
movie_id_to_movie_vocab_id = {}
movie_id_to_category_vocab_id = collections.defaultdict(list)
category_to_category_vocab_id_cnt = 0
movie_id_to_movie_vocab_id_cnt = 0

# Create the output directory up front so the writes below cannot fail on a
# fresh checkout where ../data does not exist yet.
os.makedirs('../data', exist_ok=True)

##########
# Note that we change the format of movies.dat by adding movie_vocab_id info.
# Normally we don't change raw data.
##########
# Both files are managed by one `with` so the output handle is closed (and
# flushed) even if a malformed row raises while unpacking.
with open('../raw_data/movies.dat', 'r') as file, \
        open('../data/movies.dat', 'w') as fout:
    for row in csv.reader(file, delimiter='^'):
        movie_id, movie_name, categories = row

        # Vocabulary ids start at 1 and are assigned in order of first appearance.
        if movie_id not in movie_id_to_movie_vocab_id:
            movie_id_to_movie_vocab_id_cnt += 1
            movie_id_to_movie_vocab_id[movie_id] = movie_id_to_movie_vocab_id_cnt

        for category in categories.split("|"):
            # An empty category field yields "" after the split; skip it.
            if category == "":
                continue
            if category not in category_to_category_vocab_id:
                category_to_category_vocab_id_cnt += 1
                category_to_category_vocab_id[category] = category_to_category_vocab_id_cnt
            movie_id_to_category_vocab_id[movie_id].append(category_to_category_vocab_id[category])

        # Output row: movie_vocab_id ^ movie_id ^ movie_name ^ categories
        new_row = [str(movie_id_to_movie_vocab_id[movie_id])] + row
        fout.write('^'.join(new_row) + '\n')

shutil.copyfile('../raw_data/users.dat', '../data/users.dat')

'../data/users.dat'

In [18]:
# step 2: turn every rating into one example and order the examples by timestamp
#################################################################
# user_vocab_id : movie_vocab_id : category_vocab_ids : label : timestamp
#################################################################
examples = []
with open('../raw_data/ratings.dat', 'r') as ratings_file:
    for user_vocab_id, movie_id, rating, timestamp in csv.reader(ratings_file, delimiter='^'):
        movie_vocab_id = movie_id_to_movie_vocab_id[movie_id]
        # Plain indexing on the defaultdict is intentional: a movie without any
        # category yields an empty list (and gets an entry in the dict).
        category_fields = [str(category_vocab_id)
                           for category_vocab_id in movie_id_to_category_vocab_id[movie_id]]
        examples.append(
            [user_vocab_id, str(movie_vocab_id)] + category_fields + [rating, timestamp]
        )
    total_n_examples = len(examples)

# The timestamp is the last field of every example; sorted() is stable, so
# examples with equal timestamps keep their order from ratings.dat.
examples = sorted(examples, key=lambda fields: int(fields[-1]))

# write examples to data.dat, one ":"-joined example per line
with open("../data/data.dat", "w") as outF:
    outF.writelines(":".join(example) + "\n" for example in examples)



In [19]:
# step 3: write metrics to meta_data.dat
######meta_data.dat########
# num_user:69280
# num_movie:36365
# num_category:28
# user1:cnt1
# user2:cnt2
# user3:cnt3
# ...
######meta_data.dat########

num_user = 0
# Count every movie that received a movie_vocab_id in step 1.
# movie_id_to_category_vocab_id is a defaultdict that only holds movies with
# at least one category (plus any movie looked up in step 2), so its length
# can be smaller than the largest movie_vocab_id.
num_movie = len(movie_id_to_movie_vocab_id)
num_category = len(category_to_category_vocab_id)

# num_user ends up as the first column of the LAST row of users.dat (0 if the
# file is empty).
# NOTE(review): this presumably relies on users.dat being ordered with
# contiguous ids so the last id equals the number of users -- confirm.
with open('../raw_data/users.dat', newline='') as file:
    for row in csv.reader(file, delimiter='^'):
        user_id, user_vocab_id = row
        num_user = user_id

# Number of examples per user, keyed by the integer in the first field of each
# data.dat line; insertion order follows first appearance in data.dat.
per_user_cnt = {}
with open("../data/data.dat", "r") as file:
    for line in file:
        user = int(line.split(":", 1)[0])
        per_user_cnt[user] = per_user_cnt.get(user, 0) + 1

# Opening with "w" truncates any previous meta_data.dat before writing.
with open("../data/meta_data.dat", "w") as file:
    for key, value in (('num_user', num_user),
                       ('num_movie', num_movie),
                       ('num_category', num_category)):
        file.write(key + ":" + str(value) + "\n")
    for user, cnt in per_user_cnt.items():
        file.write(str(user) + ":" + str(cnt) + '\n')

In [20]:
# step 4: split data into training data and testing data via timestamp
# The split is done per user: data.dat is sorted by timestamp, and for each
# user the first int(split_ratio * cnt) examples go to train_data.dat while
# the remaining ones go to test_data.dat. So, for any given user, no test
# example is older than that user's training examples (avoids data leakage).
# Requires per_user_cnt (examples per user) computed in step 3.

split_ratio = 0.8
train_per_user_cnt = {user: int(split_ratio * cnt) for user, cnt in per_user_cnt.items()}
# How many examples of each user have been consumed so far.
iter_per_user_cnt = dict.fromkeys(per_user_cnt, 0)

# A single `with` guarantees all three handles are closed and the outputs are
# flushed even if a line fails to parse. Mode 'w' truncates old outputs.
with open('../data/data.dat', 'r') as fin, \
        open('../data/train_data.dat', 'w') as fout_train, \
        open('../data/test_data.dat', 'w') as fout_test:
    for row in fin:
        user = int(row.split(":", 1)[0])
        cur_cnt = iter_per_user_cnt[user]
        # Rows keep their trailing newline, so they are written back verbatim.
        if cur_cnt < train_per_user_cnt[user]:
            fout_train.write(row)
        else:
            fout_test.write(row)
        iter_per_user_cnt[user] = cur_cnt + 1



In [21]:
####--- MUST RUN TOGETHER WITH STEP 3 ---####
# NOTE(review): this cell only reads train_data.dat / test_data.dat, which are
# written by step 4 -- the header above probably means step 4; confirm.
# step 5: clean the test_data.dat
# Drop every test example whose user id or movie id never appears in the
# training set.

# Ids are kept as strings, exactly as they appear in the files.
user_vocab_id_set, movie_vocab_id_set = set(), set()
with open('../data/train_data.dat', 'r') as train_file:
    for row in csv.reader(train_file, delimiter=':'):
        user_vocab_id_set.add(row[0])
        movie_vocab_id_set.add(row[1])

skip_cnt = 0
test_cnt = 0
# The filtered copy is written inside a `with` so it is fully flushed and
# closed BEFORE it replaces test_data.dat below; renaming a still-open,
# buffered file could lose the tail of the data (and fails on Windows).
with open('../data/test_data.dat', 'r') as test_file, \
        open('../data/test_data_temp.dat', 'w') as fout_test:
    for row in csv.reader(test_file, delimiter=':'):
        user_vocab_id = row[0]
        movie_vocab_id = row[1]
        test_cnt += 1
        if user_vocab_id not in user_vocab_id_set or movie_vocab_id not in movie_vocab_id_set:
            skip_cnt += 1
            continue
        fout_test.write(":".join(row) + '\n')
print(test_cnt, skip_cnt)

# os.replace overwrites the destination in one step, so there is no window in
# which test_data.dat is missing.
os.replace("../data/test_data_temp.dat", "../data/test_data.dat")

218299 34253
