In [None]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
from datetime import datetime, timedelta

### download dataset

In [None]:
%%bash
# Fetch and unpack the MovieLens 20M archive into the current directory.
# The cell is safe to re-run: the download is skipped when the archive is
# already present, and `unzip -n` never overwrites (or prompts about)
# files that were extracted by an earlier run.
set -e  # stop before unzip if the download fails
if [ ! -f ml-20m.zip ]; then
    wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
fi
unzip -n ml-20m.zip

In [None]:
# Load the raw ratings and turn the epoch-second `timestamp` column into
# pandas datetimes in a single chained expression.
rating_df = (
    pd.read_csv('./ml-20m/ratings.csv')
    .assign(timestamp=lambda ratings: pd.to_datetime(ratings['timestamp'], unit='s'))
)

### truncate dataset

In [None]:
def truncate_dataset(year, month):
    """Save the ratings newer than the first day of ``year``-``month`` as a .npy file.

    Reads the module-level ``rating_df`` (columns ``userId``, ``movieId`` and a
    datetime ``timestamp``), keeps the rows whose timestamp is strictly later
    than ``datetime(year, month, 1)``, re-indexes users and movies to
    consecutive integers starting at 0 (in order of first appearance in the
    filtered frame), sorts the rows newest-first and writes them as a
    structured array with fields ``user_id``, ``item_id`` and ``timestamp``
    (day resolution) to ``../dataset/user_data_truncated_{year}_{month}.npy``.

    Args:
        year: calendar year of the cut-off date.
        month: calendar month of the cut-off date.

    Returns:
        None. The array is written to disk and the user/item counts are printed.
    """
    truncated_df = rating_df[rating_df["timestamp"] > datetime(year, month, 1)]

    # Dense id mappings: original id -> 0-based index, in order of first
    # appearance in the filtered (not yet sorted) frame.
    truncated_user_id = {
        u_id: idx for idx, u_id in enumerate(truncated_df.userId.unique())
    }
    truncated_movie_id = {
        m_id: idx for idx, m_id in enumerate(truncated_df.movieId.unique())
    }
    max_user_len = len(truncated_user_id)
    max_movie_len = len(truncated_movie_id)

    print("max_user:", max_user_len, "max_item:", max_movie_len)
    # sort_values returns a new frame, so the column assignments below do not
    # touch rating_df.
    truncated_df = truncated_df.sort_values(by="timestamp", ascending=False)  # newest first
    truncated_df["movieId"] = truncated_df["movieId"].map(truncated_movie_id)
    truncated_df["userId"] = truncated_df["userId"].map(truncated_user_id)

    df = truncated_df[["userId", "movieId", "timestamp"]]
    record_dtype = np.dtype(
        [('user_id', '<i4'), ('item_id', '<i4'), ('timestamp', 'datetime64[D]')]
    )
    a = np.array([tuple(i) for i in df.values], dtype=record_dtype)

    # Create the output folder on demand so the function does not depend on a
    # previous cell having created it.
    os.makedirs('../dataset', exist_ok=True)
    np.save('../dataset/user_data_truncated_{}_{}.npy'.format(year, month), a)

### generate training records with start dates ranging from 2010-01-01 to 2014-01-01, at 6-month intervals

In [None]:
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder exists from an earlier run.
os.makedirs("../dataset/", exist_ok=True)

# One snapshot per cut-off date, from 2014-01 back to 2010-01 in 6-month steps.
truncate_dataset(2014, 1)
for year in [2013, 2012, 2011, 2010]:
    for month in [7, 1]:
        truncate_dataset(year, month)

### generate test dates

In [None]:
test_date = datetime(2015,1,1).date()
max_date = datetime(2015,3,31).date()
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder exists from an earlier run.
os.makedirs("../configs/", exist_ok=True)
with open('../configs/test_dates.txt', 'w') as f:
    # Write one date per line (YYYY-MM-DD), stepping a week at a time, for as
    # long as the date one week later still falls before max_date.
    while test_date + timedelta(days=7) < max_date:
        f.write("%s\n" % test_date)
        test_date = test_date + timedelta(days=7)

Note: by the end of this notebook, you should have two folders, named "dataset" and "configs", under your project folder.