In [7]:
import math
import os
import random
import pickle
import argparse
from collections import deque
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import IterableDataset, DataLoader, get_worker_info
import matplotlib.pyplot as plt


class GetTriplePair(IterableDataset):
    # for ml-1m we load in 3760 item 6040 user and 994169 train pair
    def __init__(self, item_size, user_list, pair, shuffle, num_epochs):
        self.item_size = item_size
        self.user_list = user_list
        self.pair = pair
        self.shuffle = shuffle
        self.num_epochs = num_epochs

    def __iter__(self):
        self.example_size = self.num_epochs * len(self.pair)
        self.example_index_queue = deque([])
        self.seed = 0
        self.start_list_index = None
        self.num_workers = 1
        self.index = 0
        return self

    def __next__(self):
        if self.index >= self.example_size:
            raise StopIteration
        # If `example_index_queue` is used up, replenish this list.
        while len(self.example_index_queue) == 0:
            index_list = list(range(len(self.pair)))
            if self.shuffle:
                random.Random(self.seed).shuffle(index_list)
                self.seed += 1
            if self.start_list_index is not None:
                index_list = index_list[self.start_list_index::self.num_workers]

                # Calculate next start index
                self.start_list_index = (self.start_list_index + (self.num_workers - (len(self.pair) % self.num_workers))) % self.num_workers
            self.example_index_queue.extend(index_list)
        result = self._example(self.example_index_queue.popleft())
        self.index += self.num_workers
        return result

    def _example(self, idx):
        # in a train pair, format = (u,i), j = a random item which does not exist in user u's list of items
        u = self.pair[idx][0]
        i = self.pair[idx][1]
        j = np.random.randint(self.item_size)
        while j in self.user_list[u]:
            j = np.random.randint(self.item_size)
        return u, i, j

In [13]:
class DatasetLoader(object):
    def load(self):
        """Minimum condition for dataset:
          * All users must have at least one item record.
          * All items must have at least one user record.
        """
        raise NotImplementedError


class MovieLens1M(DatasetLoader):
    def __init__(self, data_dir):
        self.train_fpath = os.path.join(data_dir, 'train_df.csv')
        self.test_fpath = os.path.join(data_dir, 'test_df.csv')

    def load(self):
        # Load data
        train_df = pd.read_csv(self.train_fpath,
                         sep=',',
                         engine='python',
                         names=['user', 'item', 'rate', 'time','gender','age']).reset_index(drop=True)
        # TODO: Remove negative rating?
        # df = df[df['rate'] >= 3]
        test_df = pd.read_csv(self.test_fpath,
                         sep=',',
                         engine='python',
                         names=['user', 'item', 'rate', 'time','gender','age']).reset_index(drop=True)
        # TODO: Remove negative rating?
        # df = df[df['rate'] >= 3]
        return train_df, test_df

In [14]:
def create_user_list(df, user_size):
    user_list = [list() for u in range(user_size)]
    for row in df.itertuples():
        user_list[row.user].append((row.time, row.item))
    return user_list

In [15]:
def create_pair(user_list):
    pair = []
    for user, item_list in enumerate(user_list):
        pair.extend([(user, item) for item in item_list])
    return pair

In [16]:
train_df, test_df = MovieLens1M('Data').load()

In [17]:
train_df

Unnamed: 0,user,item,rate,time,gender,age
0,2847,1120,3,1028961393,M,18
1,3887,471,4,965807659,M,18
2,5430,1821,5,960073693,F,45
3,5645,861,5,958882291,M,25
4,3110,541,5,969483887,M,25
...,...,...,...,...,...,...
796388,1556,1391,3,974996656,M,18
796389,307,36,5,976485688,M,50
796390,1941,3256,5,974852150,M,35
796391,2902,3260,3,971857926,M,25


In [18]:
all_df = pd.concat([train_df, test_df]).drop_duplicates()

In [47]:
all_df['user'] = all_df['user']-1
train_df['user'] = train_df['user']-1
test_df['user'] = test_df['user']-1

In [48]:
train_user_size = len(train_df['user'].unique())
test_user_size = len(test_df['user'].unique())
user_size = len(all_df['user'].unique())
item_size = len(all_df['item'].unique())

In [49]:
item_size

3043

In [50]:
train_user_list = create_user_list(train_df, train_user_size)
test_user_list = create_user_list(test_df, test_user_size)
test_user_list = [list(map(lambda x: x[1], l)) for l in test_user_list]
train_user_list = [list(map(lambda x: x[1], l)) for l in train_user_list]

In [51]:
train_pair = create_pair(train_user_list)


In [52]:
train_pair

[(0, 3107),
 (0, 1103),
 (0, 3035),
 (0, 593),
 (0, 3809),
 (0, 1084),
 (0, 1408),
 (0, 368),
 (0, 1385),
 (0, 1955),
 (0, 3578),
 (0, 1198),
 (0, 589),
 (0, 1357),
 (0, 2268),
 (0, 3108),
 (0, 647),
 (0, 2427),
 (0, 780),
 (0, 3147),
 (0, 1610),
 (0, 1259),
 (0, 2002),
 (0, 1954),
 (0, 3418),
 (0, 2396),
 (0, 1196),
 (0, 265),
 (0, 457),
 (0, 1962),
 (0, 2194),
 (0, 2278),
 (0, 1207),
 (0, 1945),
 (0, 3451),
 (0, 498),
 (0, 3654),
 (0, 2126),
 (0, 1784),
 (0, 459),
 (0, 3735),
 (0, 982),
 (0, 2943),
 (0, 515),
 (0, 1244),
 (0, 3105),
 (0, 2728),
 (0, 349),
 (0, 21),
 (0, 163),
 (0, 95),
 (0, 318),
 (0, 590),
 (0, 2359),
 (0, 3678),
 (0, 2321),
 (0, 648),
 (0, 1552),
 (0, 1253),
 (0, 3699),
 (0, 165),
 (0, 1265),
 (0, 2858),
 (0, 3255),
 (0, 3334),
 (0, 736),
 (0, 1687),
 (0, 1690),
 (0, 3030),
 (0, 2353),
 (0, 3257),
 (0, 3071),
 (0, 1801),
 (0, 1953),
 (0, 434),
 (0, 1090),
 (0, 1537),
 (0, 3893),
 (0, 1372),
 (0, 2916),
 (0, 3471),
 (0, 1124),
 (0, 2881),
 (0, 1213),
 (0, 2852),
 (0