adding Recommendation model
izaakniksan committed Nov 13, 2018
1 parent c8bfa39 commit 9a75513
Showing 13 changed files with 870 additions and 0 deletions.
14 changes: 14 additions & 0 deletions Recommendation-NCF/PyTorch/.gitignore
@@ -0,0 +1,14 @@
#data:
ml-20m.zip
*/ml-20m/*
*.csv
#IDE folder:
*/.idea/*
__pycache__
#nvprof output
*my_nvprof_output_*
#run
*/run/*
#nvprof
nvprof_data/*
!nvprof_data/.gitkeep
16 changes: 16 additions & 0 deletions Recommendation-NCF/PyTorch/dataset/download_dataset.sh
@@ -0,0 +1,16 @@
function download_20m {
echo "Download ml-20m"
curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
}

function download_1m {
echo "Downloading ml-1m"
curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
}

if [[ $1 == "ml-1m" ]]
then
download_1m
else
download_20m
fi
44 changes: 44 additions & 0 deletions Recommendation-NCF/PyTorch/dataset/verify_dataset.sh
@@ -0,0 +1,44 @@
function get_checker {
if [[ "$OSTYPE" == "darwin"* ]]; then
checkmd5=md5
else
checkmd5=md5sum
fi

echo $checkmd5
}


function verify_1m {
    # Expected digest from: curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip.md5
    local hash="c4d9eecfca2ab87c1945afe126590906"
    local checkmd5=$(get_checker)

    # md5 (macOS) and md5sum (Linux) format their output differently, so
    # match on the digest itself rather than diffing whole output lines.
    if $checkmd5 ml-1m.zip | grep -q "$hash"
    then
        echo "PASSED"
    else
        echo "FAILED"
    fi
}

function verify_20m {
    # Expected digest from: curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip.md5
    local hash="cd245b17a1ae2cc31bb14903e1204af3"
    local checkmd5=$(get_checker)

    if $checkmd5 ml-20m.zip | grep -q "$hash"
    then
        echo "PASSED"
    else
        echo "FAILED"
    fi
}


if [[ $1 == "ml-1m" ]]
then
verify_1m
else
verify_20m
fi
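
For reference, the same integrity check can be written portably in Python. This is a minimal sketch, not part of the commit, reusing the digests quoted in the script above:

import hashlib
import sys

# Expected MD5 digests, as quoted in verify_dataset.sh
EXPECTED = {
    'ml-1m.zip': 'c4d9eecfca2ab87c1945afe126590906',
    'ml-20m.zip': 'cd245b17a1ae2cc31bb14903e1204af3',
}

def md5sum(path, chunk_size=1 << 20):
    # Hash incrementally so the multi-hundred-MB zip never sits in memory
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

if __name__ == '__main__':
    name = sys.argv[1] if len(sys.argv) > 1 else 'ml-20m.zip'
    print('PASSED' if md5sum(name) == EXPECTED[name] else 'FAILED')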
1 change: 1 addition & 0 deletions Recommendation-NCF/PyTorch/nvprof_data/.gitkeep
@@ -0,0 +1 @@
Placeholder file to commit empty folder
84 changes: 84 additions & 0 deletions Recommendation-NCF/PyTorch/scripts/benchmark-ncf.sh
@@ -0,0 +1,84 @@
#!/bin/bash
# source scripts/benchmark-ncf.sh # Train
# source scripts/benchmark-ncf.sh --profile # Profile for compute utilization
# source scripts/benchmark-ncf.sh --profile-fp32 # Profile for fp32 utilization

# Using a seed of 1
seed=1
skip_data=0
THRESHOLD=0.635

if [ $# -eq 0 ]
then
echo "nvprof disabled"
COMMAND="python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers 256 256 128 64 -f 64 --seed $seed \
--threshold $THRESHOLD --processes 1 --workers 0"

elif [ "$1" == "--profile" ]
then
echo "nvprof is profiling compute utilization"
COMMAND="nvprof --profile-from-start off --export-profile nvprof_data/compute_utilization%p.nvvp --print-summary \
python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers 256 256 128 64 -f 64 --seed $seed \
--threshold $THRESHOLD --processes 1 --workers 0 --profile"

elif [ "$1" == "--profile-fp32" ]
then
echo "nvprof is profiling fp32 utilization"
COMMAND="nvprof --profile-from-start off --metrics single_precision_fu_utilization --export-profile \
nvprof_data/fp32_utilization%p.nvvp --print-summary python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers \
256 256 128 64 -f 64 --seed $seed --threshold $THRESHOLD --processes 1 --workers 0 --profile"

else
    echo "Invalid input argument. Valid options are --profile and --profile-fp32."
    return 1
fi


# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

echo "unzip ml-20m.zip"
if unzip ml-20m.zip
then
if [ $skip_data -eq 0 ]
then
echo "Start processing ml-20m/ratings.csv"
t0=$(date +%s)
python ./source/convert.py ml-20m/ratings.csv ml-20m --negatives 999
t1=$(date +%s)
delta=$(( $t1 - $t0 ))
echo "Finish processing ml-20m/ratings.csv in $delta seconds"
else
echo "Skipped data processing"
fi

echo "Start training"
t0=$(date +%s)
$COMMAND
t1=$(date +%s)
delta=$(( $t1 - $t0 ))
echo "Finish training in $delta seconds"

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"


# report result
result=$(( $end - $start ))
result_name="recommendation"


echo "RESULT,$result_name,$seed,$result,$USER,$start_fmt"
else
    echo "Problem unzipping ml-20m.zip"
    echo "Please run 'download_dataset.sh && verify_dataset.sh' first"
fi
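
ncf.py itself is not among the files shown here, but the flags hint at the model shape: -f 64 predictive factors and an MLP tower of widths 256 256 128 64. As a rough sketch only, following the standard NCF/NeuMF design rather than this repo's actual code (all names below are assumptions):

import torch
import torch.nn as nn

class NeuMF(nn.Module):
    # Hypothetical sketch: a GMF branch plus an MLP tower, fused by one linear layer
    def __init__(self, nb_users, nb_items, mf_dim=64,
                 mlp_layer_sizes=(256, 256, 128, 64)):
        super().__init__()
        self.mf_user = nn.Embedding(nb_users, mf_dim)
        self.mf_item = nn.Embedding(nb_items, mf_dim)
        # The first MLP layer consumes concatenated user/item embeddings
        self.mlp_user = nn.Embedding(nb_users, mlp_layer_sizes[0] // 2)
        self.mlp_item = nn.Embedding(nb_items, mlp_layer_sizes[0] // 2)
        layers = []
        for n_in, n_out in zip(mlp_layer_sizes[:-1], mlp_layer_sizes[1:]):
            layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.mlp = nn.Sequential(*layers)
        self.final = nn.Linear(mf_dim + mlp_layer_sizes[-1], 1)

    def forward(self, user, item):
        gmf = self.mf_user(user) * self.mf_item(item)  # element-wise GMF branch
        mlp = self.mlp(torch.cat((self.mlp_user(user), self.mlp_item(item)), dim=-1))
        return torch.sigmoid(self.final(torch.cat((gmf, mlp), dim=-1)))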
101 changes: 101 additions & 0 deletions Recommendation-NCF/PyTorch/source/convert.py
@@ -0,0 +1,101 @@
import os
from argparse import ArgumentParser
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from load import implicit_load


MIN_RATINGS = 20


USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'


TRAIN_RATINGS_FILENAME = 'train-ratings.csv'
TEST_RATINGS_FILENAME = 'test-ratings.csv'
TEST_NEG_FILENAME = 'test-negative.csv'


def parse_args():
parser = ArgumentParser()
parser.add_argument('path', type=str,
help='Path to reviews CSV file from MovieLens')
parser.add_argument('output', type=str,
help='Output directory for train and test CSV files')
    parser.add_argument('-n', '--negatives', type=int, default=999,
                        help='Number of negative samples for each positive'
                             ' test example')
parser.add_argument('-s', '--seed', type=int, default=0,
help='Random seed to reproduce same negative samples')
return parser.parse_args()


def main():
args = parse_args()
np.random.seed(args.seed)

print("Loading raw data from {}".format(args.path))
df = implicit_load(args.path, sort=False)
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

print("Mapping original user and item IDs to new sequential IDs")
original_users = df[USER_COLUMN].unique()
original_items = df[ITEM_COLUMN].unique()

user_map = {user: index for index, user in enumerate(original_users)}
item_map = {item: index for index, item in enumerate(original_items)}

df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

assert df[USER_COLUMN].max() == len(original_users) - 1
assert df[ITEM_COLUMN].max() == len(original_items) - 1

print("Creating list of items for each user")
# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)
all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
user_to_items = defaultdict(list)
for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN)) # noqa: E501

test_ratings = []
test_negs = []
all_items = set(range(len(original_items)))
print("Generating {} negative samples for each user"
.format(args.negatives))
for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)): # noqa: E501
test_item = user_to_items[user].pop()

all_ratings.remove((user, test_item))
all_negs = all_items - set(user_to_items[user])
all_negs = sorted(list(all_negs)) # determinism

test_ratings.append((user, test_item))
test_negs.append(list(np.random.choice(all_negs, args.negatives)))

print("Saving train and test CSV files to {}".format(args.output))
df_train_ratings = pd.DataFrame(list(all_ratings))
df_train_ratings['fake_rating'] = 1
df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
index=False, header=False, sep='\t')

df_test_ratings = pd.DataFrame(test_ratings)
df_test_ratings['fake_rating'] = 1
df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
index=False, header=False, sep='\t')

df_test_negs = pd.DataFrame(test_negs)
df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
index=False, header=False, sep='\t')


if __name__ == '__main__':
main()
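
All three outputs are headerless, tab-separated files. A quick way to sanity-check the format after a run (a sketch; the paths assume the ml-20m output directory used by benchmark-ncf.sh):

import pandas as pd

# train-ratings.csv: one user_id \t item_id \t fake_rating triple per line
train = pd.read_csv('ml-20m/train-ratings.csv', sep='\t', header=None,
                    names=['user_id', 'item_id', 'fake_rating'])

# test-negative.csv: one row per user holding the 999 sampled negative item ids
negs = pd.read_csv('ml-20m/test-negative.csv', sep='\t', header=None)

print(train.head())
print(negs.shape)  # expect (number of users, 999)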
57 changes: 57 additions & 0 deletions Recommendation-NCF/PyTorch/source/dataset.py
@@ -0,0 +1,57 @@
import numpy as np
import scipy
import scipy.sparse
import torch
import torch.utils.data


class CFTrainDataset(torch.utils.data.dataset.Dataset):
def __init__(self, train_fname, nb_neg):
self._load_train_matrix(train_fname)
self.nb_neg = nb_neg

def _load_train_matrix(self, train_fname):
def process_line(line):
tmp = line.split('\t')
return [int(tmp[0]), int(tmp[1]), float(tmp[2]) > 0]
with open(train_fname, 'r') as file:
data = list(map(process_line, file))
self.nb_users = max(data, key=lambda x: x[0])[0] + 1
self.nb_items = max(data, key=lambda x: x[1])[1] + 1

self.data = list(filter(lambda x: x[2], data))
self.mat = scipy.sparse.dok_matrix(
(self.nb_users, self.nb_items), dtype=np.float32)
for user, item, _ in data:
self.mat[user, item] = 1.

def __len__(self):
return (self.nb_neg + 1) * len(self.data)

    def __getitem__(self, idx):
        # Indices interleave positives and negatives: every (nb_neg + 1)-th
        # index returns a real rating, the rest return sampled negatives for
        # the same training position.
        if idx % (self.nb_neg + 1) == 0:
            idx = idx // (self.nb_neg + 1)
            return self.data[idx][0], self.data[idx][1], np.ones(1, dtype=np.float32)  # noqa: E501
        else:
            idx = idx // (self.nb_neg + 1)
            u = self.data[idx][0]
            # Rejection-sample until we draw an item the user has not rated
            j = torch.LongTensor(1).random_(0, self.nb_items).item()
            while (u, j) in self.mat:
                j = torch.LongTensor(1).random_(0, self.nb_items).item()
            return u, j, np.zeros(1, dtype=np.float32)


def load_test_ratings(fname):
def process_line(line):
tmp = map(int, line.split('\t')[0:2])
return list(tmp)
ratings = map(process_line, open(fname, 'r'))
return list(ratings)


def load_test_negs(fname):
def process_line(line):
tmp = map(int, line.split('\t'))
return list(tmp)
negs = map(process_line, open(fname, 'r'))
return list(negs)
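
A minimal usage sketch (path, nb_neg, and batch size are placeholders): the dataset serves positives and sampled negatives pre-mixed, so no separate negative file is needed during training.

import torch.utils.data

# With nb_neg=4, __len__ reports (4 + 1) * number-of-positives examples
train_dataset = CFTrainDataset('ml-20m/train-ratings.csv', nb_neg=4)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2048, shuffle=True, num_workers=0)

for user, item, label in train_loader:
    # user and item collate to LongTensors of ids; label is 1.0 for
    # positives and 0.0 for sampled negatives, with shape (batch, 1)
    break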
68 changes: 68 additions & 0 deletions Recommendation-NCF/PyTorch/source/load.py
@@ -0,0 +1,68 @@
from collections import namedtuple

import pandas as pd


RatingData = namedtuple('RatingData',
['items', 'users', 'ratings', 'min_date', 'max_date'])


def describe_ratings(ratings):
info = RatingData(items=len(ratings['item_id'].unique()),
users=len(ratings['user_id'].unique()),
ratings=len(ratings),
min_date=ratings['timestamp'].min(),
max_date=ratings['timestamp'].max())
print("{ratings} ratings on {items} items from {users} users"
" from {min_date} to {max_date}"
.format(**(info._asdict())))
return info


def process_movielens(ratings, sort=True):
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
if sort:
ratings.sort_values(by='timestamp', inplace=True)
describe_ratings(ratings)
return ratings


def load_ml_100k(filename, sort=True):
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(filename, sep='\t', names=names)
return process_movielens(ratings, sort=sort)


def load_ml_1m(filename, sort=True):
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
return process_movielens(ratings, sort=sort)


def load_ml_10m(filename, sort=True):
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
return process_movielens(ratings, sort=sort)


def load_ml_20m(filename, sort=True):
ratings = pd.read_csv(filename, dtype={'userId': int, 'movieId': int, 'timestamp': int})
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
names = {'userId': 'user_id', 'movieId': 'item_id'}
ratings.rename(columns=names, inplace=True)
return process_movielens(ratings, sort=sort)


DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k]


def get_dataset_name(filename):
for dataset in DATASETS:
if dataset in filename.replace('-', '_').lower():
return dataset
raise NotImplementedError


def implicit_load(filename, sort=True):
func = globals()["load_" + get_dataset_name(filename)]
return func(filename, sort=sort)
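
Dispatch is name-based: get_dataset_name searches the path for a known dataset name, so a call like the following (the path is a placeholder) routes to load_ml_20m:

# 'ml-20m' in the path selects load_ml_20m via the DATASETS list
ratings = implicit_load('ml-20m/ratings.csv', sort=False)
print(ratings.columns.tolist())  # ['user_id', 'item_id', 'rating', 'timestamp']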