-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c8bfa39
commit 9a75513
Showing
13 changed files
with
870 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# data:
ml-20m.zip
*/ml-20m/*
*.csv
# IDE folder:
*/.idea/*
__pycache__
# nvprof output
# NOTE(review): "nvpof" looks like a typo for "nvprof" — confirm the actual
# output filenames before renaming this pattern.
*my_nvpof_output_*
# run
*/run/*
# nvprof
nvprof_data/*
!nvprof_data/.gitkeep
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Download a MovieLens dataset zip into the current directory.
# Usage: download_data.sh [ml-1m]   (anything else downloads ml-20m)

function download_20m {
    # Fixed message for consistency with download_1m ("Downloading ...").
    echo "Downloading ml-20m"
    curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
}

function download_1m {
    echo "Downloading ml-1m"
    curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
}

if [[ $1 == "ml-1m" ]]
then
    download_1m
else
    download_20m
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Verify a downloaded MovieLens zip against its published MD5 checksum.
# Usage: verify_dataset.sh [ml-1m]   (anything else verifies ml-20m)

# Print the name of the MD5 tool for this platform
# (BSD `md5` on macOS, GNU `md5sum` elsewhere).
function get_checker {
    if [[ "$OSTYPE" == "darwin"* ]]; then
        checkmd5=md5
    else
        checkmd5=md5sum
    fi

    echo $checkmd5
}

# Verify one file: $1 = zip name, $2 = expected 32-char hex digest.
# Prints PASSED or FAILED.
#
# The original compared `diff` output against a saved process-substitution
# path (`hash=<(echo ...)`): that /dev/fd path is no longer open when the
# later `diff` runs, and the BSD "MD5 (f) = <hash>" text never matches
# `md5sum`'s "<hash>  f" format on Linux anyway. Comparing only the
# extracted hex digest works with both tools.
function verify_hash {
    local file=$1
    local expected=$2
    local checkmd5=$(get_checker)
    local actual=$($checkmd5 "$file" | grep -oE '[0-9a-f]{32}' | head -n 1)
    if [[ "$actual" == "$expected" ]]
    then
        echo "PASSED"
    else
        echo "FAILED"
    fi
}

function verify_1m {
    # From: curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip.md5
    verify_hash ml-1m.zip c4d9eecfca2ab87c1945afe126590906
}

function verify_20m {
    # From: curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip.md5
    verify_hash ml-20m.zip cd245b17a1ae2cc31bb14903e1204af3
}

if [[ $1 == "ml-1m" ]]
then
    verify_1m
else
    verify_20m
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Placeholder file to commit empty folder |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/bin/bash
# Train / profile the NCF reference implementation.
# source scripts/benchmark-ncf.sh                 # Train
# source scripts/benchmark-ncf.sh --profile       # Profile for compute utilization
# source scripts/benchmark-ncf.sh --profile-fp32  # Profile for fp32 utilization

# Using a seed of 1
seed=1
skip_data=0
THRESHOLD=0.635

if [ $# -eq 0 ]
then
    echo "nvprof disabled"
    COMMAND="python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers 256 256 128 64 -f 64 --seed $seed \
    --threshold $THRESHOLD --processes 1 --workers 0"

elif [ "$1" == "--profile" ]
then
    echo "nvprof is profiling compute utilization"
    COMMAND="nvprof --profile-from-start off --export-profile nvprof_data/compute_utilization%p.nvvp --print-summary \
    python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers 256 256 128 64 -f 64 --seed $seed \
    --threshold $THRESHOLD --processes 1 --workers 0 --profile"

elif [ "$1" == "--profile-fp32" ]
then
    echo "nvprof is profiling fp32 utilization"
    COMMAND="nvprof --profile-from-start off --metrics single_precision_fu_utilization --export-profile \
    nvprof_data/fp32_utilization%p.nvvp --print-summary python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers \
    256 256 128 64 -f 64 --seed $seed --threshold $THRESHOLD --processes 1 --workers 0 --profile"

else
    echo "Invalid input argument. Valid ones are --profile --profile-fp32."
    # `return` (not `exit`) because this script is sourced; shell return
    # codes must be 0-255, so use 1 rather than the original -1.
    return 1
fi

# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

echo "unzip ml-20m.zip"
if unzip ml-20m.zip
then
    if [ $skip_data -eq 0 ]
    then
        echo "Start processing ml-20m/ratings.csv"
        t0=$(date +%s)
        python ./source/convert.py ml-20m/ratings.csv ml-20m --negatives 999
        t1=$(date +%s)
        delta=$(( $t1 - $t0 ))
        echo "Finish processing ml-20m/ratings.csv in $delta seconds"
    else
        echo "Skipped data processing"
    fi

    echo "Start training"
    t0=$(date +%s)
    $COMMAND
    t1=$(date +%s)
    delta=$(( $t1 - $t0 ))
    echo "Finish training in $delta seconds"

    # end timing
    end=$(date +%s)
    end_fmt=$(date +%Y-%m-%d\ %r)
    echo "ENDING TIMING RUN AT $end_fmt"

    # report result
    result=$(( $end - $start ))
    result_name="recommendation"

    echo "RESULT,$result_name,$seed,$result,$USER,$start_fmt"
else
    # Fixed typos in the original messages: "ml-20.zip" -> "ml-20m.zip",
    # "verify_datset.sh" -> "verify_dataset.sh".
    echo "Problem unzipping ml-20m.zip"
    echo "Please run 'download_data.sh && verify_dataset.sh' first"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import os | ||
from argparse import ArgumentParser | ||
from collections import defaultdict | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from tqdm import tqdm | ||
|
||
from load import implicit_load | ||
|
||
|
||
# Users with fewer than this many ratings are dropped before splitting.
MIN_RATINGS = 20


# Canonical column names produced by the loaders in load.py.
USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'


# Output filenames written into the target directory by main().
TRAIN_RATINGS_FILENAME = 'train-ratings.csv'
TEST_RATINGS_FILENAME = 'test-ratings.csv'
TEST_NEG_FILENAME = 'test-negative.csv'
|
||
|
||
def parse_args():
    """Parse command-line arguments for the MovieLens conversion script.

    Returns an argparse.Namespace with: path (input ratings CSV),
    output (target directory), negatives (per-user negative sample
    count, default 999) and seed (RNG seed, default 0).
    """
    parser = ArgumentParser()
    parser.add_argument('path', type=str,
                        help='Path to reviews CSV file from MovieLens')
    parser.add_argument('output', type=str,
                        help='Output directory for train and test CSV files')
    # Fixed missing space: the original implicit concatenation produced
    # "positivetest example" in --help output.
    parser.add_argument('-n', '--negatives', type=int, default=999,
                        help='Number of negative samples for each positive '
                             'test example')
    parser.add_argument('-s', '--seed', type=int, default=0,
                        help='Random seed to reproduce same negative samples')
    return parser.parse_args()
|
||
|
||
def main():
    """Convert raw MovieLens ratings into NCF train/test/negative files.

    Reads the CSV at args.path, drops users with fewer than MIN_RATINGS
    ratings, remaps user/item IDs to dense 0-based ranges, holds out each
    user's chronologically last rating as the test positive, samples
    args.negatives negative items per user, and writes three header-less
    tab-separated files into args.output.
    """
    args = parse_args()
    np.random.seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)
    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    # IDs are now dense: 0..len-1 with no gaps.
    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
    print("Generating {} negative samples for each user"
          .format(args.negatives))
    for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)):  # noqa: E501
        # Hold out the most recent item as the test positive.
        test_item = user_to_items[user].pop()

        all_ratings.remove((user, test_item))
        # NOTE(review): test_item was just popped from user_to_items, so it
        # stays in all_negs and can be drawn as a "negative" — confirm this
        # matches the reference NCF sampling before changing it.
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism

        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, args.negatives)))

    print("Saving train and test CSV files to {}".format(args.output))
    # All remaining interactions become implicit positives (rating == 1).
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')
|
||
|
||
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import numpy as np | ||
import scipy | ||
import scipy.sparse | ||
import torch | ||
import torch.utils.data | ||
|
||
|
||
class CFTrainDataset(torch.utils.data.dataset.Dataset):
    """Implicit-feedback training set with on-the-fly negative sampling.

    Loads "user<TAB>item<TAB>rating" lines from ``train_fname``; each
    stored positive pair is followed by ``nb_neg`` negatives drawn
    uniformly from the items the user never interacted with.
    """

    def __init__(self, train_fname, nb_neg):
        self._load_train_matrix(train_fname)
        self.nb_neg = nb_neg

    def _load_train_matrix(self, train_fname):
        # Parse one tab-separated line into [user, item, is_positive].
        def process_line(line):
            fields = line.split('\t')
            return [int(fields[0]), int(fields[1]), float(fields[2]) > 0]

        with open(train_fname, 'r') as file:
            data = [process_line(raw) for raw in file]
        # IDs are 0-based, so the counts are max id + 1.
        self.nb_users = 1 + max(entry[0] for entry in data)
        self.nb_items = 1 + max(entry[1] for entry in data)

        # Keep only the positive interactions as sampleable examples,
        # but record every (user, item) pair for negative rejection.
        self.data = [entry for entry in data if entry[2]]
        self.mat = scipy.sparse.dok_matrix(
            (self.nb_users, self.nb_items), dtype=np.float32)
        for user, item, _ in data:
            self.mat[user, item] = 1.

    def __len__(self):
        # One positive plus nb_neg negatives per stored rating.
        return (self.nb_neg + 1) * len(self.data)

    def __getitem__(self, idx):
        pos_idx, offset = divmod(idx, self.nb_neg + 1)
        user = self.data[pos_idx][0]
        if offset == 0:
            # The positive example itself, labelled 1.
            return user, self.data[pos_idx][1], np.ones(1, dtype=np.float32)
        # Rejection-sample an item this user never rated, labelled 0.
        neg = torch.LongTensor(1).random_(0, self.nb_items).item()
        while (user, neg) in self.mat:
            neg = torch.LongTensor(1).random_(0, self.nb_items).item()
        return user, neg, np.zeros(1, dtype=np.float32)
|
||
|
||
def load_test_ratings(fname):
    """Load test ratings from a tab-separated file.

    Each line is "user<TAB>item[<TAB>...]"; columns beyond the first two
    are ignored. Returns a list of [user, item] int pairs.
    """
    def process_line(line):
        return [int(field) for field in line.split('\t')[0:2]]

    # Use a context manager: the original opened the file without ever
    # closing it, leaking the handle until garbage collection.
    with open(fname, 'r') as file:
        return [process_line(line) for line in file]
|
||
|
||
def load_test_negs(fname):
    """Load negative test samples from a tab-separated file.

    Each line holds the negative item ids for one user. Returns a list
    of int lists, one per line.
    """
    def process_line(line):
        return [int(field) for field in line.split('\t')]

    # Use a context manager: the original opened the file without ever
    # closing it, leaking the handle until garbage collection.
    with open(fname, 'r') as file:
        return [process_line(line) for line in file]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
from collections import namedtuple | ||
|
||
import pandas as pd | ||
|
||
|
||
# Summary statistics for one ratings frame.
RatingData = namedtuple('RatingData',
                        ['items', 'users', 'ratings', 'min_date', 'max_date'])


def describe_ratings(ratings):
    """Print a one-line summary of *ratings* and return it as RatingData.

    Expects a frame with 'item_id', 'user_id' and 'timestamp' columns.
    """
    summary = RatingData(
        items=len(ratings['item_id'].unique()),
        users=len(ratings['user_id'].unique()),
        ratings=len(ratings),
        min_date=ratings['timestamp'].min(),
        max_date=ratings['timestamp'].max())
    print("{ratings} ratings on {items} items from {users} users"
          " from {min_date} to {max_date}"
          .format(**summary._asdict()))
    return summary
|
||
|
||
def process_movielens(ratings, sort=True):
    """Normalise a MovieLens ratings frame in place and return it.

    Converts the epoch-seconds 'timestamp' column to datetimes, sorts by
    time when *sort* is true, and prints a summary via describe_ratings.
    """
    ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
    if sort:
        ratings.sort_values(by='timestamp', inplace=True)
    describe_ratings(ratings)
    return ratings
|
||
|
||
def load_ml_100k(filename, sort=True):
    """Load the ML-100k tab-separated ratings file (u.data format)."""
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    raw = pd.read_csv(filename, sep='\t', names=column_names)
    return process_movielens(raw, sort=sort)
|
||
|
||
def load_ml_1m(filename, sort=True):
    """Load the ML-1m '::'-separated ratings file (ratings.dat format)."""
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    raw = pd.read_csv(filename, sep='::', names=column_names, engine='python')
    return process_movielens(raw, sort=sort)
|
||
|
||
def load_ml_10m(filename, sort=True):
    """Load the ML-10m '::'-separated ratings file (ratings.dat format)."""
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    raw = pd.read_csv(filename, sep='::', names=column_names, engine='python')
    return process_movielens(raw, sort=sort)
|
||
|
||
def load_ml_20m(filename, sort=True):
    """Load the ML-20m ratings.csv file and normalise its column names."""
    ratings = pd.read_csv(
        filename, dtype={'userId': int, 'movieId': int, 'timestamp': int})
    ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
    # Rename the 20m-specific headers to the canonical column names.
    ratings.rename(
        columns={'userId': 'user_id', 'movieId': 'item_id'}, inplace=True)
    return process_movielens(ratings, sort=sort)
|
||
|
||
# Dataset keys derived from the load_* loaders defined above (e.g.
# 'ml_20m'); used by get_dataset_name / implicit_load for dispatch.
DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k]
|
||
|
||
def get_dataset_name(filename):
    """Return the DATASETS key whose name appears in *filename*.

    Matching is case-insensitive and treats '-' as '_'. Raises
    NotImplementedError when no known dataset name matches.
    """
    normalised = filename.replace('-', '_').lower()
    for candidate in DATASETS:
        if candidate in normalised:
            return candidate
    raise NotImplementedError
|
||
|
||
def implicit_load(filename, sort=True):
    """Load a ratings file by dispatching to the matching load_* function
    in this module, chosen from the dataset name embedded in *filename*."""
    func = globals()["load_" + get_dataset_name(filename)]
    return func(filename, sort=sort)
Oops, something went wrong.