adding Recommendation model
izaakniksan committed Nov 13, 2018
1 parent c8bfa39 commit 9a75513
Showing 13 changed files with 870 additions and 0 deletions.
14 changes: 14 additions & 0 deletions Recommendation-NCF/PyTorch/.gitignore
@@ -0,0 +1,14 @@
#data:
ml-20m.zip
*/ml-20m/*
*.csv
#IDE folder:
*/.idea/*
__pycache__
#nvprof output
*my_nvprof_output_*
#run
*/run/*
#nvprof
nvprof_data/*
!nvprof_data/.gitkeep
16 changes: 16 additions & 0 deletions Recommendation-NCF/PyTorch/dataset/download_dataset.sh
@@ -0,0 +1,16 @@
function download_20m {
echo "Download ml-20m"
curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
}

function download_1m {
echo "Downloading ml-1m"
curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
}

if [[ $1 == "ml-1m" ]]
then
download_1m
else
download_20m
fi
44 changes: 44 additions & 0 deletions Recommendation-NCF/PyTorch/dataset/verify_dataset.sh
@@ -0,0 +1,44 @@
function get_checker {
if [[ "$OSTYPE" == "darwin"* ]]; then
checkmd5=md5
else
checkmd5=md5sum
fi

echo $checkmd5
}


function verify_1m {
    # Expected digest from: curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip.md5
    local hash="c4d9eecfca2ab87c1945afe126590906"
    local checkmd5=$(get_checker)

    # md5 (macOS) and md5sum (Linux) format their output differently, so
    # match on the digest itself rather than diffing whole output lines.
    if $checkmd5 ml-1m.zip | grep -q "$hash"
    then
        echo "PASSED"
    else
        echo "FAILED"
    fi
}

function verify_20m {
    # Expected digest from: curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip.md5
    local hash="cd245b17a1ae2cc31bb14903e1204af3"
    local checkmd5=$(get_checker)

    if $checkmd5 ml-20m.zip | grep -q "$hash"
    then
        echo "PASSED"
    else
        echo "FAILED"
    fi
}


if [[ $1 == "ml-1m" ]]
then
verify_1m
else
verify_20m
fi
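
For reference, the same integrity check can be written portably in Python. This is a minimal sketch, not part of the commit, reusing the digests quoted in the script above:

import hashlib
import sys

# Expected MD5 digests, as quoted in verify_dataset.sh
EXPECTED = {
    'ml-1m.zip': 'c4d9eecfca2ab87c1945afe126590906',
    'ml-20m.zip': 'cd245b17a1ae2cc31bb14903e1204af3',
}

def md5sum(path, chunk_size=1 << 20):
    # Hash incrementally so the multi-hundred-MB zip never sits in memory
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

if __name__ == '__main__':
    name = sys.argv[1] if len(sys.argv) > 1 else 'ml-20m.zip'
    print('PASSED' if md5sum(name) == EXPECTED[name] else 'FAILED')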
1 change: 1 addition & 0 deletions Recommendation-NCF/PyTorch/nvprof_data/.gitkeep
@@ -0,0 +1 @@
Placeholder file to commit empty folder
84 changes: 84 additions & 0 deletions Recommendation-NCF/PyTorch/scripts/benchmark-ncf.sh
@@ -0,0 +1,84 @@
#!/bin/bash
# source scripts/benchmark-ncf.sh # Train
# source scripts/benchmark-ncf.sh --profile # Profile for compute utilization
# source scripts/benchmark-ncf.sh --profile-fp32 # Profile for fp32 utilization

# Using a seed of 1
seed=1
skip_data=0
THRESHOLD=0.635

if [ $# -eq 0 ]
then
echo "nvprof disabled"
COMMAND="python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers 256 256 128 64 -f 64 --seed $seed \
--threshold $THRESHOLD --processes 1 --workers 0"

elif [ "$1" == "--profile" ]
then
echo "nvprof is profiling compute utilization"
COMMAND="nvprof --profile-from-start off --export-profile nvprof_data/compute_utilization%p.nvvp --print-summary \
python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers 256 256 128 64 -f 64 --seed $seed \
--threshold $THRESHOLD --processes 1 --workers 0 --profile"

elif [ "$1" == "--profile-fp32" ]
then
echo "nvprof is profiling fp32 utilization"
COMMAND="nvprof --profile-from-start off --metrics single_precision_fu_utilization --export-profile \
nvprof_data/fp32_utilization%p.nvvp --print-summary python ./source/ncf.py ml-20m -l 0.0005 -b 2048 --layers \
256 256 128 64 -f 64 --seed $seed --threshold $THRESHOLD --processes 1 --workers 0 --profile"

else
    echo "Invalid input argument. Valid options are --profile and --profile-fp32."
    return 1
fi


# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

echo "unzip ml-20m.zip"
if unzip ml-20m.zip
then
if [ $skip_data -eq 0 ]
then
echo "Start processing ml-20m/ratings.csv"
t0=$(date +%s)
python ./source/convert.py ml-20m/ratings.csv ml-20m --negatives 999
t1=$(date +%s)
delta=$(( $t1 - $t0 ))
echo "Finish processing ml-20m/ratings.csv in $delta seconds"
else
echo "Skipped data processing"
fi

echo "Start training"
t0=$(date +%s)
$COMMAND
t1=$(date +%s)
delta=$(( $t1 - $t0 ))
echo "Finish training in $delta seconds"

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"


# report result
result=$(( $end - $start ))
result_name="recommendation"


echo "RESULT,$result_name,$seed,$result,$USER,$start_fmt"
else
    echo "Problem unzipping ml-20m.zip"
    echo "Please run 'download_dataset.sh && verify_dataset.sh' first"
fi
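
ncf.py itself is not among the files shown here, but the flags hint at the model shape: -f 64 predictive factors and an MLP tower of widths 256 256 128 64. As a rough sketch only, following the standard NCF/NeuMF design rather than this repo's actual code (all names below are assumptions):

import torch
import torch.nn as nn

class NeuMF(nn.Module):
    # Hypothetical sketch: a GMF branch plus an MLP tower, fused by one linear layer
    def __init__(self, nb_users, nb_items, mf_dim=64,
                 mlp_layer_sizes=(256, 256, 128, 64)):
        super().__init__()
        self.mf_user = nn.Embedding(nb_users, mf_dim)
        self.mf_item = nn.Embedding(nb_items, mf_dim)
        # The first MLP layer consumes concatenated user/item embeddings
        self.mlp_user = nn.Embedding(nb_users, mlp_layer_sizes[0] // 2)
        self.mlp_item = nn.Embedding(nb_items, mlp_layer_sizes[0] // 2)
        layers = []
        for n_in, n_out in zip(mlp_layer_sizes[:-1], mlp_layer_sizes[1:]):
            layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.mlp = nn.Sequential(*layers)
        self.final = nn.Linear(mf_dim + mlp_layer_sizes[-1], 1)

    def forward(self, user, item):
        gmf = self.mf_user(user) * self.mf_item(item)  # element-wise GMF branch
        mlp = self.mlp(torch.cat((self.mlp_user(user), self.mlp_item(item)), dim=-1))
        return torch.sigmoid(self.final(torch.cat((gmf, mlp), dim=-1)))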
101 changes: 101 additions & 0 deletions Recommendation-NCF/PyTorch/source/convert.py
@@ -0,0 +1,101 @@
import os
from argparse import ArgumentParser
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from load import implicit_load


MIN_RATINGS = 20


USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'


TRAIN_RATINGS_FILENAME = 'train-ratings.csv'
TEST_RATINGS_FILENAME = 'test-ratings.csv'
TEST_NEG_FILENAME = 'test-negative.csv'


def parse_args():
parser = ArgumentParser()
parser.add_argument('path', type=str,
help='Path to reviews CSV file from MovieLens')
parser.add_argument('output', type=str,
help='Output directory for train and test CSV files')
    parser.add_argument('-n', '--negatives', type=int, default=999,
                        help='Number of negative samples for each positive'
                             ' test example')
parser.add_argument('-s', '--seed', type=int, default=0,
help='Random seed to reproduce same negative samples')
return parser.parse_args()


def main():
args = parse_args()
np.random.seed(args.seed)

print("Loading raw data from {}".format(args.path))
df = implicit_load(args.path, sort=False)
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

print("Mapping original user and item IDs to new sequential IDs")
original_users = df[USER_COLUMN].unique()
original_items = df[ITEM_COLUMN].unique()

user_map = {user: index for index, user in enumerate(original_users)}
item_map = {item: index for index, item in enumerate(original_items)}

df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

assert df[USER_COLUMN].max() == len(original_users) - 1
assert df[ITEM_COLUMN].max() == len(original_items) - 1

print("Creating list of items for each user")
# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)
all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
user_to_items = defaultdict(list)
for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN)) # noqa: E501

test_ratings = []
test_negs = []
all_items = set(range(len(original_items)))
print("Generating {} negative samples for each user"
.format(args.negatives))
for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)): # noqa: E501
test_item = user_to_items[user].pop()

all_ratings.remove((user, test_item))
all_negs = all_items - set(user_to_items[user])
all_negs = sorted(list(all_negs)) # determinism

test_ratings.append((user, test_item))
test_negs.append(list(np.random.choice(all_negs, args.negatives)))

print("Saving train and test CSV files to {}".format(args.output))
df_train_ratings = pd.DataFrame(list(all_ratings))
df_train_ratings['fake_rating'] = 1
df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
index=False, header=False, sep='\t')

df_test_ratings = pd.DataFrame(test_ratings)
df_test_ratings['fake_rating'] = 1
df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
index=False, header=False, sep='\t')

df_test_negs = pd.DataFrame(test_negs)
df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
index=False, header=False, sep='\t')


if __name__ == '__main__':
main()
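
All three outputs are headerless, tab-separated files. A quick way to sanity-check the format after a run (a sketch; the paths assume the ml-20m output directory used by benchmark-ncf.sh):

import pandas as pd

# train-ratings.csv: one user_id \t item_id \t fake_rating triple per line
train = pd.read_csv('ml-20m/train-ratings.csv', sep='\t', header=None,
                    names=['user_id', 'item_id', 'fake_rating'])

# test-negative.csv: one row per user holding the 999 sampled negative item ids
negs = pd.read_csv('ml-20m/test-negative.csv', sep='\t', header=None)

print(train.head())
print(negs.shape)  # expect (number of users, 999)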
57 changes: 57 additions & 0 deletions Recommendation-NCF/PyTorch/source/dataset.py
@@ -0,0 +1,57 @@
import numpy as np
import scipy
import scipy.sparse
import torch
import torch.utils.data


class CFTrainDataset(torch.utils.data.dataset.Dataset):
def __init__(self, train_fname, nb_neg):
self._load_train_matrix(train_fname)
self.nb_neg = nb_neg

def _load_train_matrix(self, train_fname):
def process_line(line):
tmp = line.split('\t')
return [int(tmp[0]), int(tmp[1]), float(tmp[2]) > 0]
with open(train_fname, 'r') as file:
data = list(map(process_line, file))
self.nb_users = max(data, key=lambda x: x[0])[0] + 1
self.nb_items = max(data, key=lambda x: x[1])[1] + 1

self.data = list(filter(lambda x: x[2], data))
self.mat = scipy.sparse.dok_matrix(
(self.nb_users, self.nb_items), dtype=np.float32)
for user, item, _ in data:
self.mat[user, item] = 1.

def __len__(self):
return (self.nb_neg + 1) * len(self.data)

    def __getitem__(self, idx):
        # Indices interleave positives and negatives: every (nb_neg + 1)-th
        # index returns a real rating, the rest return sampled negatives for
        # the same training position.
        if idx % (self.nb_neg + 1) == 0:
            idx = idx // (self.nb_neg + 1)
            return self.data[idx][0], self.data[idx][1], np.ones(1, dtype=np.float32)  # noqa: E501
        else:
            idx = idx // (self.nb_neg + 1)
            u = self.data[idx][0]
            # Rejection-sample until we draw an item the user has not rated
            j = torch.LongTensor(1).random_(0, self.nb_items).item()
            while (u, j) in self.mat:
                j = torch.LongTensor(1).random_(0, self.nb_items).item()
            return u, j, np.zeros(1, dtype=np.float32)


def load_test_ratings(fname):
def process_line(line):
tmp = map(int, line.split('\t')[0:2])
return list(tmp)
ratings = map(process_line, open(fname, 'r'))
return list(ratings)


def load_test_negs(fname):
def process_line(line):
tmp = map(int, line.split('\t'))
return list(tmp)
negs = map(process_line, open(fname, 'r'))
return list(negs)
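
A minimal usage sketch (path, nb_neg, and batch size are placeholders): the dataset serves positives and sampled negatives pre-mixed, so no separate negative file is needed during training.

import torch.utils.data

# With nb_neg=4, __len__ reports (4 + 1) * number-of-positives examples
train_dataset = CFTrainDataset('ml-20m/train-ratings.csv', nb_neg=4)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2048, shuffle=True, num_workers=0)

for user, item, label in train_loader:
    # user and item collate to LongTensors of ids; label is 1.0 for
    # positives and 0.0 for sampled negatives, with shape (batch, 1)
    break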
68 changes: 68 additions & 0 deletions Recommendation-NCF/PyTorch/source/load.py
@@ -0,0 +1,68 @@
from collections import namedtuple

import pandas as pd


RatingData = namedtuple('RatingData',
['items', 'users', 'ratings', 'min_date', 'max_date'])


def describe_ratings(ratings):
info = RatingData(items=len(ratings['item_id'].unique()),
users=len(ratings['user_id'].unique()),
ratings=len(ratings),
min_date=ratings['timestamp'].min(),
max_date=ratings['timestamp'].max())
print("{ratings} ratings on {items} items from {users} users"
" from {min_date} to {max_date}"
.format(**(info._asdict())))
return info


def process_movielens(ratings, sort=True):
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
if sort:
ratings.sort_values(by='timestamp', inplace=True)
describe_ratings(ratings)
return ratings


def load_ml_100k(filename, sort=True):
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(filename, sep='\t', names=names)
return process_movielens(ratings, sort=sort)


def load_ml_1m(filename, sort=True):
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
return process_movielens(ratings, sort=sort)


def load_ml_10m(filename, sort=True):
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(filename, sep='::', names=names, engine='python')
return process_movielens(ratings, sort=sort)


def load_ml_20m(filename, sort=True):
ratings = pd.read_csv(filename, dtype={'userId': int, 'movieId': int, 'timestamp': int})
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
names = {'userId': 'user_id', 'movieId': 'item_id'}
ratings.rename(columns=names, inplace=True)
return process_movielens(ratings, sort=sort)


DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k]


def get_dataset_name(filename):
for dataset in DATASETS:
if dataset in filename.replace('-', '_').lower():
return dataset
raise NotImplementedError


def implicit_load(filename, sort=True):
func = globals()["load_" + get_dataset_name(filename)]
return func(filename, sort=sort)
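
Dispatch is name-based: get_dataset_name searches the path for a known dataset name, so a call like the following (the path is a placeholder) routes to load_ml_20m:

# 'ml-20m' in the path selects load_ml_20m via the DATASETS list
ratings = implicit_load('ml-20m/ratings.csv', sort=False)
print(ratings.columns.tolist())  # ['user_id', 'item_id', 'rating', 'timestamp']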