In [1]:
import torch
import random
import pandas as pd
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset

random.seed(0)


class UserItemRatingDataset(Dataset):
    """Wrapper, convert <user, item, rating> Tensor into Pytorch Dataset"""
    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:

            target_tensor: torch.Tensor, the corresponding rating for <user, item> pair
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        return self.user_tensor[index], self.item_tensor[index], self.target_tensor[index]

    def __len__(self):
        return self.user_tensor.size(0)


class SampleGenerator(object):
    """Construct dataset for NCF"""

    def __init__(self, ratings):
        """
        args:
            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
        """
        assert 'userId' in ratings.columns
        assert 'itemId' in ratings.columns
        assert 'rating' in ratings.columns

        self.ratings = ratings
        # explicit feedback using _normalize and implicit using _binarize
        # self.preprocess_ratings = self._normalize(ratings)
        self.preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings['userId'].unique())
        self.item_pool = set(self.ratings['itemId'].unique())
        # create negative item samples for NCF learning
        self.negatives = self._sample_negative(ratings)
        self.train_ratings, self.test_ratings = self._split_loo(self.preprocess_ratings)
        
        file_train = "train_ratings"
        train_rating_binary = self.train_ratings[['userId', 'itemId']]
        # print(train_rating_binary)
        train_rating_binary.to_csv(file_train, header=None, index=None)

    def _normalize(self, ratings):
        """normalize into [0, 1] from [0, max_rating], explicit feedback"""
        ratings = deepcopy(ratings)
        max_rating = ratings.rating.max()
        ratings['rating'] = ratings.rating * 1.0 / max_rating
        return ratings
    
    def _binarize(self, ratings):
        """binarize into 0 or 1, implicit feedback"""
        ratings = deepcopy(ratings)
        ratings['rating'][ratings['rating'] > 0] = 1  #和下面的LOO相关
        return ratings

    def _split_loo(self, ratings):
        """leave one out train/test split """
        ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
        test = ratings[ratings['rank_latest'] == 1] # explicit时 test loo 选择评分最高的那一个gt : 当负采样采用的是1为阈值时，最好在这里随机取，如果采用4为阈值，LOO的评分越高越好
        train = ratings[ratings['rank_latest'] > 1]
        assert train['userId'].nunique() == test['userId'].nunique()
        return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]

    def _sample_negative(self, ratings):
        """return all negative items & 100 sampled negative items"""
        interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
            columns={'itemId': 'interacted_items'})
        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, 99))
        return interact_status[['userId', 'negative_items', 'negative_samples']]
    
    

    def instance_a_train_loader(self, num_negatives, batch_size):
        """instance train loader for one training epoch"""
        users, items, ratings = [], [], []
        train_ratings = pd.merge(self.train_ratings, self.negatives[['userId', 'negative_items']], on='userId')
        train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(x, num_negatives))
        for row in train_ratings.itertuples():
            users.append(int(row.userId))
            items.append(int(row.itemId))
            ratings.append(float(row.rating))
            for i in range(num_negatives):
                users.append(int(row.userId))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get 0 rating
        dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
                                        item_tensor=torch.LongTensor(items),
                                        target_tensor=torch.FloatTensor(ratings))
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    @property
    def evaluate_data(self):
        """create evaluate data"""
        file_name = 'test_negative_ratings'
        test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
        test_users, test_items, negative_users, negative_items = [], [], [], []
        with open(file_name, 'w') as f:
            for row in test_ratings.itertuples():
                test_users.append(int(row.userId))
                test_items.append(int(row.itemId))
                t_rating = (int(row.userId), int(row.itemId))
                f.write(str(t_rating))
                for i in range(len(row.negative_samples)):
                    negative_users.append(int(row.userId))
                    negative_items.append(int(row.negative_samples[i]))
                    f.write('\t')
                    f.write(str(row.negative_samples[i]))
                f.write('\n')
        return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
                torch.LongTensor(negative_items)]


In [2]:
import pandas as pd
import numpy as np

ml1m_dir = 'ratings.csv'
ml1m_rating = pd.read_csv(ml1m_dir)
# ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'],  engine='python')
ml1m_rating.columns = ['userId', 'itemId', 'rating', 'timestamp']
print(ml1m_rating)

        userId  itemId  rating  timestamp
0            0       0       5  978300760
1            0       1       3  978302109
2            0       2       3  978301968
3            0       3       4  978300275
4            0       4       5  978824291
...        ...     ...     ...        ...
999606    6039    2893       5  956715569
999607    6039    2937       1  956716438
999608    6039    3018       5  956704305
999609    6039    3022       3  960971992
999610    6039    3344       5  956704191

[999611 rows x 4 columns]


In [14]:
user_id = ml1m_rating[['uid']].drop_duplicates().reindex()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')  # merge 是很好的借助pandas进行index的方法
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))

Range of userId is [0, 6039]
Range of itemId is [0, 3705]


In [3]:
print(min(ml1m_rating.groupby('itemId').agg('count').userId))
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data
print(sample_generator.train_ratings)
print(type(sample_generator.evaluate_data))
# print('user:', min(ml1m_rating.groupby('userId').agg('count').userId))
# print('item:', min(ml1m_rating.groupby('itemId').agg('count').itemId))

5
        userId  itemId  rating
0            0       0       1
1            0       1       1
2            0       2       1
3            0       3       1
4            0       4       1
...        ...     ...     ...
999606    6039    2893       1
999607    6039    2937       1
999608    6039    3018       1
999609    6039    3022       1
999610    6039    3344       1

[993571 rows x 3 columns]
<class 'list'>


In [None]:
# 随机将整个数据集划分为三部分，用于target, shadow, 和 vectorization, 满足MIA_rs的设定，需要保证三个数据集的item集合是一致的
from tqdm import tqdm
np.random.seed(0)
train_df = pd.DataFrame([], columns=['uid', 'iid', 'rating'])
test_df = pd.DataFrame([], columns=['uid', 'iid', 'rating'])
users = pdr['uid'].unique()
for i in tqdm(range(len(users))):
    user_data = pdr[pdr['uid'] == i]
    total = len(user_data)

    n_train = int(total * 0.5)
    train_idx = np.random.choice(total, n_train, replace=False)

    n_test = total - n_train
    test_idx = list(set(np.arange(total)) - set(train_idx))

    user_train = user_data.iloc[np.sort(train_idx), :]
    user_test = user_data.iloc[np.sort(test_idx), :]

    train_df = train_df.append(user_train, ignore_index=True)
    test_df = test_df.append(user_test, ignore_index=True)

train_df['rating'] = np.float16(train_df['rating'])
test_df['rating'] = np.float16(test_df['rating'])
train_df.to_csv('squ0_train_shadow.csv', header=None, index=False)
test_df.to_csv('squ0_test_shadow.csv', header=None, index=False)

In [7]:
import random
user_list = list(np.arange(6040))
member_list = random.sample(user_list, 3020)
member_dict={}
att_list=[]
for x in range(6040):
    if x in member_list:
        member_dict[x] = 1
    else:
        member_dict[x] = 0
    att_list.append(member_dict[x])
np.save('mia/my_file.npy', member_dict)
df = {"userId": user_list, "member": att_list}
df = pd.core.frame.DataFrame(df)
ml1m_rating_pp = pd.merge(ml1m_rating, df, on=['userId'], how='left')

member_pd = ml1m_rating_pp[ml1m_rating_pp.member==1][['userId', 'itemId', 'rating', 'timestamp']]
non_member_pd = ml1m_rating_pp[ml1m_rating_pp.member==0][['userId', 'itemId', 'rating', 'timestamp']]

print(member_pd)

        userId  itemId  rating  timestamp
0            0       0       5  978300760
1            0       1       3  978302109
2            0       2       3  978301968
3            0       3       4  978300275
4            0       4       5  978824291
...        ...     ...     ...        ...
999740    6037     213       2  956707005
999741    6037     243       1  956715051
999742    6037     273       3  956707604
999743    6037     798       3  956706827
999744    6037     183       5  956707547

[502737 rows x 4 columns]


In [6]:
# sample_generator = SampleGenerator(ratings=ml1m_rating)
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data
print(sample_generator_mem.train_ratings)
print(type(sample_generator_mem.evaluate_data))

         userId  itemId  rating
0             0       0       1
1             0       1       1
2             0       2       1
3             0       3       1
5             0       5       1
...         ...     ...     ...
1000204    6039     772       1
1000205    6039    1106       1
1000206    6039     365       1
1000207    6039     152       1
1000208    6039      26       1

[994169 rows x 3 columns]
<class 'list'>


In [20]:
file_name = 'mia/test_negative_ratings'
with open(file_name, 'r') as f:
    ll = f.readline()
arr = ll.split('\t')
u = eval(arr[0])[0]

print(eval(arr[0])[1])

25


In [121]:
import pandas as pd
import numpy as np
train_data_a = pd.read_csv(
		"mia/train_ratings", 
		header=None, names=['user', 'item'], 
		 dtype={0: np.int32, 1: np.int32})

train_data_b = pd.read_csv(
		"/home/xcl-python/NCF/Data/ml-1m.train.rating", 
		sep='\t', header=None, names=['user', 'item'], 
		usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

set_a = set(train_data_a[train_data_a.user==1]['item'].unique())
set_b = set(train_data_b[train_data_b.user==1]['item'].unique())
print(set_a)
print(set_b)
# sum = 0
# sum_a = 0
# sum_b = 0
# for i in range(6040):
#     set_a = set(train_data_a[train_data_a.user==i]['item'].unique())
#     set_b = set(train_data_b[train_data_b.user==i]['item'].unique())
#     sum += len(set_a) - len(set_b)
#     sum_a += len(set_a)
#     sum_b += len(set_b)
# print(sum)
# print(sum_a)
# print(sum_b)

{0, 18, 20, 42, 47, 48, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174}
{0, 18, 20, 42, 47, 48, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140

In [58]:
train_data = pd.read_csv(
		"/home/xcl-python/NCF/Data/ml-1m.train.rating", 
		sep='\t', header=None, names=['user', 'item'], 
		usecols=[0, 1], dtype={0: np.int32, 1: np.int32})
set_b = set(train_data[train_data.user==0]['item'].unique())
print(len(set_a - set_b))

0


In [7]:
evaluate_data = sample_generator.evaluate_data
print(type(evaluate_data[0]))

<class 'torch.Tensor'>
