In [44]:
# 标准库
import argparse
from time import time
import sys

# 第三方库
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np

# 自定义库
from evaluate import evaluate_model
import utils
from dataset import Dataset


In [45]:

# 是否激活cuda
if torch.cuda.is_available():
    print(torch.cuda.is_available())
    torch.backends.cudnn.deterministic = True
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class DMF(nn.Module):


    def __init__(self, num_users, num_items, layers, dataset):
        super(DMF, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.latent_dim = layers[0]
        self.layers = layers

        self.user_item_indices = torch.LongTensor(np.array([dataset.user_indices, dataset.item_incides]))
        self.rating_data = torch.FloatTensor(dataset.rating_data)
        self.user_item_matrix = torch.sparse_coo_tensor(self.user_item_indices, self.rating_data,
                                                        torch.Size((self.num_users, self.num_items))).to_dense().to(device)

        self.linear_user_1 = nn.Linear(in_features=self.num_items, out_features=self.latent_dim)
        self.linear_user_1.weight.detach().normal_(0, 0.01)
        self.linear_item_1 = nn.Linear(in_features=self.num_users, out_features=self.latent_dim)
        self.linear_item_1.weight.detach().normal_(0, 0.01)
        # 想办法添加自编码器

        self.user_fc_layers = nn.ModuleList()
        for idx in range(1, len(self.layers)):
            self.user_fc_layers.append(nn.Linear(in_features=self.layers[idx - 1], out_features=self.layers[idx]))

        self.item_fc_layers = nn.ModuleList()
        for idx in range(1, len(self.layers)):
            self.item_fc_layers.append(nn.Linear(in_features=self.layers[idx - 1], out_features=self.layers[idx]))

    def forward(self, user_indices, item_indices):#user_indices,item_indices代表数据集中的user 和 item 并把rating置为1

        user = self.user_item_matrix[user_indices]
        item = self.user_item_matrix[:, item_indices].t()

        user = self.linear_user_1(user)
        item = self.linear_item_1(item)

        for idx in range(len(self.layers) - 1):
            user = F.relu(user)
            user = self.user_fc_layers[idx](user)

        for idx in range(len(self.layers) - 1):
            item = F.relu(item)
            item = self.item_fc_layers[idx](item)

        vector = torch.cosine_similarity(user, item).view(-1, 1)
        vector = torch.clamp(vector, min=0, max=2)
        # print("模型输出vector的shape：")
        # print(vector.shape)

        return vector


True


In [46]:

#################### Arguments ####################

parser = argparse.ArgumentParser(description="Run Conv1.")
parser.add_argument('--path', nargs='?', default='data/',
                        help='Input data path.')
# parser.add_argument('--path', nargs='?', default='./data/',
#                         help='Input data path.')
# parser.add_argument('--dataset', nargs='?', default='OLIES2011',
#                         help='Choose a dataset.')
# parser.add_argument('--dataset', nargs='?', default='L_DMF_input',
#                         help='Choose a dataset.')
parser.add_argument('--dataset', nargs='?', default='assistment2009',
                        help='Choose a dataset.')
parser.add_argument('--epochs', type=int, default=20,
                        help='Number of epochs.')
parser.add_argument('--batch_size', type=int, default=256,
                        help='Batch size.')
parser.add_argument('--num_factors', type=int, default=64,
                        help='Embedding size.')
parser.add_argument('--layers', nargs='?', default='[64,64]',
                        help="Size of each layer. Note that the first layer is the concatenation of user and item embeddings. So layers[0]/2 is the embedding size.")
parser.add_argument('--reg', type=float, default='0.0',
                        help="Regularization for each layer")
parser.add_argument('--num_neg', type=int, default=1,
                        help='Number of negative instances to pair with a positive instance.')
parser.add_argument('--lr', type=float, default=0.0001,
                        help='Learning rate.')
parser.add_argument('--learner', nargs='?', default='adam',
                        help='Specify an optimizer: adagrad, adam, rmsprop, sgd')
parser.add_argument('--verbose', type=int, default=1,
                        help='Show performance per X iterations')
parser.add_argument('--out', type=int, default=1,
                        help='Whether to save the trained model.')
parser.add_argument('--emlp_pretrain', nargs='?', default='',
                        help='Specify the pretrain model file for MLP part. If empty, no pretrain will be used')
    
# args = parser.parse_args()
args =parser.parse_known_args()[0]
args = parser.parse_args(args=[])
if __name__ == '__main__':


    # settings
    
    path = args.path
    dataset_name = args.dataset
    epochs = args.epochs
    batch_size = args.batch_size
    layers = eval(args.layers)
    latent_dim = layers[0]
    # 
    # print("latent_dim")
    # print(latent_dim)
    reg = args.reg
    learning_rate = args.lr
    num_negative = args.num_neg
    verbose = args.verbose
    out = args.out
    emlp_pretrain = args.emlp_pretrain
    topK = 10
    evaluation_threads = 1  # mp.cpu_count()


In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random

In [48]:
# # 测试集的生成
# data3 = pd.read_csv(r"C:\Users\wangxin\Desktop\DMF-vscode\data\OLIES2011_rating.csv",sep=" ").to_numpy()
# # print(data3)
# stu=0
# a=[]#做过的题目
# b=[]
# test0=[]
# train0=[]
# for i in data3:

    
#     if stu==i[0]:
#         # test0.append(i)
#         a.append(i)             #暂存
#         b=i
#         # stu=i[0]
#     else:
#         # train0.append(i)
#         if len(a)>=2:               #学习记录至少两条的情况
#             c=random.randint(0,len(a)-1)        #随机取出一条
#             test0.append(a[c])              #放到测试集
#             for x in range(len(a)):
#                 if x!=c:
#                     train0.append(a[x])    #遍历剩余的，放到训练集
#                 # else:
#                 #     continue
            
#             a=[]
#             a.append(i) 

#         else:
#             # print(type(i))
#             # # print(type(a))
#             # print(i)
#             # print(a)
#             # print(np.array(a))
#             test0.append(a[0])
#             train0.append(a[0])
#             a=[]
#             a.append(i) 
    
#         stu=i[0]
    
# # print(pd.DataFrame(test0))
# # print(train0)
# train0=pd.DataFrame(train0)
# train0.to_csv('OLIES2011.train.rating',index=False)
# test0=pd.DataFrame(test0)
# test0.to_csv('OLIES2011.test.rating',index=False)

In [49]:
# import tqdm
# data3 =pd.read_csv("OLIES2011_rating.csv",sep=',').to_numpy()
# print(type(data3))
# for i in tqdm.tqdm(data3):

#     print("i[0]:",i[0])

In [50]:
# # 李磊测试集的生成
# import tqdm
# data3 = pd.read_csv("./data/L_DMF_input_correct.rating",sep=',').to_numpy()
# # print(data3)
# stu=1
# a=[]#做过的题目
# b=[]
# test0=[]
# train0=[]
# for i in tqdm.tqdm(data3):

#     # print("i[0]:",i[0])
#     if stu==int(i[0]):
#         # test0.append(i)
#         a.append(i)             #暂存
#         # print("a:",a)
#         b=i
#         # stu=i[0]
#     else:
#         # train0.append(i)
#         # if len(a)>=2:               #学习记录至少两条的情况
#                 #随机取出一条
        
#         # print("len(a):",len(a))
#         # print("c",c)
#         c=random.randint(0,len(a)-1)
#         test0.append(a[c])              #放到测试集
#         for x in range(len(a)):
#             if x!=c:
#                 train0.append(a[x])    #遍历剩余的，放到训练集
#                 # else:
#                 #     continue
            
#         a=[]
#         a.append(i) 

#         # else:
#         #     # print(type(i))
#         #     # # print(type(a))
#         #     # print(i)
#         #     # print(a)
#         #     # print(np.array(a))
#         #     test0.append(a[0])
#         #     train0.append(a[0])
#         #     a=[]
#         #     a.append(i) 
    
#         stu=i[0]
#         # print("stu:",stu)
# x=[]
# y=[]
# for i in data3:
#     if i[0]==6866:
#         x.append(i)
# c=random.randint(0,len(x)-1)        #随机取出一条
#         # print("len(a):",len(a))
#         # print("c",c)
# test0.append(x[c])              #放到测试集
# for j in range(len(x)):
#     if j!=c:
#         train0.append(x[j])
# # print(pd.DataFrame(test0))
# # print(train0)
# train0=pd.DataFrame(train0)
# train0.to_csv('./data/L_DMF_input_correct.train.rating',index=False)
# # train0.to_csv('./data/L_DMF_input.train.csv',index=False)
# test0=pd.DataFrame(test0)
# test0.to_csv('./data/L_DMF_input_correct.test.rating',index=False)

In [51]:
# # 数据集的处理
# data1 = pd.read_csv(r"C:\Users\wangxin\Desktop\DMF-vscode\data\OLIES2011_rating.csv",sep=' ').to_numpy()


# x_train,x_test= train_test_split(data1,test_size=0.05,random_state=0)

# print(x_train)

# oli_train=pd.DataFrame(x_train)
# oli_train.to_csv('OLIES2011.train.rating',index=False)
# oli_test=pd.DataFrame(x_test)
# oli_test.to_csv('OLIES2011.test.rating',index=False)


In [52]:
# # negative的生成
# data2 = pd.read_csv(r"C:\Users\wangxin\Desktop\DMF-vscode\data\OLIES2011_rating.csv",sep=" ").to_numpy()
# # print(data2)
# negative=[]     #没做过的题题目
# old_t=[]        #做过的题目
# stu=0          #学生编号
# all_t=[]        #所有题目
# a=0
# for i in data2:
    
#     all_t.append(i[1])
# print(len(set(all_t)))
# for i in data2:
#     if stu==i[0]:
#         old_t.append(i[1])
#     else:
#         stu=i[0]
#         negative.append(set(all_t)-set(old_t))
#         old_t=[]
        
#         # negative.append(i[1])
#         continue

# negative.append(set(all_t)-set(old_t))    
# print(len(negative))
# negative=pd.DataFrame(negative)
# negative.to_csv("OLIES2011.test.negative",index=False)
# # print(negative.shape)
# # print(negative)
# # negative=set(negative)
# # print(negative)
# # print(old_t)
# # print(len(negative))
# # print(negative)

In [53]:
#李磊数据，negative的生成，全0
# a=[]
# for i in range(1,6866):
#     a.append(" ")
# a=pd.DataFrame(a)
# a.to_csv("./data/L_DMF_input.test.negative",index=False)

In [54]:
# 李磊数据，negative的生成，做错的题
# a=[]
# for i in range()

In [55]:
# negative的生成
# 李磊数据，negative的生成，做错的题
# data2 = pd.read_csv(r"C:\Users\wangxin\Desktop\remmend_DKT_Apriori_DMF\data\L_DMF_input.csv",sep=',').to_numpy()
# # print(data2)
# negative=[]     #做错的题题目
# a=[]
# b=1
# import tqdm

# for i in tqdm.tqdm(data2):
#     if b==i[0]:
#         if i[2]==0:
#             a.append(i[1])
#     else:
#         b=i[0]
#         negative.append(a)
#         a=[]
#         if i[2]==0:
#             a.append(i[1])
        
       
#         continue
# for i in data2:
#     if i[0]==6866:
#         a.append(i[1])
# negative.append(a)
# negative=pd.DataFrame(negative)
# negative.to_csv("./data/L_DMF_input.test.negative",index=False)

In [56]:
# # 全部做对的题目
# data2 = pd.read_csv(r"C:\Users\wangxin\Desktop\remmend_DKT_Apriori_DMF\data\L_DMF_input.csv",sep=",").to_numpy()
# a=[]
# for i in data2:
    
#     if i[2]==1:
#         a.append(i)
# correct=a
# correct=pd.DataFrame(correct)
# correct.to_csv('./data/L_DMF_input_correct.rating',index=False)

In [57]:
# # 生成negative，以所有题目作为候选习题
# a=[]
# b=[]
# for j in range(1,91):
#         a.append(j)
# for i in range(1,6867):
    
#     b.append(a)
# b=pd.DataFrame(b)
# print(b)
# b.to_csv("./data/L_DMF_input.test.negative",index=False)

In [58]:
# from math import fabs
# import tqdm
# import pandas as pd
# import numpy as np
# import random
# data3 = pd.read_csv('./data/assistment2009.csv',sep=',').to_numpy()
# a=0
# b=0
# c=[]
# d=[]
# stu=0
# for i in data3:
#     if stu==i[0]:
#         a=a+1
#         c.append(i)
#     else:
#         # print('a:',a)
#         # print("stu:",stu)
#         # print('i[0]:',i[0])
#         if a>100:
#             b=b+1
#             d.append(c)
#         a=0
#         stu=i[0]
# print('b:',b)
# d=pd.DataFrame(d)
# tqdm.tqdm(d.to_csv('./data/assistment2009_100.csv',index=False))
            

In [59]:
# # 李磊测试集的生成
# import tqdm
# import pandas as pd
# import numpy as np
# import random
# data3 = pd.read_csv(r"C:\Users\wangxin\Desktop\remmend_DKT_Apriori_DMF\data\assistment2009\assisment2009_DMF_input.csv",sep=',').to_numpy()
# # print(sorted(data3, key=lambda x: x[0]))
# data3=sorted(data3, key=lambda x: x[0])
# data3=pd.DataFrame(data3)
# data3.to_csv('./data/assistment2009.csv',index=False)

In [60]:
# # # print(data3)
# data3 = pd.read_csv('./data/assistment2009_ph.csv',sep=',').to_numpy()
# print(data3)
# stu=0
# a=[]#做过的题目
# b=[]
# test0=[]
# train0=[]
# for i in tqdm.tqdm(data3):

#     # print("i[0]:",i[0])
#     if stu==int(i[0]):
#         # test0.append(i)
#         a.append(i)             #暂存
#         # print("a:",a)
#         b=i
#         # stu=i[0]
#     else:
#         # train0.append(i)
#         if len(a)>=2:               #学习记录至少两条的情况
#                 # 随机取出一条
        
#         # print("len(a):",len(a))
#         # print("c",c)
#             c=random.randint(0,len(a)-1)
#             test0.append(a[c])              #放到测试集
#             for x in range(len(a)):
#                 if x!=c:
#                     train0.append(a[x])    #遍历剩余的，放到训练集
#                 # else:
#                 #     continue
            
#         a=[]
#         a.append(i) 

#         # else:
#         #     # print(type(i))
#         #     # # print(type(a))
#         #     # print(i)
#         #     # print(a)
#         #     # print(np.array(a))
#         #     test0.append(a[0])
#         #     train0.append(a[0])
#         #     a=[]
#         #     a.append(i) 
    
#         stu=i[0]
#         # print("stu:",stu)
# x=[]
# y=[]
# for i in data3:
#     if i[0]==4216:
#         x.append(i)
# c=random.randint(0,len(x)-1)        #随机取出一条
#         # print("len(a):",len(a))
#         # print("c",c)
# test0.append(x[c])              #放到测试集
# for j in range(len(x)):
#     if j!=c:
#         train0.append(x[j])
# # print(pd.DataFrame(test0))
# # print(train0)


In [61]:
# # 别忘了改文件位置
# train0=pd.DataFrame(train0)
# train0.to_csv('./data/assistment2009.train.rating',index=False)
# # train0.to_csv('./data/L_DMF_input.train.csv',index=False)
# test0=pd.DataFrame(test0)
# test0.to_csv('./data/assistment2009.test.rating',index=False)

In [62]:
# # # assistment2009,negative的生成
# import tqdm
# import pandas as pd
# import numpy as np
# data2 = pd.read_csv('./data/assistment2009_ph.csv',sep=',').to_numpy()
# negative=[]     #没做过的题题目
# old_t=[]        #做过的题目
# stu=0          #学生编号
# all_t=[]        #所有题目
# a=0
# for i in tqdm.tqdm(data2):
    
#     all_t.append(i[1])
# print(len(set(all_t)))
# for i in tqdm.tqdm(data2):
#     if stu==i[0]:
#         old_t.append(i[1])
#     else:
#         stu=i[0]
#         negative.append(set(all_t)-set(old_t))
#         old_t=[]
        
#         # negative.append(i[1])
#         continue

# negative.append(set(all_t)-set(old_t))    
# print(len(negative))
# negative=pd.DataFrame(negative)
# negative.to_csv("./data/assistment2009_all.test.negative",index=False)
# # print(negative.shape)
# # print(negative)
# # negative=set(negative)
# # print(negative)
# # print(old_t)
# # print(len(negative))
# # print(negative)

In [63]:
# dataset.py

import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd

class Dataset(object):

    def __init__(self, path):
        self.train_ratings, self.train_num_users, self.train_num_items = self.load_train_rating_file_as_list(path + ".train.rating")
        self.test_ratings, self.test_num_users, self.test_num_items = self.load_test_rating_file_as_list(path + ".test.rating")
        self.num_users = max(self.train_num_users, self.test_num_users)
        self.num_items = max(self.train_num_items, self.test_num_items)
        self.test_negative = self.load_negative_file(path + ".test.negative")
        self.user_item_rating_indices = self.get_user_item_matrix_indices()
        self.user_indices, self.item_incides, self.rating_data = self.user_item_rating_indices
        # print("len(self.test_ratings)")
        # print(len(self.test_ratings))
        # print("len(self.test_negative)")
        # print(len(self.test_negative))
        assert len(self.test_ratings) == len(self.test_negative)
        self.train_dict = self.get_train_dict()

    # def load_test_rating_file_as_list(self, filename):
    #     test_ratings = []
    #     num_users, num_items = 0, 0
    #     # with open(filename, "r") as f:
    #     #     line = f.readline()
    #     #     while line != None and line != "":
    #     #         arr = line.split(" ")
    #     #         # 
    #     #         print("arr0:")
    #     #         print(arr)
    #     #         # 
        
    #     a=pd.read_csv(filename,sep=" ").to_numpy()
    #     for i in a:
    #         user, item = int(i[0]), int(i[1])
        
    #         num_users = max(num_users, user)
    #         num_items = max(num_items, item)
    #         test_ratings.append([user, item])
            
    #     test_num_users = num_users + 1
    #     test_num_items = num_items + 1
    #     # print("test_ratings shape:")
    #     # print(len(test_ratings))
    #     print("test_ratings")
        
       
    #     print(test_ratings)
    #     print("test_num_users")
    #     print(test_num_users)
        
    #     print("test_num_items") 
    #     print(test_num_items) 
    #     return test_ratings, test_num_users, test_num_items
    
    # def load_negative_file(self, filename):
    #     negativeList = []
    #     # with open(filename, "r") as f:
    #     #     line = f.readline()
    #     #     while line != None and line != "":
    #     #         arr = line.split(" ")
    #     #         # 
    #     #         print("arr1:")
    #     #         print(arr)
    #     #         # 
    #     # a=pd.read_csv(filename,sep=" ").to_numpy()
    #     # for i in a:
    #     data=[]
    #     with open(filename, 'r',encoding='utf-8-sig') as f_input:
    #         for line in f_input:
    #             data.append(list(line.strip().split(' ')))
    #     print(data)
    #     negatives = []
    #     for x in data[0: ]:
                
    #         negatives.append(int(x))
               
    #         negativeList.append(negatives)
           
    #     # print("negativeList len:")
    #     # print(len(negativeList))
    #     return negativeList
    def load_test_rating_file_as_list(self, filename):
        test_ratings = []
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split()
                user, item = int(arr[0]), int(arr[1])
                num_users = max(num_users, user)
                num_items = max(num_items, item)
                test_ratings.append([user, item])
                line = f.readline()
            test_num_users = num_users + 1
            test_num_items = num_items + 1
        # print("test_ratings shape:")
        # print(len(test_ratings))
            #     # print("test_ratings shape:")
        # print(len(test_ratings))
    #     print("test_ratings")
        
       
    #     print(test_ratings)
    #     print("test_num_users")
    #     print(test_num_users)
        
    #     print("test_num_items") 
    # #     print(test_num_items) 
        return test_ratings, test_num_users, test_num_items
    def load_negative_file(self, filename):
        negativeList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split()
                negatives = []
                # 
                # print(arr)
                for x in arr[1: ]:
                    negatives.append(int(x))
                negativeList.append(negatives)
                line = f.readline()
        # print("negativeList len:")
        # print(len(negativeList))
        return negativeList

    def load_train_rating_file_as_list(self, filename):
        '''
            return: [[user, item, rating]]
        '''
        # Get number of users and items
        num_users, num_items = 0, 0
        # with open(filename, "r") as f:
        #     line = f.readline()
        #     max_items = 0
        #     while line != None and line != "":
        #         arr = line.split(" ")
        #         # 
        #         print("arr2:")
        #         print(arr)
        #         # 
        a=pd.read_csv(filename,sep=" ").to_numpy()
        for i in a:
            u, i = int(i[0]), int(i[1])
            num_users = max(num_users, u)
            num_items = max(num_items, i)
    
        train_num_users = num_users + 1
        train_num_items = num_items + 1
        # Construct matrix
        train_ratings = []
        a=pd.read_csv(filename,sep=" ").to_numpy()
        for i in a:
            user, item, rating = int(i[0]), int(i[1]), i[2]
            train_ratings.append([user, item, rating])
            
            # print("train_ratings:")
            # print(train_ratings)
        return train_ratings, train_num_users, train_num_items

    def get_user_item_matrix_indices(self):
        user_indices, item_indices, ratings = [], [], []
        for i in self.train_ratings:
            user_indices.append(i[0])
            item_indices.append(i[1])
            ratings.append(1)

        return [np.array(user_indices), np.array(item_indices), np.array(ratings)]

    def get_user_item_interact_list(self):
        user_item_interact = []
        user, item, rate = [], [], []
        user_idx = int(0)
        for i in self.train_ratings:
            # print(i[0])
            if user_idx != i[0]:
                user_item_interact.append([user, item, rate])
                user_idx += 1
                user, item, rate = [], [], []
            else:
                user.append(i[0])
                item.append(i[1])
                rate.append(i[2])
            # print("user_item_interact:")
            # print(user_item_interact)
        return user_item_interact

    def get_item_user_interact_list(self):
        item_user_interact = []
        user, item, rate = [], [], []
        item_idx = 0
        for i in self.train_ratings:
            if item_idx != i[1]:
                item_user_interact.append([user, item, rate])
                item_idx += 1
                user, item, rate = [], [], []
            else:
                user.append(i[0])
                item.append(i[1])
                rate.append(i[2])
        # print("item_user_interact:")
        # print(item_user_interact)
        return item_user_interact

    def get_train_instances(self, num_negative):
        user, item, rate = [], [], []
        for i in self.train_ratings:
            user.append(i[0])
            item.append(i[1])
            rate.append(1)
            # print("添加负例前[np.array(user), np.array(item), np.array(rate)]:")
            # print([np.array(user), np.array(item), np.array(rate)])
            for t in range(num_negative):
                j = np.random.randint(self.num_items)
                while (i[0], j) in self.train_dict:
                    j = np.random.randint(self.num_items)
                user.append(i[0])
                item.append(j)
                rate.append(0)
            # print("添加负例后[np.array(user), np.array(item), np.array(rate)]:")
            # print([np.array(user), np.array(item), np.array(rate)])
        return [np.array(user), np.array(item), np.array(rate)]


    def get_user_and_item_matrix(self):
        rom = np.random.rand(1, 100)
        user_matrix = self.user_item_matrix
        item_matrix = self.user_item_matrix.T
        return user_matrix, item_matrix

    def get_train_dict(self):
        data_dict = {}
        for i in self.train_ratings:
            data_dict[(i[0], i[1])] = i[2]
        return data_dict

    def get_item_sparse_matrix(self):
        num_users, num_items = self.num_users, self.num_items
        user_indices, item_incides, rating_data = self.user_item_rating_indices
        item_sparse_matrix = csr_matrix((rating_data, (item_incides, user_indices)), shape=(num_items, num_users))
        return item_sparse_matrix

    def get_user_sparse_matrix(self):
        user_sparse_matrix = self.get_item_sparse_matrix().T
        return user_sparse_matrix


In [65]:

# Loading data
t1 = time()
dataset = Dataset(path + dataset_name)
train, test_rating, test_negative = dataset.get_train_instances(
    num_negative), dataset.test_ratings, dataset.test_negative
num_users, num_items = dataset.num_users, dataset.num_items
t2 = time()
print("header:data, load time:{:.1f}, user:{:d},train:{:d} item:{:d}, test:{:d}"
        .format(t2 - t1, num_users, 
        len(train[0]), num_items, len(test_rating)))


header:data, load time:8.8, user:4217,train:675726 item:26688, test:4097


In [66]:
# 自定义loss函数
class MSELoss(nn.Module):
    def __init__(self):
        super(MSELoss, self).__init__()

    def forward(self, x, y):
        mse_loss = torch.mean(torch.pow((x - y), 2)) # x与y相减后平方，求均值即为MSE
        return mse_loss


In [67]:

train = utils.UserItemRatingDataset(train[0], train[1], train[2])
train = DataLoader(train, batch_size=batch_size, shuffle=True)

    # Build model
model = DMF(num_users, num_items, layers, dataset)
model = model.to(device)
# 更改loss函数
criterion = nn.BCELoss()
# criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg)
print(model)


DMF(
  (linear_user_1): Linear(in_features=26688, out_features=64, bias=True)
  (linear_item_1): Linear(in_features=4217, out_features=64, bias=True)
  (user_fc_layers): ModuleList(
    (0): Linear(in_features=64, out_features=64, bias=True)
  )
  (item_fc_layers): ModuleList(
    (0): Linear(in_features=64, out_features=64, bias=True)
  )
)


In [68]:


model_name = str(model.__class__)[17:][:-2]
loss_name = str(criterion.__class__)[17 + 13:][:-2]
print(
    'header:class:{},dataset:{}, batch_size:{}, epochs:{}, latent_dim:{}, num_negative:{}, topK:{}, lr:{}, reg:{},loss:{}'
    .format(model_name, dataset_name, batch_size, epochs, latent_dim, num_negative, topK, learning_rate, reg,
            loss_name))


header:class:DMF,dataset:assistment2009, batch_size:256, epochs:20, latent_dim:64, num_negative:1, topK:10, lr:0.0001, reg:0.0,loss:MSELoss


In [69]:
# evaluate.py

import math
import heapq # for retrieval topK
import multiprocessing
from operator import index
import numpy as np
import torch

# 是否激活cuda
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

# '''
# Evaluate the performance of Top-K recommendation:
#     Protocol: leave-1-out evaluation
#     Measures: Hit Ratio and NDCG
#     (more details are in: Xiangnan He, et al. Fast Matrix Factorization for Online Recommendation with Implicit Feedback. SIGIR'16)
# @author: hexiangnan
# '''
def evaluate_model(model, testRatings, testNegatives, K, num_thread,epoch):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
    Return: score of each test rating.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    global _dataset
    global _epoch
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K
    _epoch = epoch

    hits, ndcgs, mrrs = [],[], []
    if(num_thread > 1): # Multi-thread
        pool = multiprocessing.Pool(processes=num_thread)
        res = pool.map(eval_one_rating, range(len(_testRatings)))
        pool.close()
        pool.join()
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (np.mean(hits), np.mean(ndcgs))
    # Single thread
    for idx in range(len(_testRatings)):
        (hr,ndcg, mrr) = eval_one_rating(idx,epoch)
        hits.append(hr)
        ndcgs.append(ndcg)
        mrrs.append(mrr)
    return (np.mean(hits), np.mean(ndcgs))

def eval_one_rating(idx,epoch):
    rating = _testRatings[idx]

    # print("rating:")
    # print(rating)
    # ####改动
    # items = _testNegatives[idx][0:999]
    items = _testNegatives[idx]

    # print("items:")
    # print(items)
    u = rating[0]
    gtItem = rating[1]
    # print("u  rating[0]:")
    # print(u)
    # print("rating[1]:")
    # print(gtItem)
    items.append(gtItem)
    # print("赋值后的items:")
    # print(items)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype='int64')
    # print("users:")
    # print(users)
    # 可将编码器置于此处，代替Longtensor，

    batch_users, batch_items = torch.LongTensor(users), torch.LongTensor(items)
    # print("batch_users:")
    # print(batch_users)
    # print("batch_items:")
    # print(batch_items)


    tensor_users, tensor_items = batch_users.to(device), batch_items.to(device)
    y_pred = _model(tensor_users, tensor_items) # model predict
    # print(y_pred)
    y_pred = y_pred.cpu()
    y_pred = y_pred.detach().numpy()
    # 
    # print ("y_pred.shape:")
    # print(y_pred.shape)

    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = y_pred[i]
        
    # 推荐指数的获取
    score_index=[]
    map_item_score_sorted = sorted(map_item_score.items(), key=lambda x: x[1],reverse=True)
    # print(map_item_score_sorted)
    exerciseID=list(dict(map_item_score_sorted).keys())
    recommendation_rate = list(dict(map_item_score_sorted).values())
    # print(exerciseID)
    # print(recommendation_rate)
    # d=[]
    # for i in range(len(exerciseID)):
    #     d.append([exerciseID[i],recommendation_rate[i]])
    # print(d)
    # print(map_item_score_sorted)
    # print(exerciseID)
    # for i in exerciseID:
    
    if epoch== 5:
        exercise=[]
        exercise0=[]
        index=[]
        index0=[]
        for i in exerciseID:
            exercise.append(i)
        exercise0.append(exercise)
        for j in recommendation_rate:
            index.append(j)
        index0.append(index)
    # print(exercise)
    # print(index)
        
        exerciseID=pd.DataFrame(exercise0)
        recommendation_rate=pd.DataFrame(index0)
        exerciseID.to_csv('C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/result/assistment2009/DMF_03_05_exercise.csv',mode='a',index=False)
        recommendation_rate.to_csv('C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/result/assistment2009/DMF_03_05_index.csv',mode='a',index=False)
        # with open(r"C:\Users\wangxin\Desktop\remmend_DKT_Apriori_DMF\result\assistment2009\DMF_100_exercise.csv",mode='a',) as f:
        #     # print("学生id：",rating[0],file=f)
        #     print(exerciseID,file=f)
    # for i in recommendation_rate:
    #     print(i)
    # print(recommendation_rate)
    # with open("C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/result",mode='a',) as f:
    #     # print("map_item_score:")
    #     print("学生id:",rating[0],file=f)
    #     # exerciseID=list(map_item_score.keys).sort()
        
    #     
        # print()
    items.pop()
    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    mrr = getMRR(ranklist, gtItem)
    # print(hr, ndcg)
    return (hr, ndcg, mrr)
def testmodel(idx):
    rating = _testRatings[idx]

    # print("rating:")
    # print(rating)
    # items = _testNegatives[idx][0:999]
    items = _testNegatives[idx]

    # print("items:")
    # print(items)
    u = rating[0]
    gtItem = rating[1]
    # print("u  rating[0]:")
    # print(u)
    # print("rating[1]:")
    # print(gtItem)
    items.append(gtItem)
    # print("赋值后的items:")
    # print(items)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype='int64')
    # print("users:")
    # print(users)
    # 可将编码器置于此处，代替Longtensor，

    batch_users, batch_items = torch.LongTensor(users), torch.LongTensor(items)
    # print("batch_users:")
    # print(batch_users)
    # print("batch_items:")
    # print(batch_items)


    tensor_users, tensor_items = batch_users.to(device), batch_items.to(device)
    y_pred = _model(tensor_users, tensor_items) # model predict
    y_pred = y_pred.cpu()
    y_pred = y_pred.detach().numpy()
def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

def getMRR(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return float(1.0) / (i+1)
    return 0

In [70]:

    # Init performance
epoch=0
t1 = time()
(hr, ndcg) = evaluate_model(model, test_rating, test_negative, topK, evaluation_threads,epoch)
t2 = time()
    
                                     

In [71]:
import tqdm

In [72]:
                                                                    
import code
from encodings import utf_8


epoch=0
model_out_file = 'C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/pretrain/' + '/{}.model'.format(epoch)
if args.out > 0:
    torch.save(model.state_dict(), model_out_file)

    # Train model
best_hr, best_ndcg, best_iter, best_epoch = 0, 0, -1, -1
count = 0
for epoch in tqdm.tqdm(range(epochs)):
    model.train()
    epoch = epoch + 1
    t1 = time()
    # Generate training instances
    train = dataset.get_train_instances(num_negative)
    train = utils.UserItemRatingDataset(train[0], train[1], train[2])
    train = DataLoader(train, batch_size=batch_size, shuffle=True)
    # Training
    for batch_idx, (user, item, y) in enumerate(train):
            # 改动
            # user, item, y = user.cuda(), item.cuda(), y.cuda()
        user, item, y = user.to(device), item.to(device), y.to(device)
            ## forward and backprop
        y_hat = model(user, item)
        # print("y_hat:")
        # print(y_hat.shape)
        # print("y:")
        # print(y)
        # print("y.view(-1,1):")
        # print(y.view(-1,1))
        loss = criterion(y_hat, y.view(-1, 1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        t2 = time()
        model.eval()
    print('epoch:{},train_time:{:.1f}s, HR:{:.4f}, NDCG:{:.4f}, test_time:{:.1f}s, loss:{:.6f}'
            .format(epoch, t2 - t1, hr, ndcg, time() - t2, loss))
    # with open('C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/result/L_DMF_result.txt',mode='a',encoding='utf-8') as f:
    #     print('epoch:{},train_time:{:.1f}s, HR:{:.4f}, NDCG:{:.4f}, test_time:{:.1f}s, loss:{:.6f}'
    #             .format(epoch, t2 - t1, hr, ndcg, time() - t2, loss),file=f)
    # Evaluation
    if epoch % verbose == 0:
        (hr, ndcg) = evaluate_model(model, test_rating, test_negative, topK, evaluation_threads,epoch)
        print('epoch:{},train_time:{:.1f}s, HR:{:.4f}, NDCG:{:.4f}, test_time:{:.1f}s, loss:{:.6f}'
                .format(epoch, t2 - t1, hr, ndcg, time() - t2, loss))
        with open('C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/result/L_DMF_result.txt',mode='a',encoding='utf-8') as f:
            print('epoch:{},train_time:{:.1f}s, HR:{:.4f}, NDCG:{:.4f}, test_time:{:.1f}s, loss:{:.6f}'
                    .format(epoch, t2 - t1, hr, ndcg, time() - t2, loss),file=f)
        if hr > best_hr:
            count = 0
            best_train_time, best_hr, best_ndcg, best_epoch, best_test_time = t2 - t1, hr, ndcg, epoch, time() - t2
                # model_out_file = 'pretrain/' + '/{}-{}-{}-{}-{}-lr_{}-HR_{:.4f}-NDCG_{:.4f}-epoch_{}.model'.format(
                #     model_name,
                #     dataset_name,
                #     latent_dim,
                #     layers,
                #     num_negative,
                #     learning_rate,
                #     hr,
                #     ndcg,
                #     epoch)
            model_out_file = 'C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/pretrain/' + '/{}.model'.format(epoch)
            if args.out > 0:
                torch.save(model.state_dict(), model_out_file)
        else:
            count += 1
        if count == 50:
            sys.exit(0)

print('best epoch:{},HR:{:.4f}, NDCG:{:.4f}'.format(best_epoch, best_hr, best_ndcg))
if args.out > 0:
    print("The best model is saved to {}".format(model_out_file))

  0%|          | 0/20 [00:00<?, ?it/s]

epoch:1,train_time:20.9s, HR:0.0366, NDCG:0.0255, test_time:0.0s, loss:0.125765


  5%|▌         | 1/20 [02:22<45:08, 142.56s/it]

epoch:1,train_time:20.9s, HR:0.2853, NDCG:0.2003, test_time:121.6s, loss:0.125765
epoch:2,train_time:20.7s, HR:0.2853, NDCG:0.2003, test_time:0.0s, loss:0.093239


 10%|█         | 2/20 [04:43<42:24, 141.37s/it]

epoch:2,train_time:20.7s, HR:0.3510, NDCG:0.2536, test_time:119.8s, loss:0.093239
epoch:3,train_time:20.6s, HR:0.3510, NDCG:0.2536, test_time:0.0s, loss:0.098189


 15%|█▌        | 3/20 [07:02<39:48, 140.52s/it]

epoch:3,train_time:20.6s, HR:0.3686, NDCG:0.2710, test_time:118.9s, loss:0.098189
epoch:4,train_time:20.4s, HR:0.3686, NDCG:0.2710, test_time:0.0s, loss:0.077938


 20%|██        | 4/20 [09:22<37:22, 140.13s/it]

epoch:4,train_time:20.4s, HR:0.3727, NDCG:0.2784, test_time:119.1s, loss:0.077938
epoch:5,train_time:20.6s, HR:0.3727, NDCG:0.2784, test_time:0.0s, loss:0.104349


 25%|██▌       | 5/20 [31:10<2:20:23, 561.55s/it]

epoch:5,train_time:20.6s, HR:0.3842, NDCG:0.2852, test_time:1288.2s, loss:0.104349
epoch:6,train_time:22.5s, HR:0.3842, NDCG:0.2852, test_time:0.0s, loss:0.090299


 30%|███       | 6/20 [33:28<1:37:25, 417.55s/it]

epoch:6,train_time:22.5s, HR:0.3803, NDCG:0.2859, test_time:115.5s, loss:0.090299
epoch:7,train_time:21.6s, HR:0.3803, NDCG:0.2859, test_time:0.0s, loss:0.093159


 35%|███▌      | 7/20 [35:44<1:10:28, 325.26s/it]

epoch:7,train_time:21.6s, HR:0.3876, NDCG:0.2884, test_time:113.6s, loss:0.093159
epoch:8,train_time:22.2s, HR:0.3876, NDCG:0.2884, test_time:0.0s, loss:0.090519


 40%|████      | 8/20 [38:04<53:15, 266.26s/it]  

epoch:8,train_time:22.2s, HR:0.3979, NDCG:0.2978, test_time:117.7s, loss:0.090519
epoch:9,train_time:19.9s, HR:0.3979, NDCG:0.2978, test_time:0.0s, loss:0.075464


 45%|████▌     | 9/20 [40:23<41:30, 226.45s/it]

epoch:9,train_time:19.9s, HR:0.3981, NDCG:0.2959, test_time:119.0s, loss:0.075464
epoch:10,train_time:20.7s, HR:0.3981, NDCG:0.2959, test_time:0.0s, loss:0.086165


 50%|█████     | 10/20 [42:44<33:20, 200.09s/it]

epoch:10,train_time:20.7s, HR:0.4018, NDCG:0.2998, test_time:120.4s, loss:0.086165
epoch:11,train_time:19.9s, HR:0.4018, NDCG:0.2998, test_time:0.0s, loss:0.086852


 55%|█████▌    | 11/20 [45:01<27:09, 181.05s/it]

epoch:11,train_time:19.9s, HR:0.4020, NDCG:0.2979, test_time:117.9s, loss:0.086852
epoch:12,train_time:22.2s, HR:0.4020, NDCG:0.2979, test_time:0.0s, loss:0.088448


 60%|██████    | 12/20 [47:20<22:23, 167.99s/it]

epoch:12,train_time:22.2s, HR:0.3981, NDCG:0.3019, test_time:115.9s, loss:0.088448
epoch:13,train_time:20.2s, HR:0.3981, NDCG:0.3019, test_time:0.0s, loss:0.098209


 65%|██████▌   | 13/20 [49:41<18:39, 159.87s/it]

epoch:13,train_time:20.2s, HR:0.3964, NDCG:0.2994, test_time:120.9s, loss:0.098209
epoch:14,train_time:21.0s, HR:0.3964, NDCG:0.2994, test_time:0.0s, loss:0.090917


 70%|███████   | 14/20 [52:00<15:22, 153.74s/it]

epoch:14,train_time:21.0s, HR:0.4008, NDCG:0.2966, test_time:118.6s, loss:0.090917
epoch:15,train_time:20.0s, HR:0.4008, NDCG:0.2966, test_time:0.0s, loss:0.085741


 75%|███████▌  | 15/20 [54:18<12:24, 148.82s/it]

epoch:15,train_time:20.0s, HR:0.3993, NDCG:0.3022, test_time:117.3s, loss:0.085741
epoch:16,train_time:21.7s, HR:0.3993, NDCG:0.3022, test_time:0.0s, loss:0.113931


 80%|████████  | 16/20 [56:35<09:40, 145.18s/it]

epoch:16,train_time:21.7s, HR:0.3964, NDCG:0.2979, test_time:115.0s, loss:0.113931
epoch:17,train_time:21.0s, HR:0.3964, NDCG:0.2979, test_time:0.0s, loss:0.077933


 85%|████████▌ | 17/20 [58:50<07:07, 142.41s/it]

epoch:17,train_time:21.0s, HR:0.3974, NDCG:0.2988, test_time:114.9s, loss:0.077933
epoch:18,train_time:21.1s, HR:0.3974, NDCG:0.2988, test_time:0.0s, loss:0.077106


 90%|█████████ | 18/20 [1:01:07<04:41, 140.74s/it]

epoch:18,train_time:21.1s, HR:0.3961, NDCG:0.2985, test_time:115.7s, loss:0.077106
epoch:19,train_time:21.0s, HR:0.3961, NDCG:0.2985, test_time:0.0s, loss:0.081155


 95%|█████████▌| 19/20 [1:03:23<02:19, 139.23s/it]

epoch:19,train_time:21.0s, HR:0.3981, NDCG:0.2985, test_time:114.7s, loss:0.081155
epoch:20,train_time:20.2s, HR:0.3981, NDCG:0.2985, test_time:0.0s, loss:0.104720


100%|██████████| 20/20 [1:05:38<00:00, 196.92s/it]

epoch:20,train_time:20.2s, HR:0.4020, NDCG:0.3019, test_time:114.6s, loss:0.104720
best epoch:11,HR:0.4020, NDCG:0.2979
The best model is saved to C:/Users/wangxin/Desktop/remmend_DKT_Apriori_DMF/pretrain//11.model



