In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

data_100k = 'ml-100k/u.data'

In [2]:
def cosSimilarity(matrix):
    similarity_matrix = cosine_similarity(matrix)
    print("Cosine Similarity Matrix Sample:")
    print(similarity_matrix[:5, :5])
    print("=" * 120)
    return similarity_matrix

In [3]:
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
    # Get the list of shared_items
    si={}
    for item in prefs[person1]: 
        if item in prefs[person2]: si[item]=1

    # if they have no ratings in common, return 0
    if len(si)==0: return 0

    # Add up the squares of all the differences
    sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 
                        for item in prefs[person1] if item in prefs[person2]])

    return 1/(1+sum_of_squares)

In [4]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]: 
        if item in prefs[p2]: si[item]=1

    # if they are no ratings in common, return 0
    if len(si)==0: return 0

    # Sum calculations
    n=len(si)
    
    # Sums of all the preferences
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    
    # Sums of the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si]) 
    
    # Sum of the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    
    # Calculate r (Pearson score)
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0

    r=num/den

    return r

In [5]:
def loadData(test_size=0.2, datafile='ml-100k/u.data', header=['uid','iid','ratings','timestamp'], sep='\t', seed=0):
    # Read CSV File into A Pandas DataFrame
    df = pd.read_csv(datafile, header=None, names=header, sep=sep, engine='python')
    df.drop(columns='timestamp')
    print(df.head())
    # The Number of User and Items
    num_users, num_items = df[header[0]].unique().shape[0], df[header[1]].unique().shape[0]
    # The minimum id of user and item (because in Python array index is from 0)
    uid_min, iid_min = df['uid'].min(), df['iid'].min()

    # Train and Test Dataset Splitting
    train_df, test_df = train_test_split(np.asarray(df), test_size=test_size, random_state=seed)

    # Change the data structure into sparse matrix
    train = sp.csr_matrix((train_df[:, 2], (train_df[:, 0]-uid_min, train_df[:, 1]-iid_min)), shape=(num_users, num_items))
    test = sp.csr_matrix((test_df[:, 2], (test_df[:, 0]-uid_min, test_df[:, 1]-iid_min)), shape=(num_users, num_items))

    print("Number of Users: " + str(num_users))
    print("Number of Items: " + str(num_items))
    print("=" * 120)

    print("Sample Data: " + str(train.getrow(0).toarray()))
    print("=" * 120)

    return train, test


# def loadData(test_size=0.2, datafile='ml-100k/u.data', header=['uid','iid','ratings','timestamp'], sep='\t', seed=0):
#     # Read CSV File into A Pandas DataFrame
#     df = pd.read_csv(datafile, header=None, names=header, sep=sep, engine='python')
#     df = df.drop(columns='timestamp')
#     print(df.head())
#     # The Number of User and Items
#     num_users, num_items = df[header[0]].unique().shape[0], df[header[1]].unique().shape[0]
#     # The minimum id of user and item (because in Python array index is from 0)
#     uid_min, iid_min = df['uid'].min(), df['iid'].min()
#     uid_max, iid_max = df['uid'].max(), df['iid'].max()
# #     print(uid_max,iid_max)
# #     print(num_users)
#     result = np.zeros((num_users, num_items))
#     for index, row in df.iterrows():
#         result[int(row['uid'])-1,int(row['iid'])-1] = int(row['ratings'])
#     train, test = train_test_split(result, test_size=test_size, random_state = seed)
#     return train, test

In [6]:
def Precision_and_Recall(pred_item_list, test_item_list):
    # Calculate the Number of Occurrences of Testing Item IDs in the Prediction Item ID List
    sum_relevant_item = 0
    for item in test_item_list:
        if item in pred_item_list:
            sum_relevant_item += 1

    # Calculate the Precision and Recall Value
    precision = sum_relevant_item / len(pred_item_list)
    recall = sum_relevant_item / len(test_item_list)

    return precision, recall

In [7]:
train, test = loadData()
train = train.toarray()
test = test.toarray()
print(type(train))
print(train.shape)
print(test.shape)

   uid  iid  ratings  timestamp
0  196  242        3  881250949
1  186  302        3  891717742
2   22  377        1  878887116
3  244   51        2  880606923
4  166  346        1  886397596
Number of Users: 943
Number of Items: 1682
Sample Data: [[5 3 4 ... 0 0 0]]
<class 'numpy.ndarray'>
(943, 1682)
(943, 1682)


In [8]:
class SVD:
    def __init__(self,mat,K=20):
        self.mat=mat
        self.K=K
        self.bi={}
        self.bu={}
        self.qi={}
        self.pu={}
        self.avg=np.mean(self.mat[:,2])
        for i in range(self.mat.shape[0]):
            uid=self.mat[i,0]
            iid=self.mat[i,1]
            self.bi.setdefault(iid,0)
            self.bu.setdefault(uid,0)
            self.qi.setdefault(iid,np.random.random((self.K,1))/10*np.sqrt(self.K))
            self.pu.setdefault(uid,np.random.random((self.K,1))/10*np.sqrt(self.K))
    def predict(self,uid,iid):  #预测评分的函数
        #setdefault的作用是当该用户或者物品未出现过时，新建它的bi,bu,qi,pu，并设置初始值为0
        self.bi.setdefault(iid,0)
        self.bu.setdefault(uid,0)
        self.qi.setdefault(iid,np.zeros((self.K,1)))
        self.pu.setdefault(uid,np.zeros((self.K,1)))
        rating=self.avg+self.bi[iid]+self.bu[uid]+np.sum(self.qi[iid]*self.pu[uid]) #预测评分公式
        #由于评分范围在1到5，所以当分数大于5或小于1时，返回5,1.
        if rating>5:
            rating=5
        if rating<1:
            rating=1
        return rating
    def train(self,steps=30,gamma=0.04,Lambda=0.15):    #训练函数，step为迭代次数
        print('train data size',self.mat.shape)
        for step in range(steps):
            print('step',step+1,'is running')
            KK=np.random.permutation(self.mat.shape[0]) #随机梯度下降算法，kk为对矩阵进行随机洗牌
            rmse=0.0
            for i in range(self.mat.shape[0]):
                j=KK[i]
                uid=self.mat[j,0]
                iid=self.mat[j,1]
                rating=self.mat[j,2]
                eui=rating-self.predict(uid, iid)
                rmse+=eui**2
                self.bu[uid]+=gamma*(eui-Lambda*self.bu[uid])  
                self.bi[iid]+=gamma*(eui-Lambda*self.bi[iid])
                tmp=self.qi[iid]
                self.qi[iid]+=gamma*(eui*self.pu[uid]-Lambda*self.qi[iid])
                self.pu[uid]+=gamma*(eui*tmp-Lambda*self.pu[uid])
            gamma=0.93*gamma
            print('rmse is',np.sqrt(rmse/self.mat.shape[0]))
    def test(self,test_data):  #gamma以0.93的学习率递减
        test_data=test_data
        print('test data size',test_data.shape)
        rmse=0.0
        for i in range(test_data.shape[0]):
            uid=test_data[i,0]
            iid=test_data[i,1]
            rating=test_data[i,2]
            eui=rating-self.predict(uid, iid)
            rmse+=eui**2
        print('rmse of test data is',np.sqrt(rmse/test_data.shape[0]))


In [31]:
# from utils import *
import warnings; warnings.simplefilter('ignore')

class MF(object):
    def __init__(self,train_matrix,test_matrix):
        self.num_factors = 600 # Dimension of the Latent Factor
        self.regs = 1e-3 # Regularizer Coefficient
        self.lr = 0.01 # Learning Rate
        self.epochs = 50 # How many number of Training Loops
        self.batch_size = 128 # How many data is fed into the training algorithm in each epoch
        # Initialize Parameters
        self.num_factors = num_factors 
        self.regs = regularizer 

        self.lr = learning_rate 
        self.epochs = epochs 
        self.batch_size = batch_size 

        self.num_user, self.num_item = train_matrix.shape[0], train_matrix.shape[1]
        # Store the user IDs in a list, the item IDs in a list and the ratings in a list
        train_matrix, test_matrix = train_matrix.tocoo(), test_matrix.tocoo()
        self.train_uid, self.train_iid, self.train_ratings = list(train_matrix.row),list(train_matrix.col),list(train_matrix.data)
        self.test_uid, self.test_iid, self.test_ratings = list(test_matrix.row),list(test_matrix.col),list(test_matrix.data)

        # Calculate the average of all ratings (the mu value in the equation)
        self.mu = np.mean(self.train_ratings)

        # Total number of training data instances
        self.num_training = len(self.train_ratings)

        # Number of batches
        self.num_batch = int(self.num_training / self.batch_size)
        print("Data Preparation Completed.")

    # Build the model for customized SGD algorithm
        # Initialize all the parameters (Use Normal Distribution)
        # bu and bi are vectors (Note the dimension)
        self.bu = np.random.normal(scale = 1. / self.num_factors, size=[self.num_user])
        self.bi = np.random.normal(scale = 1. / self.num_factors, size=[self.num_item])

        # P and Q are matrices (Note the dimension)
        self.P = np.random.normal(scale=1. / self.num_factors, size=[self.num_user, self.num_factors])
        self.Q = np.random.normal(scale=1. / self.num_factors, size=[self.num_factors, self.num_item])
        print("Parameter Initialization Completed.")

    # Training using SGD algorithm
    def train_and_evaluate(self):
        for epoch in range(self.epochs):
            for uid, iid, ratings in list(zip(self.train_uid, self.train_iid, self.train_ratings)):
                # The estimated rating
                pred_r = self.mu + self.bu[uid] + self.bi[iid] + np.dot(self.Q[:, iid], self.P[uid, :])

                # Calculate the loss of this specific user-item pair
                error = ratings - pred_r

                # Update the parameters
                self.bu[uid] = self.bu[uid] + self.lr * (error - self.regs * self.bu[uid])
                self.bi[iid] = self.bi[iid] + self.lr * (error - self.regs * self.bi[iid])
                self.P[uid, :] = self.P[uid, :] + self.lr * (error * self.Q[:, iid] - self.regs * self.P[uid, :])
                self.Q[:, iid] = self.Q[:, iid] + self.lr * (error * self.P[uid, :] - self.regs * self.Q[:, iid])
            rms_test_list, rms_train_list = [], []

            for uid, iid, ratings in list(zip(self.test_uid, self.test_iid, self.test_ratings)):
                rms_test_list.append(
                    (self.mu + self.bu[uid] + self.bi[iid] + np.dot(self.Q[:, iid], self.P[uid, :]) - ratings) ** 2)

            for uid, iid, ratings in list(zip(self.train_uid, self.train_iid, self.train_ratings)):
                rms_train_list.append(
                    (self.mu + self.bu[uid] + self.bi[iid] + np.dot(self.Q[:, iid], self.P[uid, :]) - ratings) ** 2)

            rms_test = np.sqrt(np.mean(rms_test_list))
            rms_train = np.sqrt(np.mean(rms_train_list))
            print("Epoch {0} Training: [RMS] {1} and Testing: [RMS] {2}".format(epoch, rms_train, rms_test))


In [32]:
if __name__ == "__main__":
    train_matrix, test_matrix = loadData()
    model = MF(train_matrix, test_matrix)
    model.train_and_evaluate()

   uid  iid  ratings  timestamp
0  196  242        3  881250949
1  186  302        3  891717742
2   22  377        1  878887116
3  244   51        2  880606923
4  166  346        1  886397596
Number of Users: 943
Number of Items: 1682
Sample Data: [[5 3 4 ... 0 0 0]]
Data Preparation Completed.
Parameter Initialization Completed.
Epoch 0 Training: [RMS] 0.9709943991077488 and Testing: [RMS] 0.9847204477041067
Epoch 1 Training: [RMS] 0.947017208004226 and Testing: [RMS] 0.9648437886547285
Epoch 2 Training: [RMS] 0.9362230037195872 and Testing: [RMS] 0.9564791473300163


KeyboardInterrupt: 

In [45]:
def svd(mat, feature, steps=50, gama=0.02, lamda=0.3):
    slowRate = 0.99
    preRmse = 1000000000.0
    nowRmse = 0.0
 
    user_feature = np.matrix(np.random.rand(mat.shape[0], feature))
    item_feature = np.matrix(np.random.rand(mat.shape[1], feature))
 
    for step in range(steps):
        rmse = 0.0  
        n = 0  
        for u in range(mat.shape[0]):
            for i in range(mat.shape[1]):
                if not np.isnan(mat[u,i]):
                    pui = float(np.dot(user_feature[u,:], item_feature[i,:].T))
                    eui = mat[u,i] - pui
                    rmse += pow(eui, 2)
                    n += 1 
                    for k in range(feature):
                        user_feature[u,k] += gama*(eui*item_feature[i,k] - lamda*user_feature[u,k])
                        item_feature[i,k] += gama*(eui*user_feature[u,k] - lamda*item_feature[i,k]) # 原blog这里有错误 
 
        nowRmse = np.sqrt(rmse * 1.0 / n)
        print('step: %d      Rmse: %s' % ((step+1), nowRmse))
        if (nowRmse < preRmse):  
            preRmse = nowRmse
        else:
            break # 这个退出条件其实还有点问题
        gama *= slowRate
        step += 1
 
    return user_feature, item_feature

In [46]:
user_feature, item_feature = svd(train, 5)

step: 1      Rmse: 0.7525733265976747
step: 2      Rmse: 0.7644989532969478


matrix([[2.94060540e-05, 5.84556900e-05, 2.89827418e-05, 4.91040562e-05,
         3.49958660e-05],
        [4.23850628e-05, 2.96976943e-05, 6.28733378e-05, 3.68993543e-05,
         4.24204682e-05],
        [3.55896979e-05, 2.68984183e-05, 5.31611075e-05, 2.70131844e-05,
         3.37341240e-05],
        ...,
        [1.89295819e-04, 2.08726432e-04, 1.78494852e-04, 2.08848853e-04,
         1.83884924e-04],
        [7.73505190e-04, 8.63899001e-04, 7.27288939e-04, 8.62449982e-04,
         7.53749646e-04],
        [9.53059890e-04, 1.06275134e-03, 8.95077651e-04, 1.06019908e-03,
         9.26962730e-04]])