In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

data_100k = 'ml-100k/u.data'

In [4]:
def cosSimilarity(matrix):
    similarity_matrix = cosine_similarity(matrix)
    print("Cosine Similarity Matrix Sample:")
    print(similarity_matrix[:5, :5])
    print("=" * 120)
    return similarity_matrix

In [5]:
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
    # Get the list of shared_items
    si={}
    for item in prefs[person1]: 
        if item in prefs[person2]: si[item]=1

    # if they have no ratings in common, return 0
    if len(si)==0: return 0

    # Add up the squares of all the differences
    sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 
                        for item in prefs[person1] if item in prefs[person2]])

    return 1/(1+sum_of_squares)

In [6]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]: 
        if item in prefs[p2]: si[item]=1

    # if they are no ratings in common, return 0
    if len(si)==0: return 0

    # Sum calculations
    n=len(si)
    
    # Sums of all the preferences
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    
    # Sums of the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si]) 
    
    # Sum of the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    
    # Calculate r (Pearson score)
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0

    r=num/den

    return r

In [7]:
def loadData(test_size=0.2, datafile='ml-100k/u.data', header=['uid','iid','ratings','timestamp'], sep='\t', seed=0):
    # Read CSV File into A Pandas DataFrame
    df = pd.read_csv(datafile, header=None, names=header, sep=sep, engine='python')
    df.drop(columns='timestamp')
    print(df.head())
    # The Number of User and Items
    num_users, num_items = df[header[0]].unique().shape[0], df[header[1]].unique().shape[0]
    # The minimum id of user and item (because in Python array index is from 0)
    uid_min, iid_min = df['uid'].min(), df['iid'].min()

    # Train and Test Dataset Splitting
    train_df, test_df = train_test_split(np.asarray(df), test_size=test_size, random_state=seed)

    # Change the data structure into sparse matrix
    train = sp.csr_matrix((train_df[:, 2], (train_df[:, 0]-uid_min, train_df[:, 1]-iid_min)), shape=(num_users, num_items))
    test = sp.csr_matrix((test_df[:, 2], (test_df[:, 0]-uid_min, test_df[:, 1]-iid_min)), shape=(num_users, num_items))

    print("Number of Users: " + str(num_users))
    print("Number of Items: " + str(num_items))
    print("=" * 120)

    print("Sample Data: " + str(train.getrow(0).toarray()))
    print("=" * 120)

    return train, test


# def loadData(test_size=0.2, datafile='ml-100k/u.data', header=['uid','iid','ratings','timestamp'], sep='\t', seed=0):
#     # Read CSV File into A Pandas DataFrame
#     df = pd.read_csv(datafile, header=None, names=header, sep=sep, engine='python')
#     df = df.drop(columns='timestamp')
#     print(df.head())
#     # The Number of User and Items
#     num_users, num_items = df[header[0]].unique().shape[0], df[header[1]].unique().shape[0]
#     # The minimum id of user and item (because in Python array index is from 0)
#     uid_min, iid_min = df['uid'].min(), df['iid'].min()
#     uid_max, iid_max = df['uid'].max(), df['iid'].max()
# #     print(uid_max,iid_max)
# #     print(num_users)
#     result = np.zeros((num_users, num_items))
#     for index, row in df.iterrows():
#         result[int(row['uid'])-1,int(row['iid'])-1] = int(row['ratings'])
#     train, test = train_test_split(result, test_size=test_size, random_state = seed)
#     return train, test

In [8]:
def Precision_and_Recall(pred_item_list, test_item_list):
    # Calculate the Number of Occurrences of Testing Item IDs in the Prediction Item ID List
    sum_relevant_item = 0
    for item in test_item_list:
        if item in pred_item_list:
            sum_relevant_item += 1

    # Calculate the Precision and Recall Value
    precision = sum_relevant_item / len(pred_item_list)
    recall = sum_relevant_item / len(test_item_list)

    return precision, recall

In [9]:
train, test = loadData()
train = train.toarray()
test = test.toarray()
print(type(train))
print(train.shape)
print(test.shape)

   uid  iid  ratings  timestamp
0  196  242        3  881250949
1  186  302        3  891717742
2   22  377        1  878887116
3  244   51        2  880606923
4  166  346        1  886397596
Number of Users: 943
Number of Items: 1682
Sample Data: [[5 3 4 ... 0 0 0]]
<class 'numpy.ndarray'>
(943, 1682)
(943, 1682)


In [16]:
class SVD:
    def __init__(self,mat,K=20):
        self.mat=mat
        self.K=K
        self.bi={}
        self.bu={}
        self.qi={}
        self.pu={}
        self.avg=np.mean(self.mat[:,2])
        for i in range(self.mat.shape[0]):
            uid=self.mat[i,0]
            iid=self.mat[i,1]
            self.bi.setdefault(iid,0)
            self.bu.setdefault(uid,0)
            self.qi.setdefault(iid,np.random.random((self.K,1))/10*np.sqrt(self.K))
            self.pu.setdefault(uid,np.random.random((self.K,1))/10*np.sqrt(self.K))
    def predict(self,uid,iid):  #预测评分的函数
        #setdefault的作用是当该用户或者物品未出现过时，新建它的bi,bu,qi,pu，并设置初始值为0
        self.bi.setdefault(iid,0)
        self.bu.setdefault(uid,0)
        self.qi.setdefault(iid,np.zeros((self.K,1)))
        self.pu.setdefault(uid,np.zeros((self.K,1)))
        rating=self.avg+self.bi[iid]+self.bu[uid]+np.sum(self.qi[iid]*self.pu[uid]) #预测评分公式
        #由于评分范围在1到5，所以当分数大于5或小于1时，返回5,1.
        if rating>5:
            rating=5
        if rating<1:
            rating=1
        return rating
    def train(self,steps=30,gamma=0.04,Lambda=0.15):    #训练函数，step为迭代次数
        print('train data size',self.mat.shape)
        for step in range(steps):
            print('step',step+1,'is running')
            KK=np.random.permutation(self.mat.shape[0]) #随机梯度下降算法，kk为对矩阵进行随机洗牌
            rmse=0.0
            for i in range(self.mat.shape[0]):
                j=KK[i]
                uid=self.mat[j,0]
                iid=self.mat[j,1]
                rating=self.mat[j,2]
                eui=rating-self.predict(uid, iid)
                rmse+=eui**2
                self.bu[uid]+=gamma*(eui-Lambda*self.bu[uid])  
                self.bi[iid]+=gamma*(eui-Lambda*self.bi[iid])
                tmp=self.qi[iid]
                self.qi[iid]+=gamma*(eui*self.pu[uid]-Lambda*self.qi[iid])
                self.pu[uid]+=gamma*(eui*tmp-Lambda*self.pu[uid])
            gamma=0.93*gamma
            print('rmse is',np.sqrt(rmse/self.mat.shape[0]))
    def test(self,test_data):  #gamma以0.93的学习率递减
        test_data=test_data
        print('test data size',test_data.shape)
        rmse=0.0
        for i in range(test_data.shape[0]):
            uid=test_data[i,0]
            iid=test_data[i,1]
            rating=test_data[i,2]
            eui=rating-self.predict(uid, iid)
            rmse+=eui**2
        print('rmse of test data is',np.sqrt(rmse/test_data.shape[0]))


In [17]:
a=SVD(train,30)  
a.train()
a.test(test)

train data size (943, 1682)
step 1 is running
rmse is 1.2896900428768914
step 2 is running
rmse is 1.3263781124312921
step 3 is running
rmse is 1.3231762293966352
step 4 is running
rmse is 1.277507514924781
step 5 is running
rmse is 1.2641563187767908
step 6 is running
rmse is 1.2841310590754709
step 7 is running
rmse is 1.2675073023062975
step 8 is running
rmse is 1.2370217029346264
step 9 is running
rmse is 1.2741828313051424
step 10 is running
rmse is 1.2162735362377495
step 11 is running
rmse is 1.2022425140404402
step 12 is running
rmse is 1.2197560731756085
step 13 is running
rmse is 1.1844700279381388
step 14 is running
rmse is 1.2266914869851269
step 15 is running
rmse is 1.2162735362377495
step 16 is running
rmse is 1.2057655765789546
step 17 is running
rmse is 1.2266914869851269
step 18 is running
rmse is 1.2057655765789546
step 19 is running
rmse is 1.2057655765789546
step 20 is running
rmse is 1.1844700279381388
step 21 is running
rmse is 1.2057655765789546
step 22 is runni