In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

ratings = pd.read_csv('./ml-latest-small/ml-latest-small/ratings.csv',index_col=None)
ratings.describe()
ratings.head(5)

movies = pd.read_csv('./ml-latest-small/ml-latest-small/movies.csv',index_col=None)
movies.head(5)

data = pd.merge(ratings,movies,on='movieId')

rating_count_by_movie = data.groupby(['movieId','title'],as_index=False)['rating'].count()
rating_count_by_movie.columns=['movieId','title','rating_count']
rating_count_by_movie.sort_values(by=['rating_count'],ascending=False,inplace=True)
rating_count_by_movie[:10]

rating_stddev = data.groupby(['movieId','title']).agg({'rating':['mean','std']})
rating_stddev.head(10)

moviesPath = './ml-latest-small/ml-latest-small/movies.csv'

ratingsPath = './ml-latest-small/ml-latest-small/ratings.csv'

moviesDF = pd.read_csv(moviesPath,index_col=None)
ratingsDF = pd.read_csv(ratingsPath,index_col=None)

trainRatingsDF,testRatingsDF = train_test_split(ratingsDF,test_size=0.2)
print("total_movie_count:"+str(len(set(ratingsDF['movieId'].values.tolist()))))
print("total_user_count:" + str(len(set(ratingsDF['userId'].values.tolist()))))
print("train_movie_count:" + str(len(set(trainRatingsDF['movieId'].values.tolist()))))
print("train_user_count:" + str(len(set(trainRatingsDF['userId'].values.tolist()))))
print("test_movie_count:" + str(len(set(testRatingsDF['movieId'].values.tolist()))))
print("test_user_count:" + str(len(set(testRatingsDF['userId'].values.tolist()))))

trainRatingsPivotDF = pd.pivot_table(trainRatingsDF[['userId','movieId','rating']],columns=['movieId'],index=['userId'],values='rating',fill_value=0)

moviesMap = dict(enumerate(list(trainRatingsPivotDF.columns)))

usersMap = dict(enumerate(list(trainRatingsPivotDF.index)))

ratingValues = trainRatingsPivotDF.values.tolist()
ratingValues[0]

total_movie_count:9724
total_user_count:610
train_movie_count:9028
train_user_count:610
test_movie_count:5106
test_user_count:608


[4.0,
 0.0,
 4.0,
 0.0,
 0.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 5.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 5.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 5.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [None]:
def calCosineSimilarity(list1,list2):
    res = 0
    denominator1 = 0
    denominator2 = 0
    for (val1,val2) in zip(list1,list2):
        res += (val1 * val2)
        denominator1 += val1 ** 2
        denominator2 += val2 ** 2
        
userSimMatrix = np.zeros((len(ratingValues),len(ratingValues)),dtype=np.float32)
for i in range(len(ratingValues)-1):    
    for j in range(i+1,len(ratingValues)):
        userSimMatrix[i,j] = calCosineSimilarity(ratingValues[i],ratingValues[j])
        userSimMatrix[j,i] = userSimMatrix[i,j]

userMostSimDict = dict()
for i in range(len(ratingValues)):
    userMostSimDict[i] = sorted(enumerate(list(userSimMatrix[0])),key = lambda x:x[1],reverse=True)[:10]
        
userRecommendValues = np.zeros((len(ratingValues),len(ratingValues[0])),dtype=np.float32)


In [2]:
for i in range(len(ratingValues)):
    for j in range(len(ratingValues[i])):
            if ratingValues[i][j] == 0:
                val = 0            
                for (user,sim) in userMostSimDict[i]:
                    val += (ratingValues[user][j] * sim)
                userRecommendValues[i,j] = val

userRecommendDict = dict()
for i in range(len(ratingValues)):
    userRecommendDict[i] = sorted(enumerate(list(userRecommendValues[i])),key = lambda x:x[1],reverse=True)[:10]
    
userRecommendList = []
for key,value in userRecommendDict.items():
    user = usersMap[key]    
    for (movieId,val) in value:
        userRecommendList.append([user,moviesMap[movieId]])
        
recommendDF = pd.DataFrame(userRecommendList,columns=['userId','movieId'])
recommendDF = pd.merge(recommendDF,moviesDF[['movieId','title']],on='movieId',how='inner')
recommendDF.tail(10)

Unnamed: 0,userId,movieId,title
6090,601,10,GoldenEye (1995)
6091,602,10,GoldenEye (1995)
6092,603,10,GoldenEye (1995)
6093,604,10,GoldenEye (1995)
6094,605,10,GoldenEye (1995)
6095,606,10,GoldenEye (1995)
6096,607,10,GoldenEye (1995)
6097,608,10,GoldenEye (1995)
6098,609,10,GoldenEye (1995)
6099,610,10,GoldenEye (1995)


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

ratings = pd.read_csv('./ml-latest-small/ml-latest-small/ratings.csv',index_col=None)
ratings.describe()
ratings.head(5)

movies = pd.read_csv('./ml-latest-small/ml-latest-small/movies.csv',index_col=None)
movies.head(5)

data = pd.merge(ratings,movies,on='movieId')

rating_count_by_movie = data.groupby(['movieId','title'],as_index=False)['rating'].count()
rating_count_by_movie.columns=['movieId','title','rating_count']
rating_count_by_movie.sort_values(by=['rating_count'],ascending=False,inplace=True)
rating_count_by_movie[:10]

rating_stddev = data.groupby(['movieId','title']).agg({'rating':['mean','std']})
rating_stddev.head(10)

moviesPath = './ml-latest-small/ml-latest-small/movies.csv'

ratingsPath = './ml-latest-small/ml-latest-small/ratings.csv'

moviesDF = pd.read_csv(moviesPath,index_col=None)
ratingsDF = pd.read_csv(ratingsPath,index_col=None)

trainRatingsDF,testRatingsDF = train_test_split(ratingsDF,test_size=0.2)
print("total_movie_count:"+str(len(set(ratingsDF['movieId'].values.tolist()))))
print("total_user_count:" + str(len(set(ratingsDF['userId'].values.tolist()))))
print("train_movie_count:" + str(len(set(trainRatingsDF['movieId'].values.tolist()))))
print("train_user_count:" + str(len(set(trainRatingsDF['userId'].values.tolist()))))
print("test_movie_count:" + str(len(set(testRatingsDF['movieId'].values.tolist()))))
print("test_user_count:" + str(len(set(testRatingsDF['userId'].values.tolist()))))

trainRatingsPivotDF = pd.pivot_table(trainRatingsDF[['userId','movieId','rating']],columns=['movieId'],index=['userId'],values='rating',fill_value=0)

moviesMap = dict(enumerate(list(trainRatingsPivotDF.columns)))
moviesMap

total_movie_count:9724
total_user_count:610
train_movie_count:9000
train_user_count:610
test_movie_count:5105
test_user_count:610


{0: ('rating', 1),
 1: ('rating', 2),
 2: ('rating', 3),
 3: ('rating', 4),
 4: ('rating', 5),
 5: ('rating', 6),
 6: ('rating', 7),
 7: ('rating', 8),
 8: ('rating', 9),
 9: ('rating', 10),
 10: ('rating', 11),
 11: ('rating', 12),
 12: ('rating', 13),
 13: ('rating', 14),
 14: ('rating', 15),
 15: ('rating', 16),
 16: ('rating', 17),
 17: ('rating', 18),
 18: ('rating', 19),
 19: ('rating', 20),
 20: ('rating', 21),
 21: ('rating', 22),
 22: ('rating', 23),
 23: ('rating', 24),
 24: ('rating', 25),
 25: ('rating', 26),
 26: ('rating', 27),
 27: ('rating', 28),
 28: ('rating', 29),
 29: ('rating', 30),
 30: ('rating', 31),
 31: ('rating', 32),
 32: ('rating', 34),
 33: ('rating', 36),
 34: ('rating', 38),
 35: ('rating', 39),
 36: ('rating', 41),
 37: ('rating', 42),
 38: ('rating', 43),
 39: ('rating', 44),
 40: ('rating', 45),
 41: ('rating', 46),
 42: ('rating', 47),
 43: ('rating', 48),
 44: ('rating', 49),
 45: ('rating', 50),
 46: ('rating', 52),
 47: ('rating', 53),
 48: ('rati