In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from numpy import dot
from numpy.linalg import norm

In [2]:
#匯入user.movie互動資料
rating = pd.read_csv("Movielens\\user_movie.dat",sep = "\t",names=["userId","movieId","rating","timestamp"])
print("互動資料:")
rating

互動資料:


Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
#匯入電影+類型
movie = pd.read_csv("Movielens\\movie_genre.dat",sep = "\t",names=["movieId","genre"])
print("電影資料:")
movie

電影資料:


Unnamed: 0,movieId,genre
0,1,3
1,1,4
2,1,5
3,2,1
4,2,2
...,...,...
2886,1679,16
2887,1680,8
2888,1680,14
2889,1681,5


In [4]:
#user infor
age = pd.read_csv("Movielens\\user_age.dat",sep = "\t",names=["userId","age"])
occu = pd.read_csv("Movielens\\user_occupation.dat",sep = "\t",names=["userId","occupation"])
user = pd.merge(age[["userId","age"]], occu[["userId","occupation"]], on='userId')
user

Unnamed: 0,userId,age,occupation
0,1,3,1
1,2,6,2
2,3,3,3
3,4,3,1
4,5,4,2
...,...,...,...
938,939,3,6
939,940,4,5
940,941,3,6
941,942,5,12


In [5]:
#data filtering (沒有資料需要刪除)
for number in range(1,944):
    filt = (rating['userId'] == number)
    if len(rating.loc[filt])<3:
        print(number,"/",len(rating.loc[filt]))
        

In [6]:
total1 = pd.merge(rating[["userId","movieId","rating"]], movie[["movieId","genre"]], on='movieId')
total = pd.merge(total1[["userId","movieId","rating","genre"]], user[["userId","age","occupation"]], on='userId')
print("user+movie資訊合併:")
total

user+movie資訊合併:


Unnamed: 0,userId,movieId,rating,genre,age,occupation
0,196,242,3,5,5,3
1,196,257,2,1,5,3
2,196,257,2,2,5,3
3,196,257,2,5,5,3
4,196,257,2,15,5,3
...,...,...,...,...,...,...
212580,873,358,2,2,5,5
212581,873,358,2,15,5,5
212582,873,358,2,16,5,5
212583,873,342,4,5,5,5


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
#分割資料集
train_data, test_data = train_test_split(total, random_state=None, train_size=0.8)
print("Train data:")
train_data

Train data:


Unnamed: 0,userId,movieId,rating,genre,age,occupation
153360,308,402,4,5,7,19
72159,535,608,4,14,5,8
115278,353,313,5,1,3,9
29856,54,595,3,16,3,4
206209,242,305,5,8,4,8
...,...,...,...,...,...,...
80143,806,228,4,1,3,16
94683,256,323,5,16,4,17
21144,18,52,5,8,4,2
116052,595,1134,5,8,3,11


In [9]:
print("Test data:")
test_data

Test data:


Unnamed: 0,userId,movieId,rating,genre,age,occupation
79852,880,771,3,15,2,6
119958,215,517,5,14,4,11
55661,543,237,4,8,4,9
205593,754,273,3,1,6,12
30326,62,1133,4,2,3,5
...,...,...,...,...,...,...
184758,661,222,3,2,3,11
99426,437,476,4,5,3,2
11394,416,720,4,1,3,6
88006,5,90,3,5,4,2


In [10]:
#定義模型(Item-based "cosine")
def find_common_users(movie1,movie2,df):
    """Find users that both movies have watched"""
    s1 = set((df.loc[df["movieId"]==movie1,"userId"].values))
#     print(s1)
    s2 = set((df.loc[df["movieId"]==movie2,"userId"].values))
#     print(s2)
#     print(s1.intersection(s2))
    return s1.intersection(s2)


def cal_similarity_for_user_ratings(movie1,movie2,users_id,df,method):
    """Calculate the similarity for user ratings between movie1 and movie2"""
    u1 = df[df["movieId"]==movie1]
    u2 = df[df["movieId"]==movie2]
       
    vec1 = u1[u1.userId.isin(users_id)].sort_values(by="userId")["rating"].values

    vec2 = u2[u2.userId.isin(users_id)].sort_values(by="userId")["rating"].values

    i=1
    if len(vec1)>=len(vec2):
        for i in range(len(vec1)-len(vec2)):
            vec2= np.append(vec2,0)
            i=i+1
    else:
        for i in range(len(vec2)-len(vec1)):
            vec1= np.append(vec1,0)
            i=i+1

    if method=="cosine":        
        return dot(vec1, vec2)/(norm(vec1)*norm(vec2))

    return None

def find_the_most_similar_movies(movie, num, df,method):
    # Calculate the similarity between the movie and other movies
    similarities = []
    movie_ids = []
    for other_movie in df.movieId.unique():
        if other_movie == movie:
            continue
        
        common_users = find_common_users(movie,other_movie,df)
        if len(common_users)<10:
            sim = 0
        else:
            sim = cal_similarity_for_user_ratings(movie,other_movie,common_users,df,method)
        
        similarities.append(sim)
        movie_ids.append(other_movie)
#     print(similarities)
            
    # Find top n similar movies
    similarities,movie_ids = np.array(similarities),np.array(movie_ids)
    sorted_index = (np.argsort(similarities)[::-1][:num]).tolist()
    most_similar_movies = movie_ids[sorted_index]
    most_similar_movies_rat = similarities[sorted_index]
    return most_similar_movies,similarities


In [11]:
RMSEcu=[]
from sklearn.metrics import mean_squared_error
import math

In [12]:
#cosine

In [21]:
for number in range(1,21):
    movie = number
    num = 50
    similar_movies = find_the_most_similar_movies(movie,num,train_data,'cosine')
    print(f"movie: {movie}")
    print(f"The most similar movies: {similar_movies[0]}")
    pred = find_the_most_similar_movies(movie,num,test_data,'cosine')

    
    vec1=similar_movies[1]
    vec2=pred[1]
    if len(vec1)>=len(vec2):
        for i in range(len(vec1)-len(vec2)):
            vec2= np.append(vec2,0)
            i=i+1
    else:
        for i in range(len(vec2)-len(vec1)):
            vec1= np.append(vec1,0)
            i=i+1
            
 
            
    MSE = mean_squared_error(vec1,vec2)
    RMSE = math.sqrt(MSE)
    RMSEcu=np.append(RMSEcu,RMSE)


movie: 1
The most similar movies: [  28  526  399  408  651  187  207  142   99  332  588    8  313  195
  228  510   31  786  731   96  170  483   82   69  185   92    4  222
   72  515  936  566  127  649 1208  327 1037   73  422  435  751   22
  100  654   71  432  311  239  778  642]
movie: 2
The most similar movies: [ 483  596  489  549   22  879  394  422 1219  298  653 1016  930  356
   71  380   28  418  588   80  151  435  655  118  222  117  100  186
 1478  399   31  313  214  230  692  651    4   24  515   92  420   97
  501  332   62  408   73 1139  207  239]
movie: 3
The most similar movies: [ 952  116   59  317  975  447  425  708  137  193  864  729  845  448
  428  496  710  746  527  735  168  196 1011 1009 1073  179  443   87
   86  792  640 1039  203  470  340  282  518  134  192  272  959  401
  154  290  357  427  663  249   67   42]
movie: 4
The most similar movies: [1115   92  483  408  420  654  651   99  195  549   28    8  185  588
   73  653  356  566  670  4

In [22]:
Final=0
print(RMSEcu)
# RMSEcu[0]

[0.47901596 0.44915746 0.37916693 0.48825426 0.43130148 0.47901596
 0.44915746 0.37916693 0.48825426 0.43130148 0.08225772 0.53946228
 0.48131149 0.53349347 0.46262468 0.47901596 0.44915746 0.37916693
 0.48825426 0.43130148 0.08225772 0.53946228 0.48131149 0.53349347
 0.46262468 0.53311778 0.54541061 0.49127151 0.50985858 0.5044743
 0.28665457 0.35092309 0.         0.31361948 0.41869579]


In [23]:
for number in range(0,20):
    Final= Final+RMSEcu[number]
#     print(Final)
    
Final=Final/20   
print("Root Mean Square Error:", Final)

Root Mean Square Error: 0.43899189497844293


In [56]:
#推薦movies
def recommend(df,movie,similar_movies ,top_n):
    
    seen_users = np.unique(df.loc[df["movieId"]==movie,"userId"].values)
    Seen_users = df["movieId"].isin(seen_users)
    Seen_users_ratings = df[Seen_users][["movieId","rating"]]
    
    # Find the movies the similar users have seen.
    similar_cond = df["movieId"].isin(similar_movies)
    seen_movies_ratings = df[similar_cond][["movieId","rating"]]
#     print(seen_movies_ratings)

    # Find average ratings by the most similar users
    average_ratings = seen_movies_ratings.groupby("movieId").mean()
    average_ratings.reset_index(inplace=True)
    top_ratings = average_ratings.sort_values(by="rating",ascending=False).iloc[:top_n]
    top_ratings.reset_index(inplace=True,drop=True)
    
    Top_ratings = pd.merge(top_ratings[["movieId","rating"]], seen_movies_ratings[["movieId","rating"]], on='movieId')
    
    return Top_ratings

In [57]:
top_n = 10
top_ratings = recommend(train_data,movie,similar_movies[0], top_n)
print(f"Top-{top_n} average ratings by the most similar movies:")
top_ratings

Top-10 average ratings by the most similar movies:


Unnamed: 0,movieId,rating_x,rating_y
0,98,4.298246,5
1,98,4.298246,5
2,98,4.298246,4
3,98,4.298246,5
4,98,4.298246,4
...,...,...,...
3166,647,4.132743,5
3167,647,4.132743,4
3168,647,4.132743,5
3169,647,4.132743,3


In [58]:
pred_top_ratings=top_ratings[["rating_x","rating_y"]]

In [60]:
pred_top_ratings.sort_values(by='rating_x').tail(30)

Unnamed: 0,rating_x,rating_y
403,4.298246,4
401,4.298246,4
400,4.298246,4
399,4.298246,4
398,4.298246,1
397,4.298246,5
396,4.298246,4
395,4.298246,5
402,4.298246,4
393,4.298246,5


In [50]:
# sort the dataframe
pred_top_ratings.sort_values(by='rating_x',ascending=False,inplace=True)
# remove rows with missing values
pred_top_ratings.dropna(inplace=True)
# convert ratings to binary labels
threshold = 3.5
pred_top_ratings = pred_top_ratings >= threshold
# view results
pred_top_ratings.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,rating_x,rating_y
0,True,True
423,True,True
416,True,True
417,True,True
418,True,True
419,True,True
420,True,True
421,True,True
422,True,True
424,True,True


In [53]:
def recall_at_k(df: pd.DataFrame, k, y_test: str='rating_y', y_pred: str='rating_x') -> float:
    """
    Function to compute recall@k for an input boolean dataframe
    
    Inputs:
        df     -> pandas dataframe containing boolean columns y_test & y_pred
        k      -> integer number of items to consider
        y_test -> string name of column containing actual user input
        y-pred -> string name of column containing recommendation output
        
    Output:
        Floating-point number of recall value for k items
    """
    # check we have a valid entry for k
    if k <= 0:
        raise ValueError('Value of k should be greater than 1, read in as: {}'.format(k))
    # check y_test & y_pred columns are in df
    if y_test not in df.columns:
        raise ValueError('Input dataframe does not have a column named: {}'.format(y_test))
    if y_pred not in df.columns:
        raise ValueError('Input dataframe does not have a column named: {}'.format(y_pred))
        
    # extract the k rows
    dfK = df.head(k)
    # compute number of all relevant items
    denominator = df[y_test].sum()
    # compute number of recommended items that are relevant @k
    numerator = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
    # return result
    if denominator > 0:
        return numerator/denominator
    else:
        return None

In [54]:
k = 10
print('Recall@k: {:.2f} for k={}'.format(recall_at_k(pred_top_ratings,k),k))

Recall@k: 0.00 for k=10


In [61]:
from sklearn.metrics import ndcg_score

In [62]:
C=[[4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982,4.2982]]
D=[[5,4,4,4,5,4,4,5,4,5,1,5,1,5,4,4,4,5,5,5,5,4,5,4,5,1,4,4,4,4]]

In [63]:
ndcg_score(C,D, k=10)

1.0000000000000002