In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from numpy import dot
from numpy.linalg import norm

In [2]:
#匯入user.movie互動資料
rating = pd.read_csv("Movielens\\user_movie.dat",sep = "\t",names=["userId","movieId","rating","timestamp"])
print("互動資料:")
rating

互動資料:


Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
#匯入電影+類型
movie = pd.read_csv("Movielens\\movie_genre.dat",sep = "\t",names=["movieId","genre"])
print("電影資料:")
movie

電影資料:


Unnamed: 0,movieId,genre
0,1,3
1,1,4
2,1,5
3,2,1
4,2,2
...,...,...
2886,1679,16
2887,1680,8
2888,1680,14
2889,1681,5


In [4]:
#user infor
age = pd.read_csv("Movielens\\user_age.dat",sep = "\t",names=["userId","age"])
occu = pd.read_csv("Movielens\\user_occupation.dat",sep = "\t",names=["userId","occupation"])
user = pd.merge(age[["userId","age"]], occu[["userId","occupation"]], on='userId')
user

Unnamed: 0,userId,age,occupation
0,1,3,1
1,2,6,2
2,3,3,3
3,4,3,1
4,5,4,2
...,...,...,...
938,939,3,6
939,940,4,5
940,941,3,6
941,942,5,12


In [5]:
#data filtering (沒有資料需要刪除)
for number in range(1,944):
    filt = (rating['userId'] == number)
    if len(rating.loc[filt])<3:
        print(number,"/",len(rating.loc[filt]))
        

In [6]:
total1 = pd.merge(rating[["userId","movieId","rating"]], movie[["movieId","genre"]], on='movieId')
total = pd.merge(total1[["userId","movieId","rating","genre"]], user[["userId","age","occupation"]], on='userId')
print("user+movie資訊合併:")
total

user+movie資訊合併:


Unnamed: 0,userId,movieId,rating,genre,age,occupation
0,196,242,3,5,5,3
1,196,257,2,1,5,3
2,196,257,2,2,5,3
3,196,257,2,5,5,3
4,196,257,2,15,5,3
...,...,...,...,...,...,...
212580,873,358,2,2,5,5
212581,873,358,2,15,5,5
212582,873,358,2,16,5,5
212583,873,342,4,5,5,5


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
#分割資料集
train_data, test_data = train_test_split(total, random_state=None, train_size=0.8)
print("Train data:")
train_data

Train data:


Unnamed: 0,userId,movieId,rating,genre,age,occupation
200453,75,508,4,8,3,10
14287,207,787,3,5,4,16
70310,896,801,2,5,3,3
80678,846,209,4,8,3,7
148742,650,176,4,16,5,15
...,...,...,...,...,...,...
179722,361,655,3,8,3,6
145123,694,229,4,2,7,11
16753,6,508,3,8,5,4
2490,354,50,4,1,3,12


In [9]:
print("Test data:")
test_data

Test data:


Unnamed: 0,userId,movieId,rating,genre,age,occupation
16260,249,403,4,1,3,6
137607,840,14,5,14,4,14
178604,262,423,4,15,2,6
11687,416,738,2,5,3,6
109264,922,432,5,3,3,5
...,...,...,...,...,...,...
132846,387,659,4,5,4,10
94451,152,71,5,12,4,8
7103,144,1142,5,7,6,11
19279,753,98,5,16,6,20


In [10]:
print(len(train_data), len(test_data))

170068 42517


In [11]:
#定義模型(User-based "cosine")
def find_common_movies(user1,user2,df):
    """Find movies that both users have watched"""
    s1 = set((df.loc[df["userId"]==user1,"movieId"].values))
#     print(s1)
    s2 = set((df.loc[df["userId"]==user2,"movieId"].values))
#     print(s2)
#     print(s1.intersection(s2))
    return s1.intersection(s2)


def cal_similarity_for_movie_ratings(user1,user2,movies_id,df,method):
    """Calculate the similarity for movie ratings between user1 and user2"""
    u1 = df[df["userId"]==user1]
    u2 = df[df["userId"]==user2]
       
    vec1 = u1[u1.movieId.isin(movies_id)].sort_values(by="movieId")["rating"].values

    vec2 = u2[u2.movieId.isin(movies_id)].sort_values(by="movieId")["rating"].values

    i=1
    if len(vec1)>=len(vec2):
        for i in range(len(vec1)-len(vec2)):
            vec2= np.append(vec2,0)
            i=i+1
    else:
        for i in range(len(vec2)-len(vec1)):
            vec1= np.append(vec1,0)
            i=i+1

    if method=="cosine":        
        return dot(vec1, vec2)/(norm(vec1)*norm(vec2))
    elif method =="PCC":
        return np.corrcoef(vec1, vec2)
    return None

def find_the_most_similar_users(user, num, df,method):
    # Calculate the similarity between the user and other users
    similarities = []
    user_ids = []
    for other_user in df.userId.unique():
        if other_user == user:
            continue
        
        common_movies = find_common_movies(user,other_user,df)
        if len(common_movies)<10:
            sim = 0
        else:
            sim = cal_similarity_for_movie_ratings(user,other_user,common_movies,df,method)
        
        similarities.append(sim)
        user_ids.append(other_user)
#     print(similarities)
            
    # Find top n similar users
    similarities,user_ids = np.array(similarities),np.array(user_ids)
    sorted_index = (np.argsort(similarities)[::-1][:num]).tolist()
    most_similar_users = user_ids[sorted_index]
    most_similar_users_rat = similarities[sorted_index]
#     print(most_similar_users_rat)
    return most_similar_users,similarities



In [12]:
RMSEcu=[]
from sklearn.metrics import mean_squared_error
import math

In [13]:
#cosine

In [18]:
for number in range(1,21):
    user = number
    num = 50
    similar_users = find_the_most_similar_users(user,num,train_data,'cosine')
    print(f"user: {user}")
    print(f"The most similar users: {similar_users[0]}")
    pred = find_the_most_similar_users(user,num,test_data,'cosine')
#     print(f"pred_user: {user}")
#     print(f"The most similar users: {pred[0]}")

    MSE = mean_squared_error(similar_users[1],pred[1])
    RMSE = math.sqrt(MSE)
    RMSEcu=np.append(RMSEcu,RMSE)

user: 1
The most similar users: [785 118 583 433 165 767 665 468 135 645 629 138 257 829 565 849 419  96
 700 275 680 117 703   8 530 861 748 292 579 686 473 259 237 122 522 844
  51 411 210  60 838  69  10 664 738 334 694 860 121 251]
user: 2
The most similar users: [892 540 184  18 534 621 332 216 291 285 582 360 864 664 803 916 379  81
 906 674 484 599 330 152 465 323 550 693 402 345 111 482 665 915   7 324
 605 121 891 298 574 878 673 249 554 629  74 213 934 838]
user: 3
The most similar users: [145 791  91 387 880 864 435 299 332 422 282 588 902 533 276  33 608 750
 788 816 273 130 697  43 144 907 143 317 173 724 323 507 832 328 294 875
 551 155 801   4 757 249 579 466 113 784 177 775  54 197]
user: 4
The most similar users: [616 710  43 548 429 749 332 345 486 459 363 668 145 334 464 268 294 328
 889 750 130 919 276 116 250 666 624 853 177 435 854 102 587 178 655 297
 871 119 327 592 529 663 683 234 758 222 416 717   7 488]
user: 5
The most similar users: [138 689 867 584  14 530

In [19]:
Final=0
print(RMSEcu)
# RMSEcu[0]

[0.66461121 0.67780539 0.44440614 0.2836182  0.6163183  0.67690943
 0.67580581 0.60573152 0.17208977 0.70807809 0.66461121 0.67780539
 0.44440614 0.2836182  0.6163183  0.67690943 0.67580581 0.60573152
 0.17208977 0.70807809 0.71131489 0.62599374 0.65128603 0.70922673
 0.72444398 0.64438936 0.42265531 0.66707268 0.28471846 0.60455563]


In [20]:
for number in range(0,20):
    Final= Final+RMSEcu[number]
#     print(Final)
    
Final=Final/20   
print("Root Mean Square Error:", Final)

Root Mean Square Error: 0.5525373871846707


In [None]:
#如果推user沒看過的 沒辦法知道actual rating
#所以設計成 用similar user來推movies看可以推中user原本看過的movie可以達到多好效果

In [21]:
#推薦movies
def recommend(df,user,similar_users ,top_n):
    # Find the movies the user hasn't seen and the similar users have seen.
    seen_movies = np.unique(df.loc[df["userId"]==user,"movieId"].values)
    Seen_movies = df["userId"].isin(seen_movies)
    Seen_movies_ratings = df[Seen_movies][["movieId","rating"]]
#     print(Seen_movies_ratings)
#     not_seen_cond = df["movieId"].isin(seen_movies)==False
#     similar_cond = df["userId"].isin(similar_users)
#     not_seen_movies_ratings = df[not_seen_cond & similar_cond][["movieId","rating"]]
    
    # Find the movies the similar users have seen.
    similar_cond = df["userId"].isin(similar_users)
    seen_movies_ratings = df[similar_cond][["movieId","rating"]]
#     print(seen_movies_ratings)

    # Find average ratings by the most similar users
    average_ratings = seen_movies_ratings.groupby("movieId").mean()
    average_ratings.reset_index(inplace=True)
    top_ratings = average_ratings.sort_values(by="rating",ascending=False).iloc[:top_n]
    top_ratings.reset_index(inplace=True,drop=True)
    
    Top_ratings = pd.merge(top_ratings[["movieId","rating"]], seen_movies_ratings[["movieId","rating"]], on='movieId')
    
    return Top_ratings

In [28]:
top_n = 10
top_ratings = recommend(train_data,user,similar_users[0], top_n)
print(f"Top-{top_n} average ratings by the most similar users:")
print(pd.merge(top_ratings, movie["movieId"], on='movieId'))
top_ratings

Top-10 average ratings by the most similar users:
    movieId  rating_x  rating_y
0       718       5.0         5
1      1368       5.0         5
2      1008       5.0         5
3       716       5.0         5
4       716       5.0         5
5       716       5.0         5
6       716       5.0         5
7      1377       5.0         5
8      1377       5.0         5
9      1377       5.0         5
10     1377       5.0         5
11     1160       5.0         5
12     1160       5.0         5
13     1160       5.0         5
14     1160       5.0         5
15      524       5.0         5
16      736       5.0         5
17      736       5.0         5
18      736       5.0         5
19      736       5.0         5
20     1134       5.0         5
21      753       5.0         5


Unnamed: 0,movieId,rating_x,rating_y
0,718,5.0,5
1,1368,5.0,5
2,1008,5.0,5
3,716,5.0,5
4,716,5.0,5
5,1377,5.0,5
6,1377,5.0,5
7,1160,5.0,5
8,1160,5.0,5
9,524,5.0,5


In [33]:
pred_top_ratings=top_ratings[["rating_x","rating_y"]]
print(pred_top_ratings)

true_relevance = np.full((1,21), 5)
scores = np.full((1,21), 5.0)
print(scores)

    rating_x  rating_y
0        5.0         5
1        5.0         5
2        5.0         5
3        5.0         5
4        5.0         5
5        5.0         5
6        5.0         5
7        5.0         5
8        5.0         5
9        5.0         5
10       5.0         5
11       5.0         5
12       5.0         5
13       5.0         5
[[5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]]


In [34]:
# sort the dataframe
pred_top_ratings.sort_values(by='rating_x',ascending=False,inplace=True)
# remove rows with missing values
pred_top_ratings.dropna(inplace=True)
# convert ratings to binary labels
threshold = 3.5
pred_top_ratings = pred_top_ratings >= threshold
# view results
pred_top_ratings.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,rating_x,rating_y
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True


In [35]:
def recall_at_k(df: pd.DataFrame, k: int=10, y_test: str='rating_y', y_pred: str='rating_x') -> float:
    """
    Function to compute recall@k for an input boolean dataframe
    
    Inputs:
        df     -> pandas dataframe containing boolean columns y_test & y_pred
        k      -> integer number of items to consider
        y_test -> string name of column containing actual user input
        y-pred -> string name of column containing recommendation output
        
    Output:
        Floating-point number of recall value for k items
    """
    # check we have a valid entry for k
    if k <= 0:
        raise ValueError('Value of k should be greater than 1, read in as: {}'.format(k))
    # check y_test & y_pred columns are in df
    if y_test not in df.columns:
        raise ValueError('Input dataframe does not have a column named: {}'.format(y_test))
    if y_pred not in df.columns:
        raise ValueError('Input dataframe does not have a column named: {}'.format(y_pred))
        
    # extract the k rows
    dfK = df.head(k)
    # compute number of all relevant items
    denominator = df[y_test].sum()
    # compute number of recommended items that are relevant @k
    numerator = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
    # return result
    if denominator > 0:
        return numerator/denominator
    else:
        return None

In [36]:
k = 10
print('Recall@k: {:.2f} for k={}'.format(recall_at_k(pred_top_ratings,k),k))

Recall@k: 0.71 for k=10


In [57]:
import numpy as np
from sklearn.metrics import ndcg_score

In [67]:
ndcg_score(true_relevance, scores, k=10)

0.9999999999999999