In [3]:
! pip install surprise



In [4]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [5]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
cd ./drive/MyDrive/Colab\ Notebooks

/content/drive/MyDrive/Colab Notebooks


In [28]:
df_ratings = pd.read_csv("./data/others/ratings.csv")

In [29]:
n_users = df_ratings.userId.unique().shape[0]
n_items = df_ratings.movieId.unique().shape[0]
print(f"num of users : {n_users} / num of movies : {n_items}")

num of users : 610 / num of movies : 9724


In [30]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [32]:
df_ratings = df_ratings[['userId', 'movieId', 'rating']]

In [31]:
df_movies = pd.read_csv('./data/others/movies.csv')

df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
### Add Your Own Data ### 

###################################### Example 1#################################################
# User 800 is a HUGE fan of Musical Movies
rows = []                               # row = [user_id, movie_id, rating]
user_id = 800
rows.append([user_id, 73, 5])        # movie    73: Miserables, Les (1995)
rows.append([user_id, 107780, 5])     # movie  107780: Cats(1998) 
rows.append([user_id, 588, 5])     # movie  588: Aladin(1992)
rows.append([user_id, 60397, 5])    # movie 69397: Mamma Mia!(2008)
rows.append([user_id, 99149, 5])    # movie 99149: Miserables, Les (2012)
rows.append([user_id, 138186, 1])    # movie 138186: Sorrow(2015)
rows.append([user_id, 1997, 1])    # movie 1997: Scream 2 (1991)

##################################################################################################

###################################### Example 2#################################################
# User 900 is a HUGE fan of Animation Movies
rows = []                               # row = [user_id, movie_id, rating]
user_id = 900
rows.append([user_id, 1022, 5])        # movie    1022: Cinderella(1950)
rows.append([user_id, 594, 5])     # movie  594: Snow White and the Seven Dwarfs(1937) 
rows.append([user_id, 106696, 5])     # movie  106696: Frozen(2013)
rows.append([user_id, 166461, 5])    # movie 166461: Moana(2016)
rows.append([user_id, 595, 5])    # movie 595: Beauty and the Beast (1991)
rows.append([user_id, 138168, 1])    # movie 138168: Sorrow(2015)
rows.append([user_id, 1997, 1])    # movie 1997: Scream 2 (1991)

##################################################################################################


########################### Add Your Own Ratings using 'movie.csv' data #########################
# my_rows = []
# my_id = 2021
# rows.append([user_id, ,])       # Fill your movie id and rating     
# rows.append([user_id, ,])       # 여러분이 평가할 영화의 id와 점수를 입력하세요.
# rows.append([user_id, ,])
# rows.append([user_id, ,])
# rows.append([user_id, ,])

##################################################################################################
for row in rows:
    df_ratings = df_ratings.append(pd.Series(row, index=df_ratings.columns), ignore_index=True)
print(df_ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100838     900   106696     5.0
100839     900   166461     5.0
100840     900      595     5.0
100841     900   138168     1.0
100842     900     1997     1.0

[100843 rows x 3 columns]


In [35]:
movie_set = set()     
ratings = np.zeros((n_users, n_items))
for (_, movie_id, _) in df_ratings.itertuples(index=False):
    movie_set.add(movie_id)

movie_id_to_name=dict()
movie_id_to_genre=dict()

for (movie_id, movie_name, movie_genre) in df_movies.itertuples(index=False):
    if movie_id not in movie_set:              # 어떤 영화가 rating data에 없는 경우 skip
        continue
    movie_id_to_name[movie_id] = movie_name 
    movie_id_to_genre[movie_id] = movie_genre

In [36]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader=reader)
train, test = train_test_split(data, test_size = 0.2, shuffle = True)

In [37]:
print(type(data))
print(type(train))

iterator = train.all_ratings()
train_df = pd.DataFrame(columns=['userId', 'movieId', 'rating'])

i=0
for uid, iid, rating in iterator:
  train_df.loc[i] = [train.to_raw_uid(int(uid)), train.to_raw_iid(int(iid)), rating]
  i += 1

train_data = Dataset.load_from_df(train_df, reader = reader)

print(type(data))
print(type(train))

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>
<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


In [38]:
param_grid = {'n_factors':[50, 100, 150, 200]}
grid = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv = 4)
grid.fit(train)

print(grid.best_score['rmse'])
print(grid.best_params['rmse'])

AttributeError: ignored

In [None]:
print(grid.best_params)

algorithm = SVD(grid.best_params['rmse']['n_factors'])
algorithm.fit(train)

In [None]:
prediction = algorithm.test(test)

for p in prediction[:10]:
  print(p)

In [None]:
#### 특정 user, 특정 item에 대한 prediction 값 ###
uid = 800
iid = 8368
prediction_user_item = algorithm.predict(uid, iid)
print(prediction_user_item)     

In [None]:
##############################################################
##### 해당 user가 아직 보지 않은 영화를 return해주는 함수#####
##############################################################
def get_unseen_movies(data, user_id):

    watched_movies = set()
    total_movies = set()
    ########### Fill in Your Code #################
    for (uid, iid, rating) in data.all_ratings():
        
        total_movies.add(iid)
        if uid == user_id:
            watched_movies.add(iid)
    
    unseen_movies = total_movies - watched_movies
    ##################################################
    return unseen_movies
    # return total_movies

In [None]:
################################################################################
############# 특정 user에게 top k개의 영상을 추천해주는 함수 ###################
################################################################################
def recommend(train, algorithm, user_id, top_k=10):
    ################ Fill in Your Code ########################################
    unseen_movies = get_unseen_movies(train, user_id)
    prediction = [algorithm.predict(user_id, movie_id) for movie_id in unseen_movies]

    prediction.sort(key=lambda x:x.est, reverse=True)  

    ###########################################################################
    for _, movie, _, pred, _ in prediction[:top_k]:
        print("movid id: {}, movie genre: {},predicted rating: {}".format(movie_id_to_name[movie], movie_id_to_genre[movie], pred))


In [None]:
#########################################
####### 800번 유저의 추천 결과 ##########
#########################################

recommend(train, algorithm, user_id=800, top_k=20)
