# Recommender System: Collaborative Filtering

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


## 1. User-based collaborative filtering

In [None]:
# Toy watch matrix: one row per user, one column per movie, 1 = watched.
df = pd.DataFrame(
    [
        [1, 1, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 1, 0],
        [1, 1, 0, 0],
    ],
    index=["user0", "user1", "user2", "user3"],
    columns=["movie0", "movie1", "movie2", "movie3"],
)
df

In [None]:
user =2

In [None]:
# Pairwise cosine similarity between the rows of df (one row per user).
sims = cosine_similarity(df)
sims

In [None]:
# Overwrite each user's similarity with themselves (the diagonal) with 0 so
# that a later argmax over a row cannot select the user themselves.
np.fill_diagonal(sims, 0)
sims


In [None]:
# Q. What is the difference between max and argmax?
# (np.argmax returns the position of the largest similarity, i.e. the row
# index of the most similar user; max would return the similarity value.)
themost_similar_user = np.argmax(sims[user])
themost_similar_user

In [None]:
# Recommend the movies the most similar user has watched but `user` has not.
# In the element-wise difference, +1 marks exactly those movies (neighbour=1,
# user=0) while -1 marks movies only `user` has watched. Summing the vector
# would let the -1 entries cancel the +1 entries and wrongly report
# "watched all", so look for +1 entries directly.
movie_diff = df.values[themost_similar_user] - df.values[user]
unwatched_movie_ids = np.flatnonzero(movie_diff == 1)
if unwatched_movie_ids.size == 0:
    print("You have watched all")
else:
    for movie_id in unwatched_movie_ids:
        print("It's time to watch movie"+str(movie_id))

## 2. Item-based collaborative filtering

In [None]:
# Same toy matrix as the user-based example, now with one row per movie and
# one column per user.
df = pd.DataFrame(
    [
        [1, 1, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 1, 0],
        [1, 1, 0, 0],
    ],
    index=["movie0", "movie1", "movie2", "movie3"],
    columns=["user0", "user1", "user2", "user3"],
)
df

In [None]:
movie =2

In [None]:
# Pairwise cosine similarity between the rows of df (one row per movie).
sims = cosine_similarity(df)
sims

In [None]:
# Why set the diagonal to 0? Each diagonal entry is a movie's similarity with
# itself; zeroing it keeps a later argmax over a row from picking that movie.
np.fill_diagonal(sims, 0)

In [None]:
# Row index of the movie with the highest similarity to `movie`.
themost_similar_movie = np.argmax(sims[movie])
themost_similar_movie

## MovieLens 데이터를 활용한 영화 협업 필터링 구현
- https://www.jillcates.com/pydata-workshop/html/tutorial.html

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the ratings and movie metadata CSVs from the local movielens/ folder.
# The cells below read the userId, movieId and rating columns of `ratings`
# and the movieId and title columns of `movies`.
ratings = pd.read_csv('movielens/ratings.csv')
movies = pd.read_csv('movielens/movies.csv')


### EDA

In [None]:
movies.head()

In [None]:
ratings.head()

In [None]:
# Basic size statistics: number of rating rows and of distinct movies/users
# that appear in the ratings table.
n_ratings = len(ratings)
n_movies = ratings['movieId'].nunique()
n_users = ratings['userId'].nunique()

# Printed labels, in order: number of ratings, number of movies, number of
# users, average number of ratings per user, average number of ratings per movie.
print(f"평점의 개수: {n_ratings}")
print(f"영화의 수: {n_movies}")
print(f"사용자의 수: {n_users}")
print(f"사용자당 평균 평점 개수: {round(n_ratings/n_users, 2)}")
print(f"영화당 평균 평점개수: {round(n_ratings/n_movies, 2)}")

In [None]:
sns.countplot(x="rating", data=ratings, palette="viridis")
plt.title("Distribution of movie ratings", fontsize=14)
plt.show()


In [None]:
# Label: "overall mean rating" -- the mean over every rating row.
print(f"전체 평균 평점: {round(ratings['rating'].mean(),2)}.")

# Label: "mean user rating" -- the mean of the per-user mean ratings, so each
# user counts once regardless of how many ratings they gave.
mean_ratings = ratings.groupby('userId')['rating'].mean()
print(f"유저 평균 평점: {round(mean_ratings.mean(),2)}.")


In [None]:
# Average rating per movie (rebinding `mean_ratings`, now indexed by movieId),
# then show the metadata row of the movie with the lowest average.
# NOTE(review): the average does not account for how many ratings a movie
# has, so a movie with very few ratings can end up here.
mean_ratings = ratings.groupby('movieId')[['rating']].mean()
lowest_rated = mean_ratings['rating'].idxmin()
movies[movies['movieId'] == lowest_rated]

In [None]:
highest_rated = mean_ratings['rating'].idxmax()
movies[movies['movieId'] == highest_rated]


### 데이터 Matrix화

In [None]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse user-item matrix from a ratings dataframe.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns:
        A 5-tuple, in this order:
        X: sparse (CSR) matrix of shape (n_users, n_movies) holding the ratings
        user_mapper: dict that maps user id's to user (row) indices
        movie_mapper: dict that maps movie id's to movie (column) indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # np.unique yields the sorted distinct ids and, for every row of df, the
    # position of that row's id in the sorted array. That position is exactly
    # the matrix row/column index, so no per-rating dict lookup is needed.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, item_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    M = len(user_ids)
    N = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix(
        (df["rating"].to_numpy(), (user_index, item_index)),
        shape=(M, N),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the user-item matrix and the id <-> index lookup tables from `ratings`.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [None]:
X.shape

In [None]:
X.todense()

In [None]:
# Ratio of stored ratings to the total number of user-movie cells.
n_total = X.shape[0]*X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings/n_total
# How sparse is the matrix?
# NOTE(review): the value printed is the filled fraction (stored entries /
# all cells), so a small percentage means a very sparse matrix.
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

In [None]:
# Number of stored ratings in each row of X (one row per user).
n_ratings_per_user = X.getnnz(axis=1)

# Labels: "most active user rated ... movies" / "least active user rated ... movies".
print(f"가장 많이 평가한 유저 {n_ratings_per_user.max()}개의 영화.")
print(f"가장 적게 평가한 유저 {n_ratings_per_user.min()}개의 영화.")

In [None]:
# Number of stored ratings in each column of X (one column per movie).
n_ratings_per_movie = X.getnnz(axis=0)

# Labels: "most rated movie has ... ratings" / "least rated movie has ... ratings".
print(f"가장 많이 평가된 영화는 {n_ratings_per_movie.max()}개의 평가가 있다.")
print(f"가장 적게 평가된 영화는 {n_ratings_per_movie.min()}개의 평가가 있다.")

In [None]:
# Side-by-side density plots of the number of ratings per user (left) and
# per movie (right).
plt.figure(figsize=(16,4))
plt.subplot(1,2,1)
# `fill=True` is the replacement for kdeplot's deprecated `shade=True`.
sns.kdeplot(n_ratings_per_user, fill=True)
plt.xlim(0)
plt.title("Number of Ratings Per User", fontsize=14)
plt.xlabel("number of ratings per user")
plt.ylabel("density")
plt.subplot(1,2,2)
sns.kdeplot(n_ratings_per_movie, fill=True)
plt.xlim(0)
plt.title("Number of Ratings Per Movie", fontsize=14)
plt.xlabel("number of ratings per movie")
plt.ylabel("density")
plt.show()

In [None]:
# Per-movie mean rating: each column's rating sum divided by the number of
# stored ratings in that column, so cells without a rating are not averaged in.
sum_ratings_per_movie = X.sum(axis=0)
mean_rating_per_movie = sum_ratings_per_movie/n_ratings_per_movie

In [None]:
# Repeat the per-movie mean row once per user (X.shape[0] times) so it can be
# subtracted from X element-wise.
X_mean_movie = np.tile(mean_rating_per_movie, (X.shape[0],1))


In [None]:
X_mean_movie.shape


## 데이터 Scaling

In [None]:
# 평점 0 즉 평점을 입력하지 않은 interaction에 대해 음수 값을 적용하기 위해!
X_norm = X - csr_matrix(X_mean_movie)


In [None]:
print("Original X:", X[0].todense())
print("Normalized X:", X_norm[0].todense())


## K-Nearest Neighbour를 활용한 가장 가까운 영화 추천
- 여기서는 분류를 하는게 아니라, 가장 가까운 K개의 이웃들을 찾는 과정

![img](https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/KnnClassification.svg/1024px-KnnClassification.svg.png)

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    """
    Finds k-nearest neighbours for a given movie id.

    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix (rows are users, columns are movies)
        movie_mapper: dict that maps movie id's to column indices of X
        movie_inv_mapper: dict that maps column indices of X to movie id's
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations

    Output: returns list of k similar movie ID's, nearest first, never
        containing movie_id itself. Fewer than k are returned only when X
        holds fewer than k other movies.

    Raises:
        KeyError: if movie_id is not a key of movie_mapper
    """
    # Transpose so that each row is one movie's vector of ratings by all users.
    X = X.T

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]

    # A dense 1-D row must become a (1, n_users) query matrix for kneighbors.
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)

    # Ask for k+1 neighbours because the result normally includes the query
    # movie itself; never ask for more rows than the fitted matrix has.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    # Fit on every movie vector, then query with the movie of interest.
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)

    # Drop the query movie by index rather than by position: with tied
    # distances it is not guaranteed to be the first result.
    candidates = [n for n in neighbour.ravel().tolist() if n != movie_ind]
    return [movie_inv_mapper[n] for n in candidates[:k]]

## 검증

In [None]:
# Smoke test: neighbours of movieId 1 in X_norm with the default cosine metric.
similar_movies = find_similar_movies(1, X_norm, movie_mapper, movie_inv_mapper, k=10)
similar_movies

In [None]:
# Lookup table movieId -> title so neighbours can be printed by name.
movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 59315

# Neighbours of movie_id in the mean-subtracted matrix, using cosine distance.
similar_movies = find_similar_movies(movie_id, X_norm, movie_mapper, movie_inv_mapper, metric='cosine', k=10)
movie_title = movie_titles[movie_id]

print(f"Because you watched [{movie_title}]:")
for i in similar_movies:
    print(movie_titles[i])

In [None]:
movie_id = 59315

# Same query as the cosine cell, but with Euclidean distance for comparison.
# Relies on `movie_titles` built in an earlier cell.
similar_movies = find_similar_movies(movie_id, X_norm, movie_mapper, movie_inv_mapper, metric='euclidean', k=10)
movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}:")
for i in similar_movies:
    print(movie_titles[i])