# Movie Recommendation

## Library

In [1]:
import pandas as pd
import os
# Matrix Factorization
from implicit.als import AlternatingLeastSquares
import os
import numpy as np
from scipy.sparse import csr_matrix 

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')
ratings

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [4]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [5]:
ratings

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Analysis
- ratings에 있는 영화 개수
- rating에 있는 사용자 수
- 가장 인기있는 영화 30개

In [7]:
# 영화 갯수: 3628개
ratings['movie_id'].nunique()

3628

In [8]:
# 사용자 수: 6039명
ratings.user_id.nunique()

6039

In [9]:
# 가장 인기있는 영화 30개
top30 = ratings.groupby('movie_id')['count'].count().sort_values(ascending=False).head(30)
# title match
top30 = top30.reset_index()
top30.columns = ['movie_id','counts']
# 이름 확인하려 했는데 안나옴.
top30.merge(movies.iloc[:,0:2],on='movie_id',how='left')

Unnamed: 0,movie_id,counts,title
0,2858,3211,American Beauty (1999)
1,260,2910,Star Wars: Episode IV - A New Hope (1977)
2,1196,2885,Star Wars: Episode V - The Empire Strikes Back...
3,1210,2716,Star Wars: Episode VI - Return of the Jedi (1983)
4,2028,2561,Saving Private Ryan (1998)
5,589,2509,Terminator 2: Judgment Day (1991)
6,593,2498,"Silence of the Lambs, The (1991)"
7,1198,2473,Raiders of the Lost Ark (1981)
8,1270,2460,Back to the Future (1985)
9,2571,2434,"Matrix, The (1999)"


In [10]:
# user_id 별로 몇번 영화 봤는지 확인(평균 138.5번), 
user_count = ratings.groupby('user_id')['movie_id'].count()
print(user_count.describe())

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64


### 선호하는 영화 5가지 rating에 추가

In [11]:
my_favorite = [318,1,2,260,586]
# user_id	movie_id	count	timestamp
my_movie = pd.DataFrame({'user_id':[10000]*5,
                         'movie_id':my_favorite,
                         'count':[5,5,5,5,5],
                        'timestamp':[978301760,978300100,978300670,978300710,978300780]})

if not ratings.isin({'user_id':[10000]})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    ratings = ratings.append(my_movie)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,10000,318,5,978301760
1,10000,1,5,978300100
2,10000,2,5,978300670
3,10000,260,5,978300710
4,10000,586,5,978300780


In [12]:
# Indexing
# 고유한 유저, 영화를 찾아내는 코드
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [13]:
# check the indexing
print(user_to_idx[10000])
print(movie_to_idx[1])

6039
40


In [14]:
# Indexing 

# temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()

# if len(temp_user_data) == len(ratings):
#     print('user_id col indexing success')
#     ratings['user_id'] = temp_user_data
# else:
#     print('user_id col indexing fail')

In [15]:
# 영화 5점 비율 구하기
five_star = ratings[ratings['count']==5]
five, all_data = len(five_star), len(ratings)
print(f'{five},{all_data}')
print(f'Ratio of only_one over all data is {five/all_data:.2%}')

226315,836483
Ratio of only_one over all data is 27.06%


### CSR matrix 만들기

- Matrix Factorization 사용: 성능이 좋고, Scalability 좋음.
- [참고자료](https://towardsdatascience.com/recommendation-system-matrix-factorization-d61978660b4b)
   
![MF](https://miro.medium.com/max/700/1*Zhm1NMlmVywn0G18w3exog.png)
- m = 4, n = 2, k = 2   
- Rating Matrix를 User Matrix, Item Matrix로 분해  $U_i \bullet I_j = M_{ij}$   
[Collaborative Filtering for Implicit Feedback Datasets](http://yifanhu.net/PUB/cf.pdf)
- 메모리 낭비를 최소화 해야하므로 CSR Matrix 사용
[설명](https://stackoverflow.com/questions/53254104/cant-understand-scipy-sparse-csr-matrix-example/62118005#62118005)

In [16]:
ratings.loc[:,'count']

0    5
1    3
2    3
3    4
4    5
    ..
0    5
1    5
2    5
3    5
4    5
Name: count, Length: 836483, dtype: int64

In [17]:
# num_user = ratings.user_id.nunique()
# num_movie = ratings.movie_id.nunique()

csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)))
csr_data

<10001x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### als_model = AlternatingLeastSquares 모델

In [18]:
# als(AlternatingLeastSquares) model
# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [19]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, # 몇 차원
                                    regularization=0.01, # overfitting 막기 위해 정규화 값 얼마나?
                                    use_gpu=False, # GPU 사용여부
                                    iterations=50, # =epochs (몇번반복학습?)
                                    dtype=np.float32)

In [20]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x10001 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
# 모델 훈련
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




### 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악

In [22]:
my_id, father_sth = user_to_idx[10000], movie_to_idx[858]
my_id_vector, father_sth_vector = als_model.user_factors[my_id], als_model.item_factors[father_sth]

In [23]:
np.dot(my_id_vector,father_sth_vector)

0.0020905712

In [24]:
my_id, toy = user_to_idx[10000], movie_to_idx[1]
my_id_vector, toy_vector = als_model.user_factors[my_id], als_model.item_factors[toy]
np.dot(my_id_vector,toy_vector)

-0.008997703

### 내가 좋아하는 영화와 비슷한 영화 추천

In [25]:
favorite_movie = 1
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(40, 0.06438765),
 (1650, 0.039060637),
 (2341, 0.038876463),
 (767, 0.038818017),
 (1044, 0.038678255),
 (1728, 0.03793458),
 (83, 0.037395872),
 (1725, 0.037237294),
 (1624, 0.036928743),
 (1399, 0.036560576),
 (2711, 0.036558334),
 (1177, 0.036004044),
 (2276, 0.035792362),
 (753, 0.03502232),
 (2187, 0.034895673)]

In [26]:
#movie_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다. 
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

[1,
 376,
 2240,
 2822,
 818,
 3509,
 1293,
 3285,
 1461,
 1130,
 1632,
 3506,
 406,
 3362,
 2121]

In [27]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [28]:
get_similar_movie(2)

[2, 2470, 2026, 2821, 3711, 3703, 2432, 2664, 988, 2157]

### 내가 가장 좋아할 만한 영화 추천

In [29]:
# 유저에게 아티스트 추천
user = user_to_idx[10000]
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(3061, 1.1015434),
 (910, 1.0992444),
 (3675, 0.94283676),
 (1267, 0.86207384),
 (594, 0.8601445),
 (898, 0.8429284),
 (1196, 0.8317854),
 (3671, 0.79763556),
 (1035, 0.79303175),
 (3606, 0.79231715),
 (908, 0.78371936),
 (2941, 0.76150584),
 (1247, 0.75781435),
 (1077, 0.75283056),
 (905, 0.74630404),
 (1244, 0.74396735),
 (928, 0.73969877),
 (1256, 0.7321372),
 (1032, 0.71718144),
 (2078, 0.68744016)]

In [30]:
[idx_to_movie[i[0]] for i in movie_recommended]

KeyError: 3675

In [None]:
# rihanna 추천한 이유 확인 (추천에 기여한 정도)
sixsixseven = movie_to_idx[667]
explain = als_model.explain(user, csr_data, itemid=sixsixseven)

In [None]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

## Conclusion
- 너무 오래된 영화들이라 공감이 잘 가지 않았지만 실습하기 좋았다.
- 영화가 아이디 처리가 되어있어서 매핑 후 이름으로 보면 좀 더 이해가 잘 갈 것 같다.
- 장르를 반영해서 추천을 해보면 좋을 것 같음.
