# 프로젝트 : 영화 추천 실습

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import os
import numpy as np
import random

## STEP 1. 데이터 준비와 전처리

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


* 이미 인덱싱이 완료된 데이터입니다.

In [3]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


* 3점 이상의 데이터만 남깁니다.

In [4]:
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## STEP 2. 데이터 분석

#### ratings에 있는 유니크한 영화 개수

In [6]:
ratings['movie_id'].nunique()

3628

#### ratings에 있는 유니크한 유저 수

In [7]:
ratings['user_id'].nunique()

6039

#### 가장 인기있는 영화(별점을 많이 받은 순)

In [8]:
id_to_movie = {k:v for k,v in zip(movies['movie_id'],movies['title'])}

In [9]:
#id를 영화명으로 변경해 출력합니다.
movie_count = ratings.groupby('movie_id')['counts'].sum()
movie_count.rename(id_to_movie,inplace= True)
movie_count.sort_values(ascending=False).head(30)

movie_id
American Beauty (1999)                                   14449
Star Wars: Episode IV - A New Hope (1977)                13178
Star Wars: Episode V - The Empire Strikes Back (1980)    12648
Saving Private Ryan (1998)                               11348
Star Wars: Episode VI - Return of the Jedi (1983)        11303
Raiders of the Lost Ark (1981)                           11179
Silence of the Lambs, The (1991)                         11096
Matrix, The (1999)                                       10903
Sixth Sense, The (1999)                                  10703
Terminator 2: Judgment Day (1991)                        10513
Fargo (1996)                                             10465
Schindler's List (1993)                                  10317
Braveheart (1995)                                        10125
Shawshank Redemption, The (1994)                         10085
Back to the Future (1985)                                10081
Godfather, The (1972)                         

## STEP 3. 선호하는 영화

In [10]:
my_id = ratings['user_id'].max()+1
movie_to_id = {v:k for k,v in zip(movies['movie_id'],movies['title'])}

In [11]:
#영화 리스트에서 랜덤하게 5가지 영화를 선택해 선호하는 영화로 삼습니다.
my_favorite = []
for i in range(4):
    idx = random.randrange(0,3628)
    my_favorite.append(movies['movie_id'][idx])
my_favorite.append(movie_to_id['Godfather, The (1972)'])
my_favorite

[1555, 2410, 3008, 980, 858]

* 대부는 진짜 좋아하는 영화라 꼭 넣습니다.

In [12]:
#선택된 영화들입니다.
for i in my_favorite:
    print(id_to_movie[i])

To Have, or Not (1995)
Rocky III (1982)
Last Night (1998)
In the Line of Duty 2 (1987)
Godfather, The (1972)


In [13]:
for i in range(5):
    #별점은 3~5점 사이
    rating = random.randrange(3,6)
    ss = pd.DataFrame({'user_id': [my_id], 'movie_id': [my_favorite[i]], 'counts':[rating]})
    ratings = ratings.append(ss)

In [14]:
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,6041,1555,5,
0,6041,2410,4,
0,6041,3008,4,
0,6041,980,3,
0,6041,858,5,


## STEP 4. CSR matrix

In [15]:
# 중간에 비어있는 id도 있기 때문에 id 최대치로 사용합니다.
num_user = ratings['user_id'].max()+1
num_movie = ratings['movie_id'].max()+1
csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## STEP 5. 모델 구성 및 훈련

In [16]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [17]:
als_model = AlternatingLeastSquares(factors=150, regularization=0.01, use_gpu=False, iterations=150, dtype=np.float32)

In [18]:
csr_data_transpose = csr_data.T

In [19]:
als_model.fit(csr_data_transpose)

  0%|          | 0/150 [00:00<?, ?it/s]

## STEP 6. 나의 선호도 파악

In [20]:
for i in my_favorite:
    my_vector, movie_vector = als_model.user_factors[my_id], als_model.item_factors[i]
    favor = np.dot(my_vector, movie_vector)
    print(id_to_movie[i],favor)

To Have, or Not (1995) 0.0057720738
Rocky III (1982) 0.21683235
Last Night (1998) 0.043589287
In the Line of Duty 2 (1987) 0.0075432695
Godfather, The (1972) 0.722719


* 유독 선호도가 높게나온 영화가 Rocky와 Godfather인데 아마 각각 법적, 범죄인 마니아틱한 장르 특성상 높은 결과가 나온것으로 생각됩니다.

In [26]:
#다른 영화의 경우
my_vector, movie_vector = als_model.user_factors[my_id], als_model.item_factors[1]
favor = np.dot(my_vector, movie_vector)
print(id_to_movie[1],favor)

Toy Story (1995) 0.02527825


## STEP 7. 내가 좋아하는 영화와 비슷한 영화를 추천

In [22]:
def get_similar_movies(movie_name: str):
    movie_id = movie_to_id[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [id_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [23]:
get_similar_movies('Godfather, The (1972)')

['Godfather, The (1972)',
 'Godfather: Part II, The (1974)',
 'Godfather: Part III, The (1990)',
 'French Connection, The (1971)',
 'Boys, The (1997)',
 'Paralyzing Fear: The Story of Polio in America, A (1998)',
 'Aiqing wansui (1994)',
 'Simon Sez (1999)',
 'Condition Red (1995)',
 'GoodFellas (1990)']

* 대부분 범죄/스릴러 장르가 추천됩니다.

In [24]:
get_similar_movies('Erin Brockovich (2000)')

['Erin Brockovich (2000)',
 'Song of Freedom (1936)',
 'Chain of Fools (2000)',
 '28 Days (2000)',
 'Cider House Rules, The (1999)',
 'Keeping the Faith (2000)',
 'Boiler Room (2000)',
 'Wonder Boys (2000)',
 'Runaway Bride (1999)',
 'Mickey Blue Eyes (1999)']

* 대부분 드라마/로맨스 장르가 추천됩니다.

## STEP 8. 내가 좋아할만한 영화를 추천

In [25]:
movie_recommended = als_model.recommend(my_id, csr_data, N=20, filter_already_liked_items=True)
[id_to_movie[i[0]] for i in movie_recommended]

['Godfather: Part II, The (1974)',
 'Godfather: Part III, The (1990)',
 'Rocky II (1979)',
 'Rocky (1976)',
 'Rocky IV (1985)',
 'Karate Kid, Part II, The (1986)',
 'French Connection, The (1971)',
 'Run Lola Run (Lola rennt) (1998)',
 'Jaws 2 (1978)',
 'Rambo: First Blood Part II (1985)',
 'Sleepy Hollow (1999)',
 'Death Wish (1974)',
 'Bloodsport (1988)',
 'Karate Kid, The (1984)',
 'Citizen Kane (1941)',
 'Fatal Attraction (1987)',
 'Defending Your Life (1991)',
 'Mr. Mom (1983)',
 'Superman II (1980)',
 'First Blood (1982)']

* 대부분 제가 선호하는 영화와 비슷한 장르가 추천되는것을 볼수있습니다.

# 회고합시다.
* 훈련을 마치고 선호도를 계산했을 때 확실히 장르별로 결과가 크게 달라지는것을 보았습니다.
* 메이저한 장르는 영화 하나를 평가한 것으로 크게 영향이 없었고 매니악한 장르는 영화 하나를 평가해도 큰 영향을 미치는 것으로 보였습니다.(평가한 영화가 많아질수록 더 정확히 예측할것입니다.)