# 14. 아이유팬이 좋아할 만한 다른 아티스트 찾기

In [1]:
import pandas as pd
import numpy as np
import os, copy
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

## 데이터 준비와 전처리

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


timestamp 지우기

In [3]:
ratings = ratings.iloc[:,0:3]
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


별점 3점 이상만 남기기

In [4]:
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


컬럼 이름을 rating에서 count로 바꿔주기(rating은 별점인데 count로 바꿔주면서 시청횟수로 생각한다)

In [5]:
ratings.rename(columns={'rating':'count'}, inplace=True)

영화 제목을 보기 위한 메타 데이터(사용자가 별점을 매긴 영화 목록)

In [6]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


결측치 확인

In [7]:
sum(movies['title'].isna())

0

movies데이터 title을 소문자로 바꿔주기

In [8]:
movies['title_lower'] = movies['title'].str.lower()

In [9]:
movies.head(5)

Unnamed: 0,movie_id,title,genre,title_lower
0,1,Toy Story (1995),Animation|Children's|Comedy,toy story (1995)
1,2,Jumanji (1995),Adventure|Children's|Fantasy,jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama,waiting to exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii (1995)


## 데이터 분석

ratings에 있는 유니크한 영화 개수

In [11]:
ratings['movie_id'].nunique()

3628

ratings에 있는 유니크한 사용자 수

In [12]:
# number of unique users in dataset
ratings['user_id'].nunique()

6039

가장 인기 있는 영화 30개(인기순)

In [60]:
artist_count = ratings.groupby('movie_id')['user_id'].count()  #movie_id값을 기준으로 groupby
artist_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

영화 제목 찾기

In [13]:
def find_movie(string):
    for i in range(len(movies['title_lower'])):
        if movies['title_lower'][i].find(string) != -1:
            print(f"movie: {movies['title'][i]}, id: {movies['movie_id'][i]}")

In [14]:
find_movie('hill')  #Notting Hill (1999), id: 2671

movie: Federal Hill (1994), id: 406
movie: Beverly Hillbillies, The (1993), id: 419
movie: Beverly Hills Cop III (1994), id: 420
movie: Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995), id: 468
movie: King of the Hill (1993), id: 483
movie: Paradise Lost: The Child Murders at Robin Hood Hills (1996), id: 1361
movie: Beverly Hills Ninja (1997), id: 1431
movie: Slums of Beverly Hills, The (1998), id: 2155
movie: Downhill (1927), id: 2224
movie: Big Chill, The (1983), id: 2352
movie: House on Haunted Hill (1958), id: 2519
movie: Pork Chop Hill (1959), id: 2669
movie: Notting Hill (1999), id: 2671
movie: Chill Factor (1999), id: 2835
movie: House on Haunted Hill, The (1999), id: 2995
movie: Hillbillys in a Haunted House (1967), id: 3460


In [15]:
find_movie('volcano')  #Volcano (1997), id: 1515

movie: Volcano (1997), id: 1515


In [16]:
find_movie('show')  #Truman Show, The (1998), id: 1682

movie: Show, The (1995), id: 192
movie: Showgirls (1995), id: 193
movie: Quiz Show (1994), id: 300
movie: Truman Show, The (1998), id: 1682
movie: Greatest Show on Earth, The (1952), id: 1943
movie: Rocky Horror Picture Show, The (1975), id: 2657
movie: Creepshow (1982), id: 3016
movie: Creepshow 2 (1987), id: 3017
movie: Last Picture Show, The (1971), id: 3152
movie: Shower (Xizhao) (1999), id: 3787
movie: Best in Show (2000), id: 3911


In [17]:
find_movie('toy story')  # Toy Story (1995), id: 1

movie: Toy Story (1995), id: 1
movie: Toy Story 2 (1999), id: 3114


In [18]:
find_movie('before')  #Before Sunrise (1995), id: 215

movie: Before and After (1996), id: 113
movie: Before the Rain (Pred dozhdot) (1994), id: 214
movie: Before Sunrise (1995), id: 215
movie: Nightmare Before Christmas, The (1993), id: 551
movie: Land Before Time III: The Time of the Great Giving (1995), id: 888


In [19]:
my_movie = [1515, 215, 1682, 1, 2671] 

좋아하는 영화에 별점 5점 주기

In [20]:
myfav = {'user_id': [6041]*5, 'movie_id': my_movie, 'count':[5]*5}
myfav = pd.DataFrame(myfav)
myfav

Unnamed: 0,user_id,movie_id,count
0,6041,1515,5
1,6041,215,5
2,6041,1682,5
3,6041,1,5
4,6041,2671,5


데이터 합치기

In [21]:
data = copy.deepcopy(ratings) # copy ratings dataset to object data
if not ratings.isin({'user_id':['6041']})['user_id'].any():  
    data = data.append(myfav) # add my favorites to dataset                          
data.tail(10)

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
0,6041,1515,5
1,6041,215,5
2,6041,1682,5
3,6041,1,5
4,6041,2671,5


In [22]:
num_user=data['user_id'].nunique()
num_user

6040

In [23]:
num_user=data['user_id'].nunique()
num_user

3628

In [26]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = data['user_id'].unique()
movie_unique = data['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [27]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = data['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(data):
    print('movie column indexing OK!!')
    data['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

data

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,count
0,0,0,5
1,0,1,3
2,0,2,3
3,0,3,4
4,0,4,5
...,...,...,...
0,6039,2465,5
1,6039,213,5
2,6039,385,5
3,6039,40,5


In [28]:
len(data)

836483

In [30]:
data['count'].size

836483

In [31]:
data['user_id'].size

836483

In [32]:
data['movie_id'].size

836483

In [59]:
csr_data = csr_matrix((data['count'], (data.user_id, data.movie_id)), shape= (num_user, num_movie))
csr_data 

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [39]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

## Implicit AlternatingLeastSquares 모델의 선언

In [40]:
als_model = AlternatingLeastSquares(factors=128, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

als 모델은 input으로

In [41]:
# als model takes in item x user shaped matrix as input; as csr_data is created as user x item it needs to be transposed
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

## 모델 훈련

In [42]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [48]:
# get my user vector from trained model
als_model.user_factors[6039]

array([-0.56939393, -0.00663457,  0.30873692,  0.07559075, -0.19237646,
       -0.13198552, -0.09672124,  0.15444237,  0.05045356,  0.19502957,
       -0.01299298,  0.3020425 ,  0.57330245, -0.2284207 , -0.05646583,
        0.81403166,  0.60087496,  0.24036767,  0.80627   ,  0.330146  ,
        0.28888777,  0.2247081 ,  0.41264036, -0.30002305, -0.11570273,
       -0.39915684,  0.04017757, -0.11698263, -0.18739295,  0.31681982,
       -0.5111319 , -0.01370466, -0.24154994, -0.63750213,  0.23833863,
        0.28799298,  0.41193816, -0.07851811,  0.37024707, -0.19436455,
       -0.14855023, -0.41413727,  0.03446487,  0.2162128 ,  0.3354319 ,
       -0.29721525,  0.89515686, -0.08063932, -0.2512997 , -0.14672117,
       -0.3479793 , -0.28795946,  0.4673346 , -0.2433402 ,  0.08607879,
        0.06903581,  0.38980204, -0.28623748,  0.29968926,  0.16902953,
        0.3871752 ,  0.46077254,  0.2268375 ,  0.08636063,  0.03274453,
       -0.31178662, -0.3248436 , -0.01583325, -0.83853817, -0.05

In [49]:
movies[movies['movie_id'].isin(myfav['movie_id'])]

Unnamed: 0,movie_id,title,genre,title_lower
0,1,Toy Story (1995),Animation|Children's|Comedy,toy story (1995)
213,215,Before Sunrise (1995),Drama|Romance,before sunrise (1995)
1480,1515,Volcano (1997),Drama|Thriller,volcano (1997)
1636,1682,"Truman Show, The (1998)",Drama,"truman show, the (1998)"
2602,2671,Notting Hill (1999),Comedy|Romance,notting hill (1999)


In [50]:
# pick random movie
random_movie = np.random.choice(movies['movie_id'])
movies[movies['movie_id']==random_movie]

Unnamed: 0,movie_id,title,genre,title_lower
3122,3191,"Quarry, The (1998)",Drama,"quarry, the (1998)"


In [52]:
# get favor score of inspector gadget and before sunrise
r_mov_vec, fav_mov_vec =  als_model.item_factors[2720], als_model.item_factors[215]
my_vec = als_model.user_factors[6039]
print(f' inspector gadget favor score : {np.dot(my_vec, r_mov_vec)} , before sunrise favor score : {np.dot(my_vec, fav_mov_vec)}') 

 inspector gadget favor score : 0.005873588845133781 , before sunrise favor score : 0.05734647065401077


In [53]:
# before sunrise와 비슷한 영화 찾아보기

similar_movie = als_model.similar_items(215, N=10)
for x in range(len(similar_movie)):
    print(movies['title'][movies['movie_id'].isin(similar_movie[x])])

0           Toy Story (1995)
213    Before Sunrise (1995)
Name: title, dtype: object
2518    Life (1999)
Name: title, dtype: object
258    Little Women (1994)
Name: title, dtype: object
272    Mixed Nuts (1994)
Name: title, dtype: object
3241    Kid, The (1921)
Name: title, dtype: object
2475    School of Flesh, The (L' École de la chair) (1...
Name: title, dtype: object
2933    My Best Fiend (Mein liebster Feind) (1999)
Name: title, dtype: object
2472    Cruel Intentions (1999)
Name: title, dtype: object
1716    Blues Brothers 2000 (1998)
Name: title, dtype: object
435    Dangerous Game (1993)
Name: title, dtype: object


In [55]:
movies_recommended = als_model.recommend(6039, csr_data, N=20, filter_already_liked_items=True)
movies_recommended

[(50, 0.42415905),
 (322, 0.291341),
 (384, 0.27783117),
 (4, 0.2735495),
 (39, 0.2477045),
 (139, 0.24583608),
 (248, 0.24059623),
 (110, 0.23902464),
 (626, 0.21532747),
 (666, 0.21518186),
 (5, 0.20192853),
 (336, 0.18748227),
 (33, 0.18154499),
 (160, 0.17775194),
 (851, 0.17738283),
 (1352, 0.1749626),
 (330, 0.16698794),
 (217, 0.16220742),
 (466, 0.15357542),
 (20, 0.15127403)]

In [56]:
movies_recommended[0][0]

50

In [57]:
len(movies_recommended)

20

## 영화 추천

In [58]:
for x, y in movies_recommended:
    title = movies['title'][movies['movie_id'].isin([x])]
    print(f'추천영화 : {title}', end = ' ')
    print(f'선호도 : {y:.3f}')

추천영화 : 49    Usual Suspects, The (1995)
Name: title, dtype: object 선호도 : 0.424
추천영화 : 319    Swimming with Sharks (1995)
Name: title, dtype: object 선호도 : 0.291
추천영화 : 380    Bad Company (1995)
Name: title, dtype: object 선호도 : 0.278
추천영화 : 3    Waiting to Exhale (1995)
Name: title, dtype: object 선호도 : 0.274
추천영화 : 38    Clueless (1995)
Name: title, dtype: object 선호도 : 0.248
추천영화 : 137    Target (1995)
Name: title, dtype: object 선호도 : 0.246
추천영화 : 245    Houseguest (1994)
Name: title, dtype: object 선호도 : 0.241
추천영화 : 108    Braveheart (1995)
Name: title, dtype: object 선호도 : 0.239
추천영화 : 621    Thin Line Between Love and Hate, A (1996)
Name: title, dtype: object 선호도 : 0.215
추천영화 : 660    All Things Fair (1996)
Name: title, dtype: object 선호도 : 0.215
추천영화 : 4    Father of the Bride Part II (1995)
Name: title, dtype: object 선호도 : 0.202
추천영화 : 332    Walking Dead, The (1995)
Name: title, dtype: object 선호도 : 0.187
추천영화 : 32    Wings of Courage (1995)
Name: title, dtype: object 선호도 : 0.182
추천영화

## Retrospection

영화 추천 목록을 검색해 봤는데 내가 좋아할 만한 것이다. 추천이 잘 된 것 같다. csr을 불러오는 과정에서 에러가 났다.

<에러가 난 코드>

##### csr_data = csr_matrix((data['count'], (data.user_id, data.movie_id)), shape= (num_user, num_movie)) 

count의 변수 명을 counts가 아닌 count로 설정을 했다. 파이썬에서는 count라는 함수가 있는데 처음에 내가 코드를 data.count로 코드를 짜서 변수count와 함수count가 꼬여서 오류가 났던 것 같다. 그래서 data['count']로 수정했더니 실행이 되었다 !!

전처리를 하는 과정에서 좋아하는 영화를 추가할 때 인덱스를 6041이 아닌 zziho로 이름을 붙여주고 싶었는데 에러가 났다. 이유는 앞의 인덱스가 int형이고 zziho는 str형태이기 때문에 충돌이 났다. 그래서 6041로 이름을 붙여줬다. 그래서 오류를 해결했다. 1~6040의 인덱스를 int형태가 아닌 str형태로 바꾼 후 zziho로 인덱스를 추가해 주어도 에러가 해결이 될 거 같다는 생각이 지금 회고를 쓰면서 생각이 났다. 하지만 시도를 했다가는 익스 제출을 영영 하지 못할 것 같아서 다음 기회에 시도해 보겠다.
