# 상품 기반 협업 필터링

## 참고 자료

* [Item-based Collaborative Filtering : Build Your own Recommender System!](https://www.analyticsvidhya.com/blog/2021/05/item-based-collaborative-filtering-build-your-own-recommender-system/), Saumyab271, 2023.11.08

## 준비

### 라이브러리

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns

### 데이터 세트

* [MovieLens 100K Dataset](https://grouplens.org/datasets/movielens/100k/)

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
dataset = pd.read_csv('u.data', sep = '\t', names = header)
print(dataset.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [18]:
item_header = ['movie_id', 'movie_title']
item_dataset = pd.read_csv('u.item', sep = '|', header=None)
item_dataset = item_dataset.iloc[:, 0:2]
item_dataset.set_axis(item_header, axis=1, inplace=True)
print(item_dataset.head())

   movie_id        movie_title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


## 사용자-영화 평가 테이블 만들기

In [3]:
n_users = dataset.user_id.unique().shape[0]
n_items = dataset.item_id.unique().shape[0]
n_items = dataset['item_id'].max()
A = np.zeros((n_users,n_items))
for line in dataset.itertuples():
    A[line[1]-1,line[2]-1] = line[3]
    
print('n_users: ', n_users)
print('n_items: ', n_items)
print("Original rating matrix : ",A)

n_users:  943
n_items:  1682
Original rating matrix :  [[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


### 이진 평가로 대체

In [4]:
for i in range(len(A)):
  for j in range(len(A[0])):
    if A[i][j]>=3:
      A[i][j]=1
    else:
      A[i][j]=0

print(A)
print(A[0, 0:20])

[[1. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
[1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [39]:
csr_sample = csr_matrix(A)
print(csr_sample.shape)

(943, 1682)


## 유사도 계산

In [6]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=3, n_jobs=-1)
knn.fit(csr_sample)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=3)

## 영화 추천

### 한 사용자가 좋아한 영화 목록 추출

In [20]:
dataset_sort_des = dataset.sort_values(['user_id', 'timestamp'], ascending=[True, False])
filter1 = dataset_sort_des[dataset_sort_des['user_id'] == 1].item_id
filter1 = filter1.tolist()
filter1 = filter1[:20]
print("Items liked by user: ",filter1)
movie_titles = [item_dataset.iloc[item_id, 1] for item_id in filter1]
print(movie_titles)

Items liked by user:  [74, 102, 256, 5, 171, 111, 242, 189, 32, 209, 270, 18, 6, 244, 221, 129, 20, 271, 272, 255]
['Brother Minister: The Assassination of Malcolm X (1994)', 'All Dogs Go to Heaven 2 (1996)', 'Men in Black (1997)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Empire Strikes Back, The (1980)', 'Flipper (1996)', 'Jungle2Jungle (1997)', 'Henry V (1989)', 'Desperado (1995)', 'Indiana Jones and the Last Crusade (1989)', 'Starship Troopers (1997)', "Antonia's Line (1995)", 'Twelve Monkeys (1995)', "Devil's Own, The (1997)", 'Star Trek: First Contact (1996)', 'Kansas City (1996)', 'Muppet Treasure Island (1996)', 'Good Will Hunting (1997)', 'Heat (1995)', 'When the Cats Away (Chacun cherche son chat) (1996)']


### 유사한 영화 추천

한 사용자가 좋아하는 영화에 대해 유사한 영화들을 찾아서 추천합니다.

In [28]:
distances1=[]
indices1=[]
for i in filter1:
  distances, indices = knn.kneighbors(csr_sample[i],n_neighbors=3)
  
  distances = distances.flatten()
  distances = distances[1:]
  distances1.extend(distances)
  
  indices = indices.flatten()
  indices = indices[1:]
  indices1.extend(indices)

  print(item_dataset.iloc[i, 1])
  movie_titles = [item_dataset.iloc[item_id, 1] for item_id in indices]
  for title in movie_titles:
    print('                ', title)

Brother Minister: The Assassination of Malcolm X (1994)
                 One Flew Over the Cuckoo's Nest (1975)
                 Dumbo (1941)
All Dogs Go to Heaven 2 (1996)
                 Fair Game (1995)
                 Third Man, The (1949)
Men in Black (1997)
                 Year of the Horse (1997)
                 Home Alone 3 (1997)
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
                 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
                 White Balloon, The (1995)
Empire Strikes Back, The (1980)
                 Midnight in the Garden of Good and Evil (1997)
                 Wes Craven's New Nightmare (1994)
Flipper (1996)
                 Kids (1995)
                 Tin Cup (1996)
Jungle2Jungle (1997)
                 39 Steps, The (1935)
                 Diabolique (1996)
Henry V (1989)
                 Great Expectations (1998)
                 Abyss, The (1989)
Desperado (1995)
                 Lawrence of Arabia (19