# 1. Collaborative Filtering (협업 필터링)
- 추천 시스템: 고객의 선호, 관심, 구매경력과 같은 개인화 정보를 기초로 고객에게 가장 알맞은 구매정보 제공
- 방식
  - Item-based collaborative filtering: item 간 similarity를 기반으로 추천
  - User-based collaborative filtering: user 간 similarity를 기반으로 추천
    - 성능이 다른 방식에 비해 떨어지는 것으로 알려져있음
  - Matrix factorization collaborative filtering: 잠재요인이 있다고 가정하여, 그 잠재요인을 행렬분해를 통해 찾아냄
    - 일반적으로 SVD(singular value decomposition)을 활용

- Input: user-item matrix (preference matrix)
  - 일반적으로 5점 척도로 구성된 rating + cosine similarity
  - binary인 경우 jaccard similarity 사용

- 한계
  - 단순 matrix를 사용하여 추천하기 때문에 context/content를 고려하지 않음
    - 최근에는 item의 text 정보, user의 context 정보들을 반영하여 딥러닝 기반의 추천 시스템 등장

In [None]:
import pandas as pd
df = pd.read_csv('movies.csv')

In [None]:
df.head()

Unnamed: 0,userId,title,rating,timestamp
0,1,American Pie,4.0,1260759139
1,4,American Pie,4.0,949896114
2,15,American Pie,4.0,1052896867
3,30,American Pie,2.0,994439964
4,34,American Pie,4.0,973747765


In [None]:
df = df.pivot_table('rating', index = 'userId', columns = 'title')
df.head()

title,10 Things I Hate About You,12 Angry Men,1408,15 Minutes,16 Blocks,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2046,21 Grams,25th Hour,...,Willy Wonka & the Chocolate Factory,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Zodiac,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,3.0,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,,...,,,5.0,,,,5.0,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
df.shape

(670, 856)

- user들별로 평점을 주는 범위가 다르기 때문에 scaling하여 조정

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns = df.columns, index = df.index)
df_scaled

title,10 Things I Hate About You,12 Angry Men,1408,15 Minutes,16 Blocks,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2046,21 Grams,25th Hour,...,Willy Wonka & the Chocolate Factory,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Zodiac,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,0.5,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,0.555556,,,,,...,,,1.0,,,,1.0,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,,,,,,,,,,,...,,,,,,,,,,


## Item-based collaborative filtering
- Item*user으로 구성된 matrix를 사용하여 유사도 측정

In [None]:
df_scaled = df_scaled.transpose().fillna(0)
df_scaled.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15 Minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Blocks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- user들 간 평점이 비슷한 정도를 기반으로 영화들의 유사도 평가

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
movie_sim = cosine_similarity(df_scaled, df_scaled)
print(movie_sim.shape)

(856, 856)


In [None]:
movie_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.08998054,
        0.07767356],
       [0.        , 0.        , 0.        , ..., 0.08998054, 1.        ,
        0.0226314 ],
       [0.        , 0.        , 0.        , ..., 0.07767356, 0.0226314 ,
        1.        ]])

In [None]:
movie_sim = pd.DataFrame(movie_sim, index = df_scaled.index, columns = df_scaled.index)

In [None]:
movie_sim['Sin City'].sort_values(ascending=False)[:10]

Unnamed: 0_level_0,Sin City
title,Unnamed: 1_level_1
Sin City,1.0
The Sentinel,0.692308
Freddy vs. Jason,0.667124
Austin Powers in Goldmember,0.24082
The Mummy Returns,0.217571
"Monsters, Inc.",0.208739
Saw IV,0.200063
Dances with Wolves,0.186198
Apocalypse Now,0.176147
"Good Morning, Vietnam",0.168964


#2. Surprise

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357244 sha256=88183e2df9ebafc46ab6c21dc63d6775b926c77fc80ade613758015345af6283
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [None]:
#movies.csv에서 IBCF, SVD rmse를 비교
#df에서 50%만 sampling(sample함수 사용)

#BX-Book-Ratings.zip을 업로드 하셔서, 압축을 풀어주세요
#surprise 데이터셋으로 읽어보세요
!unzip BX-Book-Ratings.zip

In [None]:
df = pd.read_csv('BX-Book-Ratings.csv', sep=";")
df["Book-Rating"].value_counts()  #0~10
df["User-ID"].nunique()
df["ISBN"].nunique()

340557

In [None]:
# Prepare the data to be used in Surprise
reader = Reader(rating_scale=(0,10))
data = Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']],
                            reader=reader)

<surprise.dataset.DatasetAutoFolds at 0x7cd2089db4f0>

In [None]:
import pandas as pd
df = pd.read_csv('movies.csv')
df

In [None]:
from surprise import Reader, Dataset, KNNBasic, SVD
from surprise.model_selection import train_test_split, cross_validate
import pandas as pd

# Prepare the data to be used in Surprise
reader = Reader(rating_scale=(0,5))  #컬럼 nunique, value_counts, unique
data = Dataset.load_from_df(df[['userId', 'title', 'rating']], reader=reader)

In [None]:
data.raw_ratings
data.df

In [None]:
sim_options = {
    'name': 'cosine',
    'user_based': True
}
algo = KNNBasic(sim_options=sim_options)

# Retrieve the trainset.
trainset, testset = train_test_split(data)
algo.fit(trainset)

algo

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7cd1d11f3640>

In [None]:
# Predict
print(algo.predict(1, 5, r_ui=None, verbose=False))

user: 1          item: 5          r_ui = None   est = 3.54   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


In [None]:
testset

In [None]:
prediction = algo.test(testset)
prediction[:5]

[Prediction(uid=635, iid='Scarface', r_ui=3.5, est=4.27509034743391, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=550, iid='L.A. Confidential', r_ui=5.0, est=3.612066845316744, details={'actual_k': 14, 'was_impossible': False}),
 Prediction(uid=598, iid='Terminator 3: Rise of the Machines', r_ui=5.0, est=4.0375, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=551, iid='The Lost World: Jurassic Park', r_ui=3.0, est=2.626892580838845, details={'actual_k': 8, 'was_impossible': False}),
 Prediction(uid=646, iid='Mulholland Drive', r_ui=5.0, est=3.2059671243338728, details={'actual_k': 7, 'was_impossible': False})]

- Surprise 지원 알고리즘
 - Random : 랜덤한 추천
 - Baseline : ALS(Alternating Least Square), SGD(Stochastic Gradient Descent)
 - Matrix factorization: SVD, SVD++
 - KNNs
 - Slope one, Co clustering

 - https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

In [None]:
from surprise import BaselineOnly
from surprise import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import SVD
from surprise import SVDpp

from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt


In [None]:
trainset, testset = train_test_split(data, test_size = 0.25)

In [None]:
recom = KNNBasic()  #CF
recom.fit(trainset)
predictions = recom.test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9660


0.9660342076959553

In [None]:
recom = SVD()  #matrix 분해 (SVD) 이용
recom.fit(trainset)
predictions = recom.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9227


0.9227256700213065

- ml-100k에 대해서 여러 추천 알고리즘을 수행하고 비교

In [None]:
# MovieLens 100K
data = Dataset.load_builtin(name='ml-100k', prompt = False)  #ml-1m, jester
data

Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


<surprise.dataset.DatasetAutoFolds at 0x7cd1d0e01cc0>

In [None]:
data.raw_ratings

In [None]:
trainset, testset = train_test_split(data, test_size = 0.25)

In [None]:
testset[:5]

[('560', '480', 3.0),
 ('130', '568', 5.0),
 ('890', '7', 4.0),
 ('82', '1001', 1.0),
 ('222', '1139', 3.0)]

In [None]:
#ml-100k자료-> 9:1파티셔닝, UBCF로 cosine 유사도로 추천모형
#testset에 대한 mae
#개별 user, item에 대한 예측

In [None]:
trainset, testset =train_test_split( data,test_size=0.1)
from surprise import accuracy
algo = KNNBasic(sim_options={'User-based':True, 'name':'cosine'})
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.mae(predictions)  #mse, rmse, mae
algo.predict( '1', '100') #개별user, item에 대한 rating 예측

Prediction(uid='1', iid='100', r_ui=None, est=4.500883384464913, details={'actual_k': 40, 'was_impossible': False})

In [None]:
str(1)    #숫자->문자
#데이터프레임.컬럼.astype( '타입명칭')
#df.user_id.astype('str')
#df.user_id.astype('int')

'1'

In [None]:
algorithms = [KNNBasic, SVD]

algos=[]
rmses=[]

for i in algorithms:
  algo = i()  #model
  algos.append(i.__name__)
  algo.fit(trainset)
  predictions = algo.test(testset)
  rmses.append(accuracy.rmse(predictions))

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9832
RMSE: 0.9338


- cross validate

In [None]:
cross_validate( SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9359  0.9327  0.9416  0.9306  0.9384  0.9358  0.0039  
MAE (testset)     0.7387  0.7361  0.7429  0.7339  0.7374  0.7378  0.0030  
Fit time          2.00    1.89    1.39    1.39    1.39    1.61    0.27    
Test time         0.18    0.30    0.12    0.12    0.12    0.17    0.07    


{'test_rmse': array([0.93587088, 0.93266376, 0.94157984, 0.93060883, 0.93837622]),
 'test_mae': array([0.7386503 , 0.73606601, 0.74287378, 0.73387285, 0.73737316]),
 'fit_time': (1.9993460178375244,
  1.8940515518188477,
  1.3909261226654053,
  1.3902513980865479,
  1.389066457748413),
 'test_time': (0.1810469627380371,
  0.30067944526672363,
  0.11614584922790527,
  0.12479853630065918,
  0.11665487289428711)}

#3.개인별 추천

In [None]:
#movies.csv
from surprise import Reader, Dataset, KNNBasic, SVD
from surprise.model_selection import train_test_split, cross_validate
import pandas as pd

df = pd.read_csv('movies.csv')
data= Dataset.load_from_df(df[['userId', 'title', 'rating']], reader=reader)

trainset, testset = train_test_split(data, test_size=0.1)
algo = SVD()#KNNBasic()
algo.fit(trainset)
#predictions = algo.test(testset)
#accuracy.rmse(predictions)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7cd1cd97f0d0>

In [None]:
print( df.userId.unique()[:5] )
print( df.title.unique()[:5] )
algo.predict('1','American Pie')#, 4.0, verbose=True)

In [None]:
rec_result = pd.DataFrame()
for j in df.userId.unique():#[4, 15, 30, 34]:  #userId.unique()
 target=str(j)
 results = list()
 movieid = df.query("userId==@j").title.unique()
 titles = df.query( 'title not in @movieid' ).title.unique()
 for i in titles:
  results.append( algo.predict(target,i )   )
 rec_result=pd.concat([rec_result, pd.DataFrame(results).nlargest(3, 'est')], axis=0)

In [136]:
rec_result

Unnamed: 0,uid,iid,r_ui,est,details
164,1,Galaxy Quest,,4.381703,{'was_impossible': False}
227,1,The Good Thief,,4.377383,{'was_impossible': False}
52,1,The Thomas Crown Affair,,4.363992,{'was_impossible': False}
119,4,Galaxy Quest,,4.381703,{'was_impossible': False}
182,4,The Good Thief,,4.377383,{'was_impossible': False}
...,...,...,...,...,...
229,76,The Good Thief,,4.377383,{'was_impossible': False}
54,76,The Thomas Crown Affair,,4.363992,{'was_impossible': False}
166,258,Galaxy Quest,,4.381703,{'was_impossible': False}
229,258,The Good Thief,,4.377383,{'was_impossible': False}
