In [None]:
# !pip install numpy==1.26.4

## Step4: 활용사례 - 그룹화된 고객과 판매자를 활용하여 만든 추천시스템

### 문제 30. 추천시스템에서 사용할 데이터 불러오기  

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [10]:
review_data_with_labels_url = "https://raw.githubusercontent.com/dajeong-lecture/raw_data/main/review_data.csv"

review_df = pd.read_csv(review_data_with_labels_url, parse_dates=[0])

In [11]:
review_df.head()

Unnamed: 0,user,item,item_group,rating
0,8EzOblo9t562yOY,vgfAkNkq0qlFOhh,3,5.0
1,lJkfoUeURQwRQzX,vgfAkNkq0qlFOhh,3,5.0
2,JmYADnKKqtJ0GYi,vgfAkNkq0qlFOhh,3,5.0
3,YehuD30VL5pIcer,6Ay7eqTEAoIlzLg,13,4.0
4,In5agUcBmfm9Ut4,6Ay7eqTEAoIlzLg,13,5.0


### 문제 31. 추천시스템에서 사용할 리뷰데이터 탐색하기

In [12]:
review_df.user.nunique(), review_df.item.nunique()

(1156, 25032)

In [13]:
review_df.item_group.nunique()

49

In [14]:
review_df.groupby("item_group").count()[["item"]]

Unnamed: 0_level_0,item
item_group,Unnamed: 1_level_1
0,2532
1,1452
2,1471
3,2007
4,652
5,1555
6,1365
7,1567
8,976
9,2166


In [15]:
fig = px.histogram(review_df, x="rating")
fig.update_layout(height=500, width=700)
fig.show()

### 문제 32. 리뷰데이터의 item_group 기준으로 평균 rating 구하기

In [16]:
review_by_group_df = review_df.groupby(["user", "item_group"])[["rating"]].mean().reset_index()
review_by_group_df.head(5)

Unnamed: 0,user,item_group,rating
0,023s6EPaKsQfXqE,18,4.0
1,023s6EPaKsQfXqE,25,5.0
2,03Opy95xbHus7pi,2,4.75
3,03Opy95xbHus7pi,3,4.333333
4,03Opy95xbHus7pi,4,4.0


In [17]:
fig = px.histogram(review_by_group_df, x="rating")
fig.update_layout(height=500, width=700)
fig.show()

### 문제 33. 추천시스템을 위한 surprise 모듈 import하기

In [18]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505219 sha256=775eaec6b734c544a320bdfbf2a35ed90496c75e4fbdb0d32f7f5f23c7cff4e1
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e28991

In [19]:
# 필요한 모듈을 import 해주세요.
from surprise import Dataset, Reader
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV


### 문제 34. surprise 모듈에서 사용할 수 있도록 DataFrame 변형하기


In [20]:
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(review_by_group_df, reader)
train_data.raw_ratings[:10]

[('023s6EPaKsQfXqE', 18, 4.0, None),
 ('023s6EPaKsQfXqE', 25, 5.0, None),
 ('03Opy95xbHus7pi', 2, 4.75, None),
 ('03Opy95xbHus7pi', 3, 4.333333333333333, None),
 ('03Opy95xbHus7pi', 4, 4.0, None),
 ('03Opy95xbHus7pi', 7, 3.6666666666666665, None),
 ('03Opy95xbHus7pi', 8, 4.5, None),
 ('03Opy95xbHus7pi', 9, 4.0, None),
 ('03Opy95xbHus7pi', 10, 4.612903225806452, None),
 ('03Opy95xbHus7pi', 14, 4.0, None)]

### 문제 35. surprise 모듈로 KNN 최적의 파라미터 찾기


In [21]:
# sim_options = {
#     "name": ["cosine", "msd"],
#     "min_support": [3, 4, 5],
#     "user_based": [False, True],
# }

sim_options = {
    "name": ["msd"],
    "min_support": [4, 5],
    "user_based": [False],
}
param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(train_data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
0.9555218403130298
{'sim_options': {'name': 'msd', 'min_support': 5, 'user_based': False}}


In [22]:
pd.DataFrame(gs.cv_results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_sim_options
0,0.964515,0.942787,0.959286,0.95553,0.00926,2,0.693929,0.684141,0.691866,0.689979,0.004213,2,0.006992,0.000962,0.095607,0.007336,"{'sim_options': {'name': 'msd', 'min_support':...","{'name': 'msd', 'min_support': 4, 'user_based'..."
1,0.96436,0.942996,0.95921,0.955522,0.009103,1,0.693446,0.684468,0.691914,0.689943,0.003921,1,0.007144,0.001344,0.086779,0.002709,"{'sim_options': {'name': 'msd', 'min_support':...","{'name': 'msd', 'min_support': 5, 'user_based'..."


### 문제 36. 최적의 파라미터로 추천시스템 훈련시키기


In [33]:
sim_options = {"name": "msd", "min_support": 5, "user_based": False}
knn_algo = KNNWithMeans(sim_options=sim_options)

In [34]:
trainset = train_data.build_full_trainset()
knn_algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7a890b92d210>

### 문제 37. 훈련된 추천시스템 모델의 결과 확인하기 (prediction)


In [35]:
review_by_group_df.head()

Unnamed: 0,user,item,rating
0,023s6EPaKsQfXqE,18,4.0
1,023s6EPaKsQfXqE,25,5.0
2,03Opy95xbHus7pi,2,4.75
3,03Opy95xbHus7pi,3,4.333333
4,03Opy95xbHus7pi,4,4.0


In [36]:
review_by_group_df[review_by_group_df.user=="023s6EPaKsQfXqE"]

Unnamed: 0,user,item,rating
0,023s6EPaKsQfXqE,18,4.0
1,023s6EPaKsQfXqE,25,5.0


In [37]:
prediction = knn_algo.predict(uid="023s6EPaKsQfXqE", iid=18)
prediction

Prediction(uid='023s6EPaKsQfXqE', iid=18, r_ui=None, est=4.246930084936472, details={'actual_k': 2, 'was_impossible': False})

In [38]:
prediction = knn_algo.predict(uid="023s6EPaKsQfXqE", iid=19)
prediction

Prediction(uid='023s6EPaKsQfXqE', iid=19, r_ui=None, est=4.4311969814238195, details={'actual_k': 2, 'was_impossible': False})

### 문제 38. 훈련된 모델을 사용하여 user에게 item을 추천해주는 함수 만들기


In [39]:
review_by_group_df.columns = ["user", "item", "rating"]

In [40]:
review_by_group_df.columns = ["user", "item", "rating"]

In [41]:
# 추천함수 생성하기
def recommender_func(algo, df, uid, item_cnt=30):
  item_group_cnt = df.item.nunique()
  user_hist_df = df[df.user == uid]
  rating_dict = user_hist_df.set_index("item").to_dict()["rating"]

  for i in range(0, item_group_cnt+1):
      if i not in rating_dict:
          rating_dict[i] = round(algo.predict(uid=uid, iid=i).est,2)
          print(i, " >> " ,rating_dict[i])
      else: print(i, " >> " ,rating_dict[i], round(algo.predict(uid=uid, iid=i).est,2))

  pre_rating_df = pd.DataFrame(rating_dict.items(), columns = ["item_id", "pre_rating"])
  pre_rating_df = pre_rating_df.sort_values(by="pre_rating", ascending=False)
  iid_list = pre_rating_df.head(item_cnt)["item_id"].to_list()

  return pre_rating_df, iid_list


In [42]:
pre_rating_df, iid_list = recommender_func(knn_algo, review_by_group_df, uid="023s6EPaKsQfXqE", item_cnt=30)

0  >>  4.35
1  >>  4.46
2  >>  4.3
3  >>  4.45
4  >>  4.31
5  >>  4.42
6  >>  4.47
7  >>  4.61
8  >>  4.33
9  >>  4.53
10  >>  4.66
11  >>  4.46
12  >>  4.48
13  >>  4.41
14  >>  4.56
15  >>  4.45
16  >>  4.37
17  >>  4.48
18  >>  4.0 4.25
19  >>  4.43
20  >>  4.43
21  >>  4.51
22  >>  4.09
23  >>  4.28
24  >>  4.24
25  >>  5.0 4.75
26  >>  4.49
27  >>  4.29
28  >>  4.53
29  >>  4.53
30  >>  4.51
31  >>  4.54
32  >>  4.55
33  >>  4.58
34  >>  4.69
35  >>  4.17
36  >>  4.28
37  >>  4.61
38  >>  4.48
39  >>  4.24
40  >>  4.32
41  >>  4.67
42  >>  4.39
43  >>  4.62
44  >>  2.98
45  >>  4.54
46  >>  4.52
47  >>  4.48
48  >>  4.51
49  >>  4.17
