<a href="https://colab.research.google.com/github/yutak1017/RPG_GAME/blob/main/Week12_2_ItemRecommender(Project)%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 12-2: ItemRecommender (Project)
**Date:** 2025-11-19  
**Instructor:** Hong-Kyun Bae, Kookmin University

---

## [Contents]

**Content-based Recommendation**

   - MovieLens 데이터셋 로드
   - MovieLens 데이터셋을 이용하여 컨텐츠 벡터 생성
   - 코사인 유사도를 기반으로 각 아이템과 유사한 아이템 확인
   - 각 유저에게 top-k 아이템 추천

---

In [1]:
# 라이브러리 로드
import pandas as pd
import numpy as np

### 1. MovieLens 데이터셋 불러오기

In [2]:
# MovieLens 데이터셋 다운로드 및 압축 해제
!wget -nc https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -n ml-latest-small.zip

--2025-12-18 13:45:06--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2025-12-18 13:45:06 (6.67 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [3]:
# movies.csv, ratings.csv 데이터 로드
movies = pd.read_csv("ml-latest-small/movies.csv")       # movieId, title, genres
ratings = pd.read_csv("ml-latest-small/ratings.csv")     # userId, movieId, rating, timestamp

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### 2. 컨텐츠 벡터 생성 (feature engineering)

- 장르 (genre): multi-hot 인코딩
- 평균 평점 (avg_rating)
- 평점 수 (rating_count)
- 개봉 연도 (year)

In [6]:
# A) 영화별 평균 평점, 평점 수 계산

grouped = ratings.groupby("movieId")
grouped_rating = grouped["rating"]
agg_values = grouped_rating.agg(["mean", "count"])
agg = agg_values.reset_index()
# agg = ratings.groupby("movieId")["rating"].agg(["mean", "count"]).reset_index()    # 위의 네 줄을 한 줄로 표현

agg.columns = ["movieId", "avg_rating", "rating_count"]

In [7]:
agg.head()

Unnamed: 0,movieId,avg_rating,rating_count
0,1,3.92093,215
1,2,3.431818,110
2,3,3.259615,52
3,4,2.357143,7
4,5,3.071429,49


In [8]:
# B) 컨텐츠 벡터와의 결합

# movies와 agg 간의 merge
movies = movies.merge(agg, on="movieId", how="left")

# 결측치 처리 (평점이 없는 영화)
movies["avg_rating"] = movies["avg_rating"].fillna(movies["avg_rating"].mean())
movies["rating_count"] = movies["rating_count"].fillna(0)

movies.head()

Unnamed: 0,movieId,title,genres,avg_rating,rating_count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7.0
4,5,Father of the Bride Part II (1995),Comedy,3.071429,49.0


In [9]:
# C) 장르를 multi-hot 인코딩으로 변환
genres_dummies = movies["genres"].str.get_dummies(sep="|")
genres_dummies.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# D) 영화 제목에서 연도 (year) 추출
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)").astype(float)

# 연도가 없는 경우 중앙값으로 대체
movies["year"] = movies["year"].fillna(movies["year"].median())

# rating_count 로그 변환, year 표준화 (다른 feature 값들과의 스케일을 맞춰주기 위함 --> 올바른 유사도 계산)
movies["log_rating_count"] = np.log1p(movies["rating_count"])
movies["year_scaled"] = (movies["year"] - movies["year"].mean()) / movies["year"].std()    # Z-score 표준화 수행

movies[["movieId", "title", "avg_rating", "rating_count", "log_rating_count", "year_scaled"]].head()

Unnamed: 0,movieId,title,avg_rating,rating_count,log_rating_count,year_scaled
0,1,Toy Story (1995),3.92093,215.0,5.375278,0.020542
1,2,Jumanji (1995),3.431818,110.0,4.70953,0.020542
2,3,Grumpier Old Men (1995),3.259615,52.0,3.970292,0.020542
3,4,Waiting to Exhale (1995),2.357143,7.0,2.079442,0.020542
4,5,Father of the Bride Part II (1995),3.071429,49.0,3.912023,0.020542


In [11]:
# E) 최종 컨텐츠 벡터 구성
feature_cols = list(genres_dummies.columns) + ["avg_rating", "log_rating_count", "year_scaled"]

item_vectors = pd.concat(
    [genres_dummies, movies[["avg_rating", "log_rating_count", "year_scaled"]]],
    axis=1
)

print("벡터 차원 수:", item_vectors.shape[1])
item_vectors.head()

벡터 차원 수: 23


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,avg_rating,log_rating_count,year_scaled
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,3.92093,5.375278,0.020542
1,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,3.431818,4.70953,0.020542
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,3.259615,3.970292,0.020542
3,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,2.357143,2.079442,0.020542
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,3.071429,3.912023,0.020542


### 3. 유사한 아이템 찾기

타겟 아이템의 벡터와 모든 영화 아이템 벡터 사이의 코사인 유사도 직접 계산 후, 타겟 아이템과 가장 유사한 top-K개의 영화 아이템들을 찾음

In [12]:
movieid_to_vec = {
    int(mid): item_vectors.iloc[i].values.astype("float32")
    for i, mid in enumerate(movies["movieId"])
}

movie_ids = list(movieid_to_vec.keys())

list(movieid_to_vec.items())[:3]    # 첫 3개의 영화 컨텐츠 벡터 정보 출력

[(1,
  array([0.        , 0.        , 1.        , 1.        , 1.        ,
         1.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         3.9209301 , 5.3752785 , 0.02054237], dtype=float32)),
 (2,
  array([0.        , 0.        , 1.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         3.4318182 , 4.7095304 , 0.02054237], dtype=float32)),
 (3,
  array([0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         3.2596154 , 3.9702919 

In [13]:
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """두 벡터 a, b 사이의 코사인 유사도 계산"""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

In [14]:
def find_similar_items(target_mid: int, return_all: bool = False, k: int = 5):
    """
    타겟 아이템과 모든 영화와의 코사인 유사도 계산
    - return_all=True 일 경우, 유사도 전체 리스트를 내림차순으로 반환
    - return_all=False 일 경우, 상위 k개만 반환
    """
    target_vec = movieid_to_vec[target_mid]
    sims = []

    for mid in movie_ids:
        if mid == target_mid:
            continue
        sim = cosine_sim(target_vec, movieid_to_vec[mid])
        sims.append((mid, sim))

    # 유사도 기준 내림차순 정렬 (가장 유사한 것부터 차례대로)
    sims.sort(key=lambda x: x[1], reverse=True)

    if return_all:
        return sims
    else:
        return sims[:k]

In [15]:
# Test
k = 5

mid = int(movies.sample(1)["movieId"].iloc[0])
target_row = movies.loc[movies.movieId == mid].iloc[0]

print("===타겟 영화===")
print(f"movieId: {mid}")
print(f"title: {target_row['title']}")
print(f"genres: {target_row['genres']}")
print(f"avg_rating: {target_row['avg_rating']}")
print(f"rating_count: {target_row['rating_count']}")
print(f"year: {target_row['year']}")
print()

all_sims = find_similar_items(mid, True, k)

# 1. 가장 유사한 5개의 영화
best_k = all_sims[:k]
print("===가장 유사한 5개의 영화===")
for mid, sim in best_k:
  row = movies.loc[movies.movieId == mid].iloc[0]
  print(f"[sim={sim:.4f}]")
  print(f"movieId: {mid}")
  print(f"title: {row['title']}")
  print(f"genres: {row['genres']}")
  print(f"avg_rating: {row['avg_rating']}")
  print(f"rating_count: {row['rating_count']}")
  print(f"year: {row['year']}")
  print()

# 2. 가장 유사하지 않은 5개의 영화
worst_k = all_sims[-k:]
print("===가장 유사하지 않은 5개의 영화===")
for mid, sim in worst_k:
  row = movies.loc[movies.movieId == mid].iloc[0]
  print(f"[sim={sim:.4f}]")
  print(f"movieId: {mid}")
  print(f"title: {row['title']}")
  print(f"genres: {row['genres']}")
  print(f"avg_rating: {row['avg_rating']}")
  print(f"rating_count: {row['rating_count']}")
  print(f"year: {row['year']}")
  print()

===타겟 영화===
movieId: 70206
title: Collector, The (2009)
genres: Crime|Horror|Thriller
avg_rating: 3.5
rating_count: 1.0
year: 2009.0

===가장 유사한 5개의 영화===
[sim=0.9981]
movieId: 60832
title: Pathology (2008)
genres: Crime|Horror|Thriller
avg_rating: 4.0
rating_count: 1.0
year: 2008.0

[sim=0.9962]
movieId: 84189
title: I Spit on Your Grave (2010)
genres: Crime|Horror|Thriller
avg_rating: 3.75
rating_count: 2.0
year: 2010.0

[sim=0.9934]
movieId: 112727
title: Deliver Us from Evil (2014)
genres: Crime|Horror|Thriller
avg_rating: 3.0
rating_count: 1.0
year: 2014.0

[sim=0.9817]
movieId: 63436
title: Saw V (2008)
genres: Crime|Horror|Thriller
avg_rating: 3.1666666666666665
rating_count: 3.0
year: 2008.0

[sim=0.9815]
movieId: 4645
title: Cure (1997)
genres: Crime|Horror|Thriller
avg_rating: 4.5
rating_count: 1.0
year: 1997.0

===가장 유사하지 않은 5개의 영화===
[sim=0.0832]
movieId: 5560
title: À nous la liberté (Freedom for Us) (1931)
genres: Comedy|Musical
avg_rating: 1.0
rating_count: 1.0
year: 1931

###4. 유저별 top-k 아이템 추천

In [16]:
# A) 유저별 8:2의 비율로 트레이닝/테스트셋 분할

train_idx = []
test_idx = []

for uid, user_df in ratings.groupby("userId"):
    # 유저별로 각 유저가 본 영화들을 랜덤하게 섞음 (랜덤 시드는 42로 고정)
    user_df = user_df.sample(frac=1, random_state=42)
    n = len(user_df)
    if n == 1:
        # 평점을 1개만 갖고 있는 유저는 트레이닝셋에만 넣음 (테스트에선 제외)
        train_idx.extend(user_df.index)
        continue
    n_train = max(1, int(n * 0.8))  # 트레이닝셋 사이즈
    train_idx.extend(user_df.iloc[:n_train].index)  # 유저의 트레이닝셋에 속하는 행들의 인덱스를 각각의 원소로서 저장
    test_idx.extend(user_df.iloc[n_train:].index)

train = ratings.loc[train_idx].copy()  # train_idx에 모아둔 모든 유저의 트레이닝셋 행(인덱스)들을 이용하여 전체 트레이닝셋 구성
test  = ratings.loc[test_idx].copy()

print("전체:", len(ratings))
print("트레이닝셋 사이즈:", len(train))
print("테스트셋 사이즈:", len(test))


전체: 100836
트레이닝셋 사이즈: 80419
테스트셋 사이즈: 20417


In [17]:
# B) 유저별 트레이닝/테스트 아이템 집합 구성

user_train_items = train.groupby("userId")["movieId"].apply(set)  # 유저별 트레이닝셋에서의 아이템 집합
user_test_items  = test.groupby("userId")["movieId"].apply(set)  # 유저별 테스트셋에서의 아이템 집합

user_train_items.head()

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
1,"{2048, 1024, 1, 1029, 2054, 1031, 1032, 6, 205..."
2,"{115713, 122882, 80906, 114060, 131724, 77455,..."
3,"{5764, 647, 3210, 527, 914, 1302, 31, 2080, 59..."
4,"{1025, 3591, 3079, 2571, 3083, 21, 1046, 2583,..."
5,"{1, 515, 261, 266, 527, 531, 21, 534, 150, 153..."


In [18]:
# C) 각 유저에 대해 top-k 아이템 추천 후, 정확도 평가

import random
random.seed(42)

def evaluate_precision_recall_at_k(k: int = 5):
    precisions = []
    recalls = []

    # 트레이닝셋 및 테스트셋 둘 다에 속하는 유저만 평가 대상
    eval_users = set(user_train_items.index) & set(user_test_items.index)

    for uid in eval_users:
        train_items = list(user_train_items[uid])
        test_items  = list(user_test_items[uid])

        # 트레이닝 또는 테스트 아이템 없을 경우, 현재 유저는 스킵 (논리적으론 불필요 but, 안정성 보장)
        if len(test_items) == 0 or len(train_items) == 0:
            continue

        # 현재 유저가 트레이닝셋에서 본 영화 중 임의의 한 편을 대표 영화로서 선택
        # (movieid_to_vec에 벡터가 있는 영화만 선택)
        random.shuffle(train_items)
        target_mid = None
        for mid in train_items:
            if mid in movieid_to_vec:
                target_mid = mid
                break
        if target_mid is None:
            continue  # 현재 유저는 대표 영화로서 사용 가능한 영화가 없음 (스킵)

        # 유저의 대표 영화 기반 top-k 추천
        recs = find_similar_items(target_mid, k=k)
        rec_movie_ids = [mid for (mid, sim) in recs]

        # hits: 추천 목록 중에서 이 유저가 테스트셋에서도 본 영화들 (즉, 맞힌 아이템들)
        test_set = set(test_items)
        hit_set = set(rec_movie_ids) & test_set

        # precision, recall: 추천 정확도 평가 메트릭
        precision = len(hit_set) / k
        recall    = len(hit_set) / len(test_set)

        precisions.append(precision)
        recalls.append(recall)

    if not precisions:
        return 0.0, 0.0

    return float(np.mean(precisions)), float(np.mean(recalls))


In [19]:
K = 5
p_at_k, r_at_k = evaluate_precision_recall_at_k(k=K)

print(f"Precision@{K}: {p_at_k:.4f}")
print(f"Recall@{K}   : {r_at_k:.4f}")

Precision@5: 0.0328
Recall@5   : 0.0095


#### 랜덤 추천 (베이스라인)

In [20]:
import random
random.seed(42)

all_movie_ids = list(movieid_to_vec.keys())  # 벡터가 존재하는 영화들

def evaluate_random_at_k(k: int = 5):
    precisions = []
    recalls = []

    eval_users = set(user_train_items.index) & set(user_test_items.index)

    for uid in eval_users:
        train_items = list(user_train_items[uid])
        test_items  = list(user_test_items[uid])

        if len(test_items) == 0 or len(train_items) == 0:
            continue

        # 랜덤 추천
        # 유저가 이미 본 영화는 제외
        seen_items = set(train_items)
        candidate_items = [mid for mid in all_movie_ids if mid not in seen_items]

        # 후보 영화가 k개 미만일 경우 스킵
        if len(candidate_items) < k:
            continue

        # 랜덤 샘플링으로 k개 추천
        rec_movie_ids = random.sample(candidate_items, k)

        hit_set = set(rec_movie_ids) & set(test_items)

        precision = len(hit_set) / k
        recall    = len(hit_set) / len(test_items)

        precisions.append(precision)
        recalls.append(recall)

    if not precisions:
        return 0.0, 0.0

    return float(np.mean(precisions)), float(np.mean(recalls))


In [21]:
K = 5
p_rand, r_rand = evaluate_random_at_k(k=K)

print(f"Precision@{K}: {p_rand:.4f}")
print(f"Recall@{K}   : {r_rand:.4f}")

Precision@5: 0.0043
Recall@5   : 0.0009
