<a href="https://colab.research.google.com/github/yutak1017/RPG_GAME/blob/main/Week12_2_ItemRecommender(Project)_VectorDB%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 12-2: ItemRecommender (Project) with Vector DB
**Date:** 2025-11-19  
**Instructor:** Hong-Kyun Bae, Kookmin University

---

## [Contents]

**Content-based Recommendation with Qdrant Vector DB**


---

In [1]:
# Install the Qdrant Python client (--quiet suppresses most of pip's output)
!pip install qdrant-client --quiet

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 377.2/377.2 kB 5.7 MB/s eta 0:00:00

In [2]:
# 라이브러리 로드
import pandas as pd
import numpy as np

### 1. MovieLens 데이터셋 불러오기

In [3]:
# Download the MovieLens "ml-latest-small" archive and extract it.
# wget -nc skips the download when the zip file already exists;
# unzip -n never overwrites files that were already extracted.
!wget -nc https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -n ml-latest-small.zip

--2025-12-18 13:52:38--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2025-12-18 13:52:38 (6.60 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [4]:
# Load the two MovieLens tables used in this notebook:
#   movies.csv  -> movieId, title, genres
#   ratings.csv -> userId, movieId, rating, timestamp
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ("ml-latest-small/movies.csv", "ml-latest-small/ratings.csv")
)

### 2. 컨텐츠 벡터 생성 (feature engineering)

- 장르 (genre): multi-hot 인코딩
- 평균 평점 (avg_rating)
- 평점 수 (rating_count)
- 개봉 연도 (year)

In [5]:
# A) Per-movie rating statistics: mean rating and number of ratings.

# The steps are spelled out one at a time for readability; they are equivalent to
#   ratings.groupby("movieId")["rating"].agg(["mean", "count"]).reset_index()
grouped = ratings.groupby("movieId")
grouped_rating = grouped["rating"]
agg_values = grouped_rating.agg(["mean", "count"])

# Turn movieId back into a regular column and give the statistics descriptive names.
agg = agg_values.reset_index().rename(
    columns={"mean": "avg_rating", "count": "rating_count"}
)

In [6]:
# B) Attach the per-movie rating statistics to the movie table.

# Drop statistics left over from a previous run of this cell first. Without this,
# a second merge would produce suffixed columns (avg_rating_x / avg_rating_y)
# and the fillna lines below would fail with a KeyError.
movies = movies.drop(columns=["avg_rating", "rating_count"], errors="ignore")

# Left join keeps every movie, including those that have no rating at all.
movies = movies.merge(agg, on="movieId", how="left")

# Movies without ratings: fall back to the mean of the per-movie averages
# and a rating count of 0.
movies["avg_rating"] = movies["avg_rating"].fillna(movies["avg_rating"].mean())
movies["rating_count"] = movies["rating_count"].fillna(0)

movies.head()

Unnamed: 0,movieId,title,genres,avg_rating,rating_count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7.0
4,5,Father of the Bride Part II (1995),Comedy,3.071429,49.0


In [7]:
# C) Multi-hot encode the genres: split each "|"-separated genre string into one 0/1 column per genre
genres_dummies = movies["genres"].str.get_dummies(sep="|")
genres_dummies.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# D) Derive the release year from the title and rescale the numeric features.

# Take the first four-digit number in parentheses, e.g. "Toy Story (1995)" -> 1995.0.
# Titles without such a pattern yield NaN.
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)", expand=False).astype(float)

# Fill missing years with the median year.
movies["year"] = movies["year"].fillna(movies["year"].median())

# Bring the features onto scales comparable with the other features so the
# similarity computation is meaningful: log(1 + x) for the rating count and a
# z-score for the year.
movies["log_rating_count"] = np.log1p(movies["rating_count"])
movies["year_scaled"] = movies["year"].sub(movies["year"].mean()).div(movies["year"].std())

movies.loc[
    :, ["movieId", "title", "avg_rating", "rating_count", "log_rating_count", "year_scaled"]
].head()

Unnamed: 0,movieId,title,avg_rating,rating_count,log_rating_count,year_scaled
0,1,Toy Story (1995),3.92093,215.0,5.375278,0.020542
1,2,Jumanji (1995),3.431818,110.0,4.70953,0.020542
2,3,Grumpier Old Men (1995),3.259615,52.0,3.970292,0.020542
3,4,Waiting to Exhale (1995),2.357143,7.0,2.079442,0.020542
4,5,Father of the Bride Part II (1995),3.071429,49.0,3.912023,0.020542


In [9]:
# E) Assemble the final content vectors: the genre indicator columns followed by
#    the three numeric features.
feature_cols = [*genres_dummies.columns, "avg_rating", "log_rating_count", "year_scaled"]

item_vectors = pd.concat(
    [genres_dummies, movies.loc[:, ["avg_rating", "log_rating_count", "year_scaled"]]],
    axis="columns",
)

print("벡터 차원 수:", len(item_vectors.columns))
item_vectors.head()

벡터 차원 수: 23


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,avg_rating,log_rating_count,year_scaled
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,3.92093,5.375278,0.020542
1,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,3.431818,4.70953,0.020542
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,3.259615,3.970292,0.020542
3,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,2.357143,2.079442,0.020542
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,3.071429,3.912023,0.020542


### 3. 유사한 아이템 찾기 (with Qdrant Vector DB)

아이템 컨텐츠 벡터를 Qdrant Vector DB에 저장. 그 다음, Qdrant의 search 기능을 통해 유사한 아이템 찾음

In [10]:
# Qdrant 라이브러리 로드
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

### (1) 인메모리 Qdrant 인스턴스 생성 & 컬렉션 생성

In [11]:
# Create an in-memory Qdrant instance: it runs purely in memory and nothing is
# written to disk.
client = QdrantClient(":memory:")

vector_dim = item_vectors.shape[1]  # number of columns of item_vectors = dimensionality of each vector
print("벡터 차원:", vector_dim)

# (Re)create the collection that stores the item vectors.
# recreate_collection() is deprecated in qdrant-client, so an existing collection
# is dropped explicitly before it is created again.
# Available distance metrics: Distance.COSINE, Distance.DOT, Distance.EUCLID, Distance.MANHATTAN
if client.collection_exists(collection_name="movies"):
    client.delete_collection(collection_name="movies")

client.create_collection(
    collection_name="movies",
    # size: dimensionality of each vector, distance: rank neighbours by cosine similarity
    vectors_config=VectorParams(size=vector_dim, distance=Distance.COSINE),
)

벡터 차원: 23


  client.recreate_collection(


True

### (2) Qdrant에 넣을 포인트 (point) 구성

- Point: Qdrant에 저장되는 하나의 레코드 (id + 벡터 + payload). 아래 코드의 `points`는 upsert (삽입) 할 Point들을 담는 리스트
- Payload: 벡터와 함께 저장될 메타데이터



In [12]:
# Build the list of points (id + vector + payload) to upload to Qdrant.
from qdrant_client.models import PointStruct

# The feature table is converted to a float32 matrix once and walked in lockstep
# with the movie columns, instead of materialising a pandas row object for every
# movie via .iloc inside the loop. Rows of `movies` and `item_vectors` are
# matched by position, exactly as before.
points = [
    PointStruct(
        id=int(mid),
        vector=vec.tolist(),
        payload={  # metadata stored alongside the vector
            "movieId": int(mid),
            "title": title,
            "genres": genres,
        },
    )
    for mid, title, genres, vec in zip(
        movies["movieId"],
        movies["title"],
        movies["genres"],
        item_vectors.to_numpy(dtype="float32"),
    )
]

len(points)

9742

### (3) Qdrant에 벡터 및 메타데이터 업로드
- upsert: 삽입 (insert) 및 갱신 (update) 연산이 합쳐진 개념

In [13]:
# Upsert (insert-or-update) every entry of `points` into the "movies" collection.
#
# After this call Qdrant holds, for each movie:
#   - id
#   - vector (content vector)
#   - payload = {movieId, title, genres}
client.upsert(
    points=points,
    collection_name="movies",
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

### (4) Qdrant를 이용한 유사 아이템 찾기

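Qdrant가 쿼리 벡터와 가장 가까운 벡터들을 검색. 서버 모드에서는 HNSW 인덱스 기반의 ANN (Approximate Nearest Neighbor) 검색을 수행하지만, 이 노트북에서 사용하는 인메모리 로컬 모드 (`:memory:`) 는 저장된 모든 벡터와 직접 비교하는 정확 (exact) 검색으로 동작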

In [14]:
# Lookup table movieId -> float32 content vector, used to fetch the query vector
# of a target movie. The feature table is converted to a float32 matrix once and
# its rows are paired with the movie ids by position, instead of extracting each
# row through .iloc.
movieid_to_vec = dict(
    zip(
        (int(mid) for mid in movies["movieId"]),
        item_vectors.to_numpy(dtype="float32"),
    )
)

In [15]:
def recommend_similar_items_qdrant(
    target_mid: int, return_all: bool = False, k: int = 5
) -> list[tuple[int, float]]:
    """Recommend the top-k movies most similar to ``target_mid`` using Qdrant.

    The target movie's content vector is read from the module-level
    ``movieid_to_vec`` mapping and used to query the "movies" collection of the
    module-level ``client``. The target movie itself is removed from the hits.

    Args:
        target_mid: movieId of the query movie; must be a key of ``movieid_to_vec``.
        return_all: Accepted for backward compatibility; it currently has no effect.
        k: Maximum number of recommendations to return.

    Returns:
        At most ``k`` ``(movieId, score)`` tuples in the order Qdrant returned
        them, where ``score`` is the similarity score reported by Qdrant (cosine
        similarity when the collection is configured with ``Distance.COSINE``).
        An empty list when ``k <= 0``.

    Raises:
        KeyError: If ``target_mid`` is not present in ``movieid_to_vec``.
    """
    target_vec = movieid_to_vec[target_mid]

    # Nothing was requested. Without this guard k == 0 would still query one
    # point and could return it when the best hit is not the target itself.
    if k <= 0:
        return []

    # Ask for one extra hit because the target's own vector is normally among
    # the nearest neighbours and gets filtered out below.
    response = client.query_points(
        collection_name="movies",
        query=target_vec,
        limit=k + 1,
        with_payload=True,
    )

    # Each hit carries the point id, the score and the payload (metadata).
    recs = [
        (hit.payload["movieId"], hit.score)
        for hit in response.points
        if hit.payload["movieId"] != target_mid
    ]

    # If the target was not among the hits there are k + 1 candidates; trim to k.
    return recs[:k]

### (5) 테스트

In [16]:
# Pick one random movie as the query target.
mid = int(movies.sample(1)["movieId"].iloc[0])
target_row = movies.loc[movies.movieId == mid].iloc[0]

print("타겟 영화:", target_row["title"])
print()

# Print the five most similar movies with their similarity scores.
# The loop variable is named rec_mid so it does not overwrite the target id in `mid`.
topk_qd = recommend_similar_items_qdrant(mid, k=5)
for rec_mid, score in topk_qd:
    title = movies.loc[movies.movieId == rec_mid, "title"].values[0]
    print(f"{title} (score={score:.4f})")

타겟 영화: 1941 (1979)

Catch-22 (1970) (score=0.9911)
Bananas (1971) (score=0.9842)
Best Defense (1984) (score=0.9803)
Kelly's Heroes (1970) (score=0.9657)
Russians Are Coming, the Russians Are Coming, The (1966) (score=0.9566)
