# Baseline Item-based Collaborative Filtering (Raw Ratings)

편향 보정이나 추가 EDA 없이 **raw rating만 사용하는** Item-based 협업필터링 베이스라인 모델입니다.

구성:
1. 데이터 로드
2. 벡터화 (user-item, item-user 행렬)
3. 아이템 간 유사도 (dot product)
4. 추천 함수 구현 (유저 기준 / 영화 기준)
5. 예측 함수 구현
6. 예측용 테스트 데이터 추출
7. 예측 평점 조인
8. MAE / MSE / RMSE 성능 평가


## 1. 데이터 로드

In [8]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display

# 데이터 경로 설정 (필요시 수정)
DATA_DIR = Path("data")

MOVIE_DATA_PATH = DATA_DIR / "movies_refined.csv"
RATING_DATA_PATH = DATA_DIR / "ratings_refined.csv"

print("MOVIE_DATA_PATH:", MOVIE_DATA_PATH)
print("RATING_DATA_PATH:", RATING_DATA_PATH)

# CSV 로드
movies_df = pd.read_csv(MOVIE_DATA_PATH)
ratings_df = pd.read_csv(RATING_DATA_PATH)

# 타입 정리 (id는 정수, rating은 float)
movies_df["movie_id"] = movies_df["movie_id"].astype(int)
ratings_df["user_id"] = ratings_df["user_id"].astype(int)
ratings_df["movie_id"] = ratings_df["movie_id"].astype(int)

print("movies_df shape:", movies_df.shape)
print("ratings_df shape:", ratings_df.shape)

display(movies_df.head())
display(ratings_df.head())

MOVIE_DATA_PATH: data/movies_refined.csv
RATING_DATA_PATH: data/ratings_refined.csv
movies_df shape: (9703, 4)
ratings_df shape: (100785, 4)


Unnamed: 0,movie_id,title,year,genres
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),1995,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),1995,Comedy Romance
3,4,Waiting to Exhale (1995),1995,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),1995,Comedy


Unnamed: 0,user_id,movie_id,rating,timestamp
0,429,420,2.0,828124615
1,429,222,4.0,828124615
2,429,343,3.0,828124615
3,429,349,3.0,828124615
4,429,351,4.0,828124615


## 2. 벡터화: user-item / item-user 행렬 만들기

In [9]:
# 영화-유저 평점 테이블 (title 포함) - 분석 및 평가용
movie_ratings_df = pd.merge(
    ratings_df,
    movies_df[["movie_id", "title"]],
    on="movie_id",
    how="left",
)

print("movie_ratings_df shape:", movie_ratings_df.shape)
display(movie_ratings_df.head())

# user_id × movie_id 행렬 (값 = raw rating)
#  - index: user_id
#  - columns: movie_id
user_item_matrix = movie_ratings_df.pivot_table(
    values="rating",
    index="user_id",
    columns="movie_id",
    fill_value=0.0,  # 안 본 영화는 0점 처리 (baseline)
)

print("user_item_matrix shape:", user_item_matrix.shape)
display(user_item_matrix.head())

# item-user 행렬 (영화 × 유저)
item_user_matrix = user_item_matrix.T

item_ids = item_user_matrix.index.to_numpy()   # 영화 ID 순서
user_ids = item_user_matrix.columns.to_numpy() # 유저 ID 순서

item_matrix = item_user_matrix.to_numpy()

print("item_user_matrix shape:", item_user_matrix.shape)

movie_ratings_df shape: (100785, 5)


Unnamed: 0,user_id,movie_id,rating,timestamp,title
0,429,420,2.0,828124615,Beverly Hills Cop III (1994)
1,429,222,4.0,828124615,Circle of Friends (1995)
2,429,343,3.0,828124615,"Baby-Sitters Club, The (1995)"
3,429,349,3.0,828124615,Clear and Present Danger (1994)
4,429,351,4.0,828124615,"Corrina, Corrina (1994)"


user_item_matrix shape: (610, 9685)


movie_id,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


item_user_matrix shape: (9685, 610)


## 3. 아이템 간 유사도 (dot product 기반 item-item similarity)

In [10]:
# item-item 유사도 행렬 (dot product)
#  - sim[i, j] = 영화 i 벡터와 영화 j 벡터의 내적
item_sim_matrix = np.dot(item_matrix, item_matrix.T)

# 확인용 DataFrame (index/columns = movie_id)
item_sim_df = pd.DataFrame(
    item_sim_matrix,
    index=item_ids,
    columns=item_ids,
)

print("item_sim_matrix shape:", item_sim_matrix.shape)
item_sim_df.iloc[:5, :5]

item_sim_matrix shape: (9685, 9685)


Unnamed: 0,1,2,3,4,5
1,3454.5,896.5,430.75,13.75,406.5
2,896.5,1380.25,259.0,26.0,239.5
3,430.75,259.0,609.25,15.0,231.0
4,13.75,26.0,15.0,43.25,27.75
5,406.5,239.5,231.0,27.75,501.75


## 4. 추천 함수 구현 (유저 기준 / 영화 기준)

(1) 유저 기준 추천

In [11]:
def recommend_for_user_baseline(
    target_user_id: int,
    top_n: int = 10,
):
    """
    편향 보정 없는 baseline item-based CF:
    - user-item 행렬의 raw rating 사용
    - item-item dot product 유사도 사용
    - 이미 본 영화는 추천에서 제외
    반환값: (추천 결과 DataFrame, 이미 본 영화 DataFrame)
    """
    if target_user_id not in user_item_matrix.index:
        print(f"[WARN] user_id {target_user_id} 는 user_item_matrix에 없습니다.")
        return (
            pd.DataFrame(columns=["movie_id", "score", "title", "genres"]),
            pd.DataFrame(columns=["movie_id", "rating", "title", "genres"]),
        )
    
    # 1) 해당 유저의 평점 벡터 (index=movie_id)
    user_ratings = user_item_matrix.loc[target_user_id]
    
    # 2) numpy 벡터로 변환 (영화 순서 = item_ids와 맞추기)
    user_vector = user_ratings.reindex(item_ids, fill_value=0.0).to_numpy()
    
    # 3) dot-product 기반 점수 계산
    scores = np.dot(user_vector, item_sim_matrix)  # shape: (n_items,)
    
    # 4) 이미 본 영화는 추천에서 제외
    already_rated_mask = user_ratings.reindex(
        item_ids,
        fill_value=0.0,
    ).to_numpy() != 0.0
    
    scores_filtered = scores.copy()
    scores_filtered[already_rated_mask] = -np.inf
    
    # 5) 추천 후보 수 확인
    n_candidates = np.isfinite(scores_filtered).sum()
    if n_candidates == 0:
        print("[INFO] 추천할 수 있는 영화가 없습니다.")
        rec_df = pd.DataFrame(columns=["movie_id", "score", "title", "genres"])
    else:
        n_top = min(top_n, n_candidates)
        top_indices = np.argpartition(-scores_filtered, n_top - 1)[:n_top]
        top_indices = top_indices[np.argsort(-scores_filtered[top_indices])]
        
        top_movie_ids = item_ids[top_indices]
        top_scores = scores_filtered[top_indices]
        
        rec_df = (
            pd.DataFrame({"movie_id": top_movie_ids, "score": top_scores})
            .merge(movies_df, on="movie_id", how="left")
            .reset_index(drop=True)
        )
    
    # 6) 해당 유저가 이미 본 영화 리스트 (raw rating 기준)
    user_seen_df = (
        movie_ratings_df[movie_ratings_df["user_id"] == target_user_id]
        [["movie_id", "title", "rating"]]
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    
    print(f"=== [Baseline] user_id {target_user_id} 가 이미 평가한 영화들 (상위 20개) ===")
    display(user_seen_df.head(20))
    
    print(f"\n=== [Baseline] user_id {target_user_id} 에게 추천하는 영화 Top {top_n} ===")
    display(rec_df)
    
    return rec_df, user_seen_df

(2) 영화 기준 추천

In [12]:
def recommend_similar_movies_baseline(
    target_movie_id: int,
    top_n: int = 10,
    include_self: bool = False,
):
    """
    편향 보정 없는 baseline item-item CF:
    - item-item dot product 유사도 사용
    - 특정 movie_id와 비슷한 영화 추천
    """
    if target_movie_id not in item_ids:
        print(f"[WARN] movie_id {target_movie_id} 는 item_ids에 없습니다.")
        return pd.DataFrame(columns=["movie_id", "similarity", "title", "genres"])
    
    target_idx = np.where(item_ids == target_movie_id)[0][0]
    sims = item_sim_matrix[target_idx].copy()
    
    if not include_self:
        sims[target_idx] = -np.inf
    
    n_candidates = np.isfinite(sims).sum()
    if n_candidates == 0:
        print("[INFO] 유사한 영화를 찾을 수 없습니다.")
        return pd.DataFrame(columns=["movie_id", "similarity", "title", "genres"])
    
    n_top = min(top_n, n_candidates)
    top_indices = np.argpartition(-sims, n_top - 1)[:n_top]
    top_indices = top_indices[np.argsort(-sims[top_indices])]
    
    top_movie_ids = item_ids[top_indices]
    top_scores = sims[top_indices]
    
    sim_df = (
        pd.DataFrame({"movie_id": top_movie_ids, "similarity": top_scores})
        .merge(movies_df, on="movie_id", how="left")
        .reset_index(drop=True)
    )
    
    return sim_df

(3) 한 사용자가 본 영화 중 임의로 1편을 골라서 해당 영화와 유사한 영화 top n개를 추천

In [33]:
from IPython.display import display
import numpy as np
import pandas as pd

def recommend_from_random_watched_movie(
    target_user_id: int,
    top_n: int = 10,
    random_state: int | None = None,
):
    """
    한 사용자가 본 영화 목록을 먼저 보여주고,
    그 중 임의로 1편을 골라서 해당 영화와 유사한 영화 top_n개를 추천하는 함수.

    - movie_ratings_df: user_id, movie_id, title, rating 이 들어있는 DF
    - item_sim_matrix: movie_id 기준 item-item 유사도 (dot product)
    - item_ids: item_sim_matrix의 movie_id 순서
    - movies_df: movie_id, title, genres 등 메타 정보
    """

    # user_id를 int로 캐스팅 (이미 int면 그대로, string "1"이면 1로)
    try:
        uid = int(target_user_id)
    except ValueError:
        uid = target_user_id  # 혹시 정수 변환이 안 되면 그대로 사용

    # 1) 해당 유저가 본 영화 목록 가져오기
    user_movies = movie_ratings_df[movie_ratings_df["user_id"] == uid].copy()

    if user_movies.empty:
        print(f"[WARN] user_id {target_user_id} 에 대한 시청 기록이 없습니다.")
        return None, pd.DataFrame(), pd.DataFrame()

    # 필요하면 movies_df에서 장르 정보 붙이기
    if "genres" not in user_movies.columns:
        user_movies = user_movies.merge(
            movies_df[["movie_id", "genres"]],
            on="movie_id",
            how="left",
        )

    # 평점 순으로 정렬해서 보기 좋게
    user_movies = user_movies.sort_values("rating", ascending=False)

    print(f"=== user_id {uid} 가 본 영화 목록 (평점 순, 상위 20개) ===")
    display(user_movies[["movie_id", "title", "genres", "rating"]].head(20))

    # 2) 그 중 임의의 영화 1편 선택
    if random_state is not None:
        chosen_row = user_movies.sample(n=1, random_state=random_state).iloc[0]
    else:
        chosen_row = user_movies.sample(n=1).iloc[0]

    # → if/else 밖에서 공통 처리
    chosen_movie_id = int(chosen_row["movie_id"])
    chosen_title = chosen_row["title"]
    chosen_genres = chosen_row["genres"]
    chosen_rating = chosen_row["rating"]

    print("\n=== 기준이 되는 영화 (랜덤 선택) ===")
    display(
        chosen_row[["movie_id", "title", "genres", "rating"]]
        .to_frame()
        .T
        .reset_index(drop=True)
    )


    # 3) 이 영화와 유사한 영화 top_n 추천
    if chosen_movie_id not in item_ids:
        print("[WARN] 선택된 영화가 item_ids에 없습니다. 유사도 계산 불가.")
        return chosen_row, user_movies, pd.DataFrame()

    # similarity vector 꺼내기
    idx = np.where(item_ids == chosen_movie_id)[0][0]
    sims = item_sim_matrix[idx].copy()

    # 자기 자신은 제외
    sims[idx] = -np.inf

    # 유효 후보 개수
    n_candidates = np.isfinite(sims).sum()
    if n_candidates == 0:
        print("[INFO] 유사한 영화를 찾을 수 없습니다.")
        return chosen_row, user_movies, pd.DataFrame()

    n_top = min(top_n, n_candidates)

    # 상위 top_n index 뽑기
    top_idx = np.argpartition(-sims, n_top - 1)[:n_top]
    top_idx = top_idx[np.argsort(-sims[top_idx])]

    rec_movie_ids = item_ids[top_idx]
    rec_scores = sims[top_idx]

    # 영화 메타데이터 붙이기
    rec_df = (
        pd.DataFrame({"movie_id": rec_movie_ids, "similarity": rec_scores})
        .merge(movies_df, on="movie_id", how="left")
        [["movie_id", "title", "genres", "similarity"]]
        .reset_index(drop=True)
    )

    print(f"\n=== '{chosen_title}' 와 유사한 영화 Top {top_n} ===")
    display(rec_df)

    # 기준 영화 1편 정보, 해당 유저의 전체 시청 목록, 추천 결과를 모두 반환
    return chosen_row, user_movies, rec_df


In [34]:
# === (선택) 예시 실행 ===
# recommend_for_user_baseline(1, top_n=10)
# recommend_similar_movies_baseline(1, top_n=10)
recommend_from_random_watched_movie(1, top_n=10, random_state=42)

=== user_id 1 가 본 영화 목록 (평점 순, 상위 20개) ===


Unnamed: 0,movie_id,title,genres,rating
116,1049,"Ghost and the Darkness, The (1996)",Action Adventure,5.0
106,3441,Red Dawn (1984),Action Drama War,5.0
168,1089,Reservoir Dogs (1992),Crime Mystery Thriller,5.0
169,1617,L.A. Confidential (1997),Crime Film-Noir Mystery Thriller,5.0
101,1275,Highlander (1986),Action Adventure Fantasy,5.0
100,1573,Face/Off (1997),Action Crime Drama Thriller,5.0
172,2580,Go (1999),Comedy Crime,5.0
174,1804,"Newton Boys, The (1998)",Crime Drama,5.0
97,2991,Live and Let Die (1973),Action Adventure Thriller,5.0
94,2529,Planet of the Apes (1968),Action Drama Sci-Fi,5.0



=== 기준이 되는 영화 (랜덤 선택) ===


Unnamed: 0,movie_id,title,genres,rating
0,1030,Pete's Dragon (1977),Adventure Animation Children Musical,3.0



=== 'Pete's Dragon (1977)' 와 유사한 영화 Top 10 ===


Unnamed: 0,movie_id,title,genres,similarity
0,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi,163.0
1,919,"Wizard of Oz, The (1939)",Adventure Children Fantasy Musical,157.0
2,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action Adventure,156.0
3,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure Sci-Fi,154.0
4,1270,Back to the Future (1985),Adventure Comedy Sci-Fi,149.5
5,2987,Who Framed Roger Rabbit? (1988),Adventure Animation Children Comedy Crime Fant...,146.0
6,1196,Star Wars: Episode V - The Empire Strikes Back...,Action Adventure Sci-Fi,145.5
7,1097,E.T. the Extra-Terrestrial (1982),Children Drama Sci-Fi,145.0
8,1073,Willy Wonka & the Chocolate Factory (1971),Children Comedy Fantasy Musical,141.5
9,356,Forrest Gump (1994),Comedy Drama Romance War,139.75


(user_id                                         1
 movie_id                                     1030
 rating                                        3.0
 timestamp                               964982903
 title                        Pete's Dragon (1977)
 genres       Adventure Animation Children Musical
 Name: 162, dtype: object,
      user_id  movie_id  rating  timestamp  \
 116        1      1049     5.0  964982400   
 106        1      3441     5.0  964982328   
 168        1      1089     5.0  964982951   
 169        1      1617     5.0  964982951   
 101        1      1275     5.0  964982290   
 ..       ...       ...     ...        ...   
 63         1      2253     2.0  964981775   
 202        1      2338     2.0  964983546   
 189        1      1219     2.0  964983393   
 179        1      2389     2.0  964983094   
 199        1      3176     1.0  964983504   
 
                                             title  \
 116            Ghost and the Darkness, The (1996)   
 106 

## 5. 예측 함수 (Baseline item-based CF, raw rating)

In [15]:
def predict_rating_item_based_baseline(user_id: int, movie_id: int) -> float:
    """
    편향 보정 없는 baseline item-based CF 예측 함수.
    - 전체 user_item_matrix / item_sim_matrix 사용
    - item-item dot product 유사도 사용
    - 예측 불가한 경우 np.nan 반환
    """
    # 데이터에 없는 유저/영화면 예측 불가
    if user_id not in user_item_matrix.index:
        return np.nan
    if movie_id not in item_ids:
        return np.nan
    
    # 해당 유저의 rating 벡터
    user_vector = user_item_matrix.loc[user_id]  # Series (index=movie_id)
    
    # target 영화의 similarity 벡터
    target_idx = np.where(item_ids == movie_id)[0][0]
    sims = item_sim_matrix[target_idx]  # shape: (n_items,)
    
    # 유저가 실제로 본 영화들만 사용 (rating != 0)
    rated_mask = (user_vector != 0.0).to_numpy()
    sims_rated = sims[rated_mask]
    ratings_rated = user_vector.to_numpy()[rated_mask]
    
    if len(ratings_rated) == 0:
        return np.nan
    
    denom = np.sum(np.abs(sims_rated))
    if denom == 0:
        return np.nan
    
    pred_rating = np.sum(sims_rated * ratings_rated) / denom
    
    # 평점 범위 (MovieLens 기준 0.5~5.0 가정)로 클리핑
    pred_rating = max(0.5, min(5.0, pred_rating))
    
    return float(pred_rating)

## 6. 예측 정확도 평가용 테스트 데이터 추출

In [16]:
from sklearn.model_selection import train_test_split

# 테스트용 샘플 데이터 추출
#  - movie_ratings_df에서 user_id, movie_id, title, rating만 사용
_, test_data = train_test_split(
    movie_ratings_df[["user_id", "movie_id", "title", "rating"]],
    test_size=20000,
    random_state=1234,
    stratify=movie_ratings_df["user_id"],
)

print("test_data shape:", test_data.shape)
display(test_data.head())

test_data shape: (20000, 4)


Unnamed: 0,user_id,movie_id,title,rating
62316,464,7481,Enemy Mine (1985),4.5
60208,590,4447,Legally Blonde (2001),3.0
24643,135,1653,Gattaca (1997),5.0
30554,288,2054,"Honey, I Shrunk the Kids (1989)",3.5
99865,258,4995,"Beautiful Mind, A (2001)",5.0


## 7. 테스트 데이터에 대한 예측 평점 계산 및 조인

In [17]:
# 테스트 데이터의 각 (user_id, movie_id)에 대해 예측 평점 계산
pred_rows = []

for row in test_data.itertuples():
    u = row.user_id
    m = row.movie_id
    t = row.title
    
    pred = predict_rating_item_based_baseline(u, m)
    pred_rows.append((u, t, pred))

all_pred_df = pd.DataFrame(pred_rows, columns=["user_id", "title", "pred_rating"])

print("all_pred_df shape:", all_pred_df.shape)
display(all_pred_df.head())

# 예측 평점 조인
test_data = pd.merge(test_data, all_pred_df, on=["user_id", "title"], how="left")
display(test_data.head())

all_pred_df shape: (20000, 3)


Unnamed: 0,user_id,title,pred_rating
0,464,Enemy Mine (1985),4.08398
1,590,Legally Blonde (2001),3.498596
2,135,Gattaca (1997),3.896117
3,288,"Honey, I Shrunk the Kids (1989)",3.451772
4,258,"Beautiful Mind, A (2001)",4.620966


Unnamed: 0,user_id,movie_id,title,rating,pred_rating
0,464,7481,Enemy Mine (1985),4.5,4.08398
1,590,4447,Legally Blonde (2001),3.0,3.498596
2,135,1653,Gattaca (1997),5.0,3.896117
3,288,2054,"Honey, I Shrunk the Kids (1989)",3.5,3.451772
4,258,4995,"Beautiful Mind, A (2001)",5.0,4.620966


## 8. MAE / MSE / RMSE 성능 평가

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 실제 평점과 예측 평점 배열로 변환
true_y = np.array(test_data["rating"])
pred_y = np.array(test_data["pred_rating"])

# MAE, MSE, RMSE 계산
mae = mean_absolute_error(y_true=true_y, y_pred=pred_y)
mse = mean_squared_error(y_true=true_y, y_pred=pred_y)
rmse = np.sqrt(mse)

print(f"MAE  : {round(mae, 2)}")
print(f"MSE  : {round(mse, 2)}")
print(f"RMSE : {round(rmse, 2)}")

MAE  : 0.7
MSE  : 0.84
RMSE : 0.92
