<a href="https://colab.research.google.com/github/unie12/recommendation/blob/main/hybrid_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CF 모델 로드 및 결과 반환


In [None]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

# cf 모델에서 사용한 CF
class CollaborativeFiltering:
    """Re-declaration of the CF class so a pickled CF model can be restored.

    ``predict_for_user`` reads ``user_id_map``, ``b_u``, ``b_d``, ``P`` and
    ``Q``; none of them is assigned in ``__init__``, so they are expected to
    arrive with the restored (unpickled) state.
    """

    def __init__(self, ratings_df_filtered, hyper_params):
        """Store the rating matrix, its dimensions and the hyper-parameters.

        Args:
            ratings_df_filtered: Object exposing ``row``, ``col`` and ``data``
                (presumably a scipy ``coo_matrix`` -- TODO confirm).
            hyper_params: Dict with the keys ``K``, ``alpha``, ``beta``,
                ``iterations``, ``verbose`` and ``batch_size``.
        """
        ratings = ratings_df_filtered
        self.R = ratings
        self.num_users = len(np.unique(ratings.row))
        self.num_items = len(np.unique(ratings.col))
        for name in ('K', 'alpha', 'beta', 'iterations', 'verbose', 'batch_size'):
            setattr(self, name, hyper_params[name])
        # Global mean of the stored ratings; also the fallback prediction.
        self.b = np.mean(ratings.data)

    def predict_for_user(self, user_id):
        """Return one predicted score per item for ``user_id``.

        Users missing from ``user_id_map`` get the global mean for every item.
        """
        if user_id not in self.user_id_map:
            return np.full(self.num_items, self.b)

        row = self.user_id_map[user_id]
        latent = np.dot(self.P[row], self.Q.T)
        return self.b + self.b_u[row] + self.b_d + latent

    def __getstate__(self):
        """Return the instance state without the rating matrix ``R``."""
        snapshot = dict(self.__dict__)
        snapshot.pop('R', None)
        return snapshot

    def __setstate__(self, state):
        """Restore the instance state; ``R`` is not persisted and becomes None."""
        vars(self).update(state)
        self.R = None

def load_recommender(model_path='/content/cf_model_v2.pkl'):
    """Load the saved CF bundle and return its parts.

    Args:
        model_path: Path of the joblib file holding the bundle.

    Returns:
        Tuple ``(model, user_map, movie_map, movies_df)`` taken from the
        bundle's keys of the same names.
    """
    bundle = joblib.load(model_path)
    wanted = ('model', 'user_map', 'movie_map', 'movies_df')
    return tuple(bundle[key] for key in wanted)

def get_movie_recommendations(user_id, cf_model, user_map, movies_df, top_n=10):
    """Return the top-N CF recommendations for one user, best first.

    Args:
        user_id: User ID as it appears in the ratings data.
        cf_model: Model exposing ``predict_for_user(index)`` and
            ``idx_item_map`` (position in the prediction vector -> movieId).
        user_map: Mapping from ``user_id`` to the index handed to the model
            (the ratings data and the CF model use different user IDs).
        movies_df: DataFrame with ``movieId``, ``title`` and ``genres`` columns.
        top_n: Maximum number of movies to return.

    Returns:
        A list of ``{'title', 'genres', 'predicted_rating'}`` dicts ordered by
        descending predicted rating. Movies without a row in ``movies_df`` are
        skipped, and a movieId listed more than once contributes one entry.
        On failure a dict ``{'error': message}`` is returned instead.
    """
    try:
        if user_id not in user_map:
            return {'error': f"사용자 ID {user_id}가 학습 데이터에 없습니다."}

        user_index = user_map[user_id]
        user_predictions = cf_model.predict_for_user(user_index)

        # Positions of the highest predicted ratings, best first.
        top_movie_indices = np.argsort(user_predictions)[::-1][:top_n]
        top_movie_ids = [cf_model.idx_item_map[idx] for idx in top_movie_indices]

        # Filtering with isin() yields rows in movies_df order, which loses the
        # ranking; index the few matching rows by movieId and walk the ranked
        # positions instead so the output stays sorted by predicted rating.
        catalogue = (
            movies_df[movies_df['movieId'].isin(top_movie_ids)]
            .drop_duplicates(subset='movieId')
            .set_index('movieId')
        )

        recommendations = []
        for idx, movie_id in zip(top_movie_indices, top_movie_ids):
            if movie_id not in catalogue.index:
                continue
            movie = catalogue.loc[movie_id]
            recommendations.append({
                'title': movie['title'],
                'genres': movie['genres'],
                'predicted_rating': float(user_predictions[idx])
            })

        return recommendations

    except Exception as e:
        # Callers receive the failure as a dict rather than an exception.
        return {'error': f"추천 생성 중 오류 발생: {str(e)}"}

컨텐츠 기반 추천 모델 로드 및 결과 반환

In [None]:
!pip install annoy==1.17.1

In [None]:
import os
import pickle
from annoy import AnnoyIndex

def load_content_model(model_dir='content_model', vector_dimension=768, metric='angular'):
    """Load the saved content-based model: Annoy index plus its metadata.

    Args:
        model_dir: Directory containing ``content_model_data.pkl`` and
            ``movie_similarity.ann``.
        vector_dimension: Dimension of the vectors stored in the index. The
            default of 768 is the BERT embedding dimension used by this
            notebook; it must match the value the index was built with.
        metric: Annoy metric the index was built with.

    Returns:
        Tuple ``(annoy_index, model_data)``.
    """
    # SECURITY: pickle.load runs arbitrary code from the file; only point this
    # at model files you created or otherwise trust.
    with open(os.path.join(model_dir, 'content_model_data.pkl'), 'rb') as f:
        model_data = pickle.load(f)

    annoy_index = AnnoyIndex(vector_dimension, metric)
    annoy_index.load(os.path.join(model_dir, 'movie_similarity.ann'))

    return annoy_index, model_data

def recommend_from_loaded_model(movie_list, model_data, annoy_index, n_recommendations=4):
    """Recommend movies similar to the titles in ``movie_list``.

    Args:
        movie_list: Titles the user likes, spelled as in ``title_to_index``.
        model_data: Dict with ``title_to_index`` and ``index_to_title``.
        annoy_index: Index exposing ``get_nns_by_item`` and ``get_distance``.
        n_recommendations: Maximum number of neighbours kept per seed title.

    Returns:
        List of ``(title, {'score', 'based_on'})`` pairs sorted by descending
        score. A title reachable from several seeds keeps its best score and
        the seed that produced it. Seeds missing from ``title_to_index`` are
        reported on stdout and skipped.
    """
    title_to_index = model_data['title_to_index']
    index_to_title = model_data['index_to_title']

    movie_list = list(movie_list)
    favorites = set(movie_list)
    # The seed itself and the user's other favourites are filtered out of the
    # neighbour list, so ask for enough extra neighbours that up to
    # n_recommendations can still remain afterwards.
    n_fetch = n_recommendations + len(favorites)

    recommendations = {}
    for movie in movie_list:
        try:
            movie_idx = title_to_index[movie]
        except KeyError:
            print(f"Movie '{movie}' not found in the database.")
            continue

        neighbours = [
            idx for idx in annoy_index.get_nns_by_item(movie_idx, n_fetch)
            if idx != movie_idx and index_to_title[idx] not in favorites
        ]

        for idx in neighbours[:n_recommendations]:
            similar_movie = index_to_title[idx]
            # NOTE(review): for Annoy's 'angular' metric the distance is
            # sqrt(2 * (1 - cos)), so this is a monotone proxy for similarity,
            # not the cosine itself -- confirm that is the intended score.
            similarity = 1 - annoy_index.get_distance(movie_idx, idx)
            best = recommendations.get(similar_movie)
            if best is None or similarity > best['score']:
                recommendations[similar_movie] = {
                    'score': similarity,
                    'based_on': movie
                }

    return sorted(
        recommendations.items(),
        key=lambda x: x[1]['score'],
        reverse=True
    )

In [None]:
# Load the content-based model from its default directory.
loaded_annoy_index, loaded_model_data = load_content_model()

# Seed titles used to request content-based recommendations.
favorite_movies = [
    "Dark Knight, The (2008)", "Inception (2010)",
    "Mother (Madeo) (2009)", "There Will Be Blood (2007)",
    "Thirst (Bakjwi) (2009)",
]

recommendations = recommend_from_loaded_model(
    favorite_movies, loaded_model_data, loaded_annoy_index
)

1. CF 모델과 컨텐츠 기반 모델의 결과 점수에 각각 가중치를 두어 최종 결과를 도출
  → 두 모델이 추천하는 영화가 아예 겹치지 않으므로 가중 합산은 의미가 없음

CF 추천 결과에 대해 컨텐츠 기반 추천을 돌려보고,
컨텐츠 기반 추천 결과에 대해서는 CF 모델을 돌려서
두 결과를 합치는 방식도 시도해 봤지만 유효하지 않았음

In [None]:
class HybridRecommender:
    """Blend collaborative-filtering and content-based recommendations.

    Scores from each source are min-max normalized to [0, 1] and combined as
    ``norm_cf * cf_weight + norm_content * content_weight``. A movie that one
    source did not recommend contributes 0 for that source.
    """

    def __init__(self, cf_weight=0.4, content_weight=0.6):
        """Store the blend weights; models are attached by ``load_models``."""
        self.cf_weight = cf_weight
        self.content_weight = content_weight
        self.cf_model = None
        self.user_map = None
        self.movie_map = None
        self.movies_df = None
        self.annoy_index = None
        self.content_model_data = None

    def load_models(self, cf_model_path='/content/model_data/cf_model_v2.pkl',
                   content_model_dir='/content/content_model'):
        """Load the CF bundle and the content model (Annoy index + metadata).

        Args:
            cf_model_path: joblib file with the keys ``model``, ``user_map``,
                ``movie_map`` and ``movies_df``.
            content_model_dir: Directory containing ``movie_similarity.ann``
                and ``content_model_data.pkl``.
        """
        cf_data = joblib.load(cf_model_path)
        self.cf_model = cf_data['model']
        self.user_map = cf_data['user_map']
        self.movie_map = cf_data['movie_map']
        self.movies_df = cf_data['movies_df']

        # 768 must match the dimension the index was built with.
        self.annoy_index = AnnoyIndex(768, 'angular')
        self.annoy_index.load(os.path.join(content_model_dir, 'movie_similarity.ann'))

        # SECURITY: pickle.load executes code from the file; trusted files only.
        with open(os.path.join(content_model_dir, 'content_model_data.pkl'), 'rb') as f:
            self.content_model_data = pickle.load(f)

    def __getstate__(self):
        """Return the state for pickling with the Annoy index cleared.

        The notebook notes that the Annoy index is a C++ object that cannot be
        serialized, so it is saved as ``None``; call ``load_models()`` after
        unpickling to attach it again.
        """
        state = self.__dict__.copy()
        state['annoy_index'] = None
        return state

    @staticmethod
    def _min_max_normalize(scores):
        """Scale the values of ``{key: score}`` to [0, 1].

        An empty mapping gives an empty result. When every score is equal
        (including a single entry) each key maps to 1.0, because the usual
        ``(x - min) / (max - min)`` would divide by zero.
        """
        if not scores:
            return {}
        low = min(scores.values())
        span = max(scores.values()) - low
        if span == 0:
            return {key: 1.0 for key in scores}
        return {key: (value - low) / span for key, value in scores.items()}

    def _genres_by_title(self, titles):
        """Return ``{title: genres}`` for the given titles found in ``movies_df``.

        The first matching row wins when a title occurs more than once.
        """
        rows = self.movies_df[self.movies_df['title'].isin(titles)]
        lookup = {}
        for title, genres in zip(rows['title'], rows['genres']):
            lookup.setdefault(title, genres)
        return lookup

    def _score_candidates(self, cf_recs, content_recs):
        """Build the per-movie score records used by ``get_recommendations``.

        Args:
            cf_recs: List of dicts with ``title`` and ``predicted_rating``.
            content_recs: List of ``(title, {'score': ...})`` pairs.

        Returns:
            Dict mapping title to ``final_score``, ``cf_score``,
            ``content_score``, ``norm_cf``, ``norm_content`` and ``genres``.
            CF titles come first, then content-only titles, so ties in the
            final score are ordered deterministically.
        """
        cf_scores = {rec['title']: rec['predicted_rating'] for rec in cf_recs}
        content_scores = {movie: data['score'] for movie, data in content_recs}
        norm_cf_by_title = self._min_max_normalize(cf_scores)
        norm_content_by_title = self._min_max_normalize(content_scores)

        all_movies = list(dict.fromkeys([*cf_scores, *content_scores]))
        genres = self._genres_by_title(all_movies) if all_movies else {}

        candidates = {}
        for movie in all_movies:
            norm_cf = norm_cf_by_title.get(movie, 0)
            norm_content = norm_content_by_title.get(movie, 0)
            candidates[movie] = {
                'final_score': (norm_cf * self.cf_weight
                                + norm_content * self.content_weight),
                'cf_score': cf_scores.get(movie, 0),
                'content_score': content_scores.get(movie, 0),
                'norm_cf': norm_cf,
                'norm_content': norm_content,
                'genres': genres.get(movie, '')
            }
        return candidates

    def get_recommendations(self, user_id, favorite_movies, n_recommendations=10):
        """Return the top hybrid recommendations and print a score breakdown.

        Args:
            user_id: User ID passed to the CF recommender.
            favorite_movies: Titles passed to the content-based recommender.
            n_recommendations: Number of results to return; each source is
                asked for twice as many candidates.

        Returns:
            List of ``(title, details)`` pairs sorted by descending
            ``final_score``. The list is empty when neither source produced a
            candidate.
        """
        # 1. Content-based candidates.
        content_recs = recommend_from_loaded_model(
            movie_list=favorite_movies,
            model_data=self.content_model_data,
            annoy_index=self.annoy_index,
            n_recommendations=n_recommendations * 2
        )

        # 2. CF candidates.
        cf_recs = get_movie_recommendations(
            user_id,
            self.cf_model,
            self.user_map,
            self.movies_df,
            top_n=n_recommendations * 2
        )
        # The CF helper reports failures as {'error': ...}; treat that as
        # "no CF candidates" instead of iterating over the dict's keys.
        if isinstance(cf_recs, dict):
            print(f"CF recommendations unavailable: {cf_recs.get('error', cf_recs)}")
            cf_recs = []

        # 3. Score every candidate from either source and rank them.
        candidates = self._score_candidates(cf_recs, content_recs)
        sorted_recs = sorted(candidates.items(), key=lambda x: x[1]['final_score'], reverse=True)
        top_recs = sorted_recs[:n_recommendations]

        print("\n=== 추천 영화 및 점수 상세 ===")
        for movie, data in top_recs:
            print(f"\n영화: {movie}")
            print(f"최종 점수: {data['final_score']:.3f}")
            print(f"CF 기여도: {data['norm_cf']:.3f} (원본: {data['cf_score']:.3f})")
            print(f"컨텐츠 기여도: {data['norm_content']:.3f} (원본: {data['content_score']:.3f})")
            print(f"장르: {data['genres']}")
            print("-" * 50)

        return top_recs

    def _combine_recommendations(self, cf_recs, content_recs, n_recommendations):
        """Merge two recommendation lists into one ranked list and print it.

        Args:
            cf_recs: List of dicts with ``title``, ``genres`` and
                ``predicted_rating``.
            content_recs: List of ``(title, data)`` pairs where ``data`` has a
                ``score`` and optionally ``genres``.
            n_recommendations: Number of results to return.

        Returns:
            List of ``(title, data)`` pairs sorted by descending
            ``final_score``; ``data`` holds the normalized ``cf_score`` and
            ``content_score`` plus the original scores that were available.
        """
        # Normalize per position so duplicate titles cannot skew min/max.
        norm_cf = self._min_max_normalize(
            {pos: rec['predicted_rating'] for pos, rec in enumerate(cf_recs)}
        )
        norm_content = self._min_max_normalize(
            {pos: data['score'] for pos, (_, data) in enumerate(content_recs)}
        )

        all_movies = {}
        for pos, rec in enumerate(cf_recs):
            all_movies[rec['title']] = {
                'cf_score': norm_cf[pos],
                'content_score': 0,  # default until a content score is seen
                'genres': rec['genres'],
                'original_cf_score': rec['predicted_rating']
            }

        for pos, (movie, data) in enumerate(content_recs):
            if movie in all_movies:
                all_movies[movie]['content_score'] = norm_content[pos]
                all_movies[movie]['original_content_score'] = data['score']
            else:
                all_movies[movie] = {
                    'cf_score': 0,  # not recommended by CF
                    'content_score': norm_content[pos],
                    'genres': data.get('genres', ''),
                    'original_content_score': data['score']
                }

        # Both weights are always applied, even when one score is 0.
        for data in all_movies.values():
            data['final_score'] = (data['cf_score'] * self.cf_weight +
                                  data['content_score'] * self.content_weight)

        sorted_recommendations = sorted(
            all_movies.items(),
            key=lambda x: x[1]['final_score'],
            reverse=True
        )
        top_recs = sorted_recommendations[:n_recommendations]

        print("\n=== 최종 추천 순위 ===")
        for i, (movie, data) in enumerate(top_recs, 1):
            print(f"\n{i}. {movie}")
            print(f"최종 점수: {data['final_score']:.3f}")
            print(f"정규화된 CF 점수: {data['cf_score']:.3f} (가중치: {self.cf_weight})")
            print(f"정규화된 컨텐츠 점수: {data['content_score']:.3f} (가중치: {self.content_weight})")
            if 'original_cf_score' in data and data['original_cf_score'] > 0:
                print(f"원본 CF 점수: {data['original_cf_score']:.3f}")
            if 'original_content_score' in data and data['original_content_score'] > 0:
                print(f"원본 컨텐츠 점수: {data['original_content_score']:.3f}")

        return top_recs


def __getstate__(self):
    """Return a copy of ``self.__dict__`` without the ``annoy_index`` entry.

    The index is left out because it is a C++ object that cannot be pickled.

    NOTE(review): this function sits at module level, outside any class, so
    pickle will not call it for any class -- confirm where it belongs.
    """
    snapshot = dict(self.__dict__)
    snapshot.pop('annoy_index', None)
    return snapshot

def __setstate__(self, state):
    """Restore ``state`` into the instance, then reload the Annoy index.

    NOTE(review): this function sits at module level, outside any class, and
    the ``load_annoy_index`` method it calls is not defined anywhere in the
    visible source -- confirm the intended target before relying on it.
    """
    vars(self).update(state)
    self.load_annoy_index()

def save_hybrid_model(hybrid_model, model_dir='/content/hybrid_model'):
    """Pickle ``hybrid_model`` to ``<model_dir>/hybrid_model.pkl``.

    The directory is created when missing. Failures while writing are
    reported on stdout instead of being raised.
    """
    os.makedirs(model_dir, exist_ok=True)
    target = os.path.join(model_dir, 'hybrid_model.pkl')

    try:
        with open(target, 'wb') as handle:
            pickle.dump(hybrid_model, handle)
        print(f"Hybrid model saved in {target}")
    except Exception as err:
        print(f"Error saving hybrid model: {str(err)}")


In [None]:
# Create the hybrid recommender and pickle it before any model is attached.
recommender = HybridRecommender(0.5, 0.5)
save_hybrid_model(recommender)

recommender.load_models()

# Titles the user likes; these seed the content-based side.
favorite_movies = [
    "Dark Knight, The (2008)", "Inception (2010)",
    "Mother (Madeo) (2009)", "There Will Be Blood (2007)",
    "Thirst (Bakjwi) (2009)",
]

# The CF side needs the user ID, the content side needs the favourite titles.
recommendations = recommender.get_recommendations(140000, favorite_movies)

# Print the blended result.
print("\n하이브리드 추천 결과:", "=" * 80, sep="\n")
for movie, data in recommendations[:20]:
    print(f"영화: {movie}")
    print(f"최종 점수: {data['final_score']:.3f}")
    print(f"CF 기여도: {data['norm_cf']:.3f} (원본: {data['cf_score']:.3f})")
    print(f"컨텐츠 기여도: {data['norm_content']:.3f} (원본: {data['content_score']:.3f})")
    print(f"장르: {data['genres']}")
    print("-" * 50)

모델 로드해서 사용해보기

In [None]:
import pickle
import joblib
from annoy import AnnoyIndex
import os

import pickle
import joblib
from annoy import AnnoyIndex
import os

def load_hybrid_recommender(model_dir='/content/hybrid_model'):
    """Unpickle the hybrid recommender and attach its models.

    Args:
        model_dir: Directory containing ``hybrid_model.pkl``.

    Returns:
        The restored recommender after ``load_models()`` has run, or ``None``
        when unpickling or model loading fails (the error is printed).
    """
    pickle_path = os.path.join(model_dir, 'hybrid_model.pkl')

    try:
        # pickle.load executes code from the file; use trusted files only.
        with open(pickle_path, 'rb') as handle:
            restored = pickle.load(handle)

        # Attach the CF and content components to the restored object.
        restored.load_models()
    except Exception as exc:
        print(f"Error loading hybrid model: {str(exc)}")
        return None

    return restored

if __name__ == "__main__":
    # Restore the pickled recommender together with its models.
    recommender = load_hybrid_recommender()

    if recommender:
        try:
            recommendations = recommender.get_recommendations(
                user_id=140000,
                # Titles must be spelled exactly as in the catalogue
                # ("Title, The (Year)"); plain names such as "The Dark Knight"
                # are reported as not found and leave no content candidates.
                favorite_movies=["Dark Knight, The (2008)", "Inception (2010)"],
                n_recommendations=5
            )

            # Print the ranked titles with their blended score.
            for movie, data in recommendations:
                print(f"Movie: {movie}")
                print(f"Score: {data['final_score']:.3f}")
                print("-" * 30)

        except Exception as e:
            print(f"Error getting recommendations: {str(e)}")

Movie 'The Dark Knight' not found in the database.
Movie 'Inception' not found in the database.
Error getting recommendations: max() arg is an empty sequence


In [None]:
# Restore the recommender and request five recommendations.
recommender = load_hybrid_recommender()

if recommender:
    recommendations = recommender.get_recommendations(
        140000,
        [
            "Dark Knight, The (2008)", "Inception (2010)",
            "Mother (Madeo) (2009)", "There Will Be Blood (2007)",
            "Thirst (Bakjwi) (2009)",
        ],
        5,
    )

    for movie, data in recommendations:
        print(f"Movie: {movie}")
        print(f"Score: {data['final_score']:.3f}")
        print("-" * 30)


=== 추천 영화 및 점수 상세 ===

영화: Arrival, The (1996)
최종 점수: 0.500
CF 기여도: 1.000 (원본: 4.602)
컨텐츠 기여도: 0.000 (원본: 0.000)
장르: Action|Sci-Fi|Thriller
--------------------------------------------------

영화: Dark Knight Rises, The (2012)
최종 점수: 0.500
CF 기여도: 0.000 (원본: 0.000)
컨텐츠 기여도: 1.000 (원본: 0.852)
장르: Action|Adventure|Crime|IMAX
--------------------------------------------------

영화: Memories of Murder (Salinui chueok) (2003)
최종 점수: 0.397
CF 기여도: 0.000 (원본: 0.000)
컨텐츠 기여도: 0.794 (원본: 0.821)
장르: Crime|Drama|Mystery|Thriller
--------------------------------------------------

영화: Confessions (Kokuhaku) (2010)
최종 점수: 0.391
CF 기여도: 0.000 (원본: 0.000)
컨텐츠 기여도: 0.781 (원본: 0.819)
장르: Drama|Horror
--------------------------------------------------

영화: Batman Begins (2005)
최종 점수: 0.390
CF 기여도: 0.000 (원본: 0.000)
컨텐츠 기여도: 0.780 (원본: 0.819)
장르: Action|Crime|IMAX
--------------------------------------------------
Movie: Arrival, The (1996)
Score: 0.500
------------------------------
Movie: Dark Knight Ri