<a href="https://colab.research.google.com/github/unie12/recommendation/blob/main/hybrid_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install annoy==1.17.1

Collecting annoy==1.17.1
  Downloading annoy-1.17.1.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/648.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/648.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.0/648.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp310-cp310-linux_x86_64.whl size=551243 sha256=45815c4bf0bb164a64b17b4820771226a469783642734cd4760654cca71fc8a0
  Stored in directory: /root/.cache/pip/wheels/8a/e1/f6/cd65e222d475c5ade306a766fb34c8da2f3f96c01de2a6602a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import torch
from sklearn.preprocessing import MinMaxScaler
import os
import pickle
from annoy import AnnoyIndex

class HybridRecommender:
    """Hybrid movie recommender blending content embeddings with CF item factors.

    Content vectors are read from an Annoy index whose path is stored in
    ``content_model_data['annoy_path']``. Collaborative-filtering (CF) features
    are taken from the optional CF model's item factors ``Q`` and item biases
    ``b_d``. Without CF data the instance still supports title-based
    recommendations; per-movie hybrid features then carry no CF part.
    """

    # Size of the vectors stored in the Annoy index (BERT-base embedding size).
    EMBEDDING_DIM = 768

    def __init__(self, content_model_data, cf_model_data=None, content_weight=0.5):
        """Store the model data, load the Annoy index and print debug info.

        Args:
            content_model_data: mapping that provides at least ``annoy_path``,
                ``title_to_index``, ``index_to_title`` and ``movie_data``.
            cf_model_data: optional mapping with ``model``, ``movies_df``,
                ``user_map`` and ``movie_map``.
            content_weight: weight of the content part; the CF part gets
                ``1 - content_weight``.

        Raises:
            Exception: whatever ``load_annoy_index`` raises when the index
                cannot be loaded.
        """
        self.content_weight = content_weight
        self.cf_weight = 1 - content_weight
        self.content_model_data = content_model_data

        if cf_model_data is not None:
            self.cf_model = cf_model_data['model']
            self.movies_df = cf_model_data['movies_df']
            self.user_map = cf_model_data['user_map']
            self.movie_map = cf_model_data['movie_map']
        else:
            # Always define these attributes so later methods can test for
            # None instead of failing with AttributeError.
            self.cf_model = None
            self.movies_df = None
            self.user_map = None
            self.movie_map = None

        self.annoy_index = None
        self.load_annoy_index()

        # Debug output describing the loaded data
        print("\nInitialization Debug Info:")
        print(f"Number of movies in movies_df: {len(self.movies_df) if self.movies_df is not None else 'No movies_df'}")
        print(f"Number of items in title_to_index: {len(self.content_model_data['title_to_index'])}")
        print(f"Sample movie titles in title_to_index: {list(self.content_model_data['title_to_index'].keys())[:3]}")

    def __getstate__(self):
        """Return the picklable state: every attribute except the Annoy index.

        The Annoy index object cannot be pickled, so it is left out here and
        reloaded from ``annoy_path`` in ``__setstate__``.
        """
        state = self.__dict__.copy()
        state.pop('annoy_index', None)
        return state

    def __setstate__(self, state):
        """Restore pickled attributes and reload the Annoy index from disk."""
        self.__dict__.update(state)
        self.annoy_index = None
        self.load_annoy_index()

    def load_annoy_index(self):
        """Load the Annoy index from ``content_model_data['annoy_path']``.

        Raises:
            Exception: re-raised after logging when the index cannot be loaded.
        """
        try:
            index = AnnoyIndex(self.EMBEDDING_DIM, 'angular')
            index.load(self.content_model_data['annoy_path'])
            # Only publish the index once it has been loaded successfully.
            self.annoy_index = index
            print("Annoy index loaded successfully")
        except Exception as e:
            print(f"Error loading Annoy index: {str(e)}")
            raise

    def get_content_features(self, movie_id):
        """Return the stored content embedding for ``movie_id``.

        Returns a zero vector of length ``EMBEDDING_DIM`` when no movie table
        or index is available, or when the movie/title cannot be resolved.
        """
        if self.movies_df is None or self.annoy_index is None:
            return np.zeros(self.EMBEDDING_DIM)
        try:
            # movieId -> title -> index position -> stored vector
            movie_title = self.movies_df[self.movies_df['movieId'] == movie_id]['title'].iloc[0]
            idx = self.content_model_data['title_to_index'][movie_title]
            embedding = self.annoy_index.get_item_vector(idx)
            return np.array(embedding)
        except (KeyError, IndexError):
            return np.zeros(self.EMBEDDING_DIM)

    def get_cf_features(self, movie_id):
        """Return the CF feature vector (item factors followed by item bias).

        Returns an empty vector when no CF model is loaded, and a zero vector
        of length ``K + 1`` when the movie is unknown to the CF model.
        """
        if self.cf_model is None:
            return np.zeros(0)
        try:
            item_idx = self.cf_model.item_id_map[movie_id]
            # Concatenate the latent factors with the item bias
            cf_features = np.concatenate([
                self.cf_model.Q[item_idx],
                [self.cf_model.b_d[item_idx]]
            ])
            return cf_features
        except KeyError:
            return np.zeros(self.cf_model.K + 1)

    def create_hybrid_features(self, movie_id):
        """Build the hybrid feature vector for one movie.

        Layout: weighted normalised content part, weighted normalised CF part,
        then the squared norm of each weighted part as two trailing scalars.
        """
        content_features = self._normalize_features(self.get_content_features(movie_id))
        cf_features = self._normalize_features(self.get_cf_features(movie_id))

        weighted_content = content_features * self.content_weight
        weighted_cf = cf_features * self.cf_weight

        hybrid_features = np.concatenate([
            weighted_content,
            weighted_cf,
            [np.dot(weighted_content, weighted_content)],
            [np.dot(weighted_cf, weighted_cf)]
        ])

        return hybrid_features

    def _normalize_features(self, features):
        """Scale ``features`` to unit L2 norm; zero vectors are returned unchanged."""
        norm = np.linalg.norm(features)
        return features / norm if norm != 0 else features

    def build_similarity_matrix(self, movie_ids):
        """Return the symmetric matrix of pairwise hybrid-feature similarities."""
        # Extract each movie's features once instead of once per pair.
        features = [self.create_hybrid_features(movie_id) for movie_id in movie_ids]
        n_movies = len(features)
        similarity_matrix = np.zeros((n_movies, n_movies))

        for i in range(n_movies):
            for j in range(i, n_movies):
                similarity = self._calculate_similarity(features[i], features[j])
                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

        return similarity_matrix

    def _calculate_similarity(self, features1, features2):
        """Cosine similarity of two vectors; 0.0 when either vector has zero norm."""
        denominator = np.linalg.norm(features1) * np.linalg.norm(features2)
        if denominator == 0:
            # Avoid a NaN score (and a runtime warning) for all-zero vectors.
            return 0.0
        return np.dot(features1, features2) / denominator

    def recommend_by_movie(self, movie_title, n_recommendations=10):
        """Recommend movies similar to ``movie_title`` using the Annoy index only.

        Returns:
            A list of dicts with ``movieId``, ``title``, ``genres`` and
            ``similarity``; an empty list when the title is unknown or any
            error occurs (the error is printed).
        """
        print(f"\nDebug - recommend_by_movie:")
        print(f"Input movie title: {movie_title}")
        print(f"Available titles: {list(self.content_model_data['title_to_index'].keys())[:5]}")

        try:
            if movie_title not in self.content_model_data['title_to_index']:
                print(f"Movie title not found in title_to_index: {movie_title}")
                return []

            query_idx = self.content_model_data['title_to_index'][movie_title]
            print(f"Found movie index: {query_idx}")

            query_features = np.array(self.annoy_index.get_item_vector(query_idx))
            print(f"Successfully extracted features for movie")

            # Ask for one extra neighbour because the query movie itself is
            # normally among the results and gets filtered out below.
            similar_indices = self.annoy_index.get_nns_by_vector(
                query_features, n_recommendations + 1
            )
            print(f"Found similar indices: {similar_indices}")

            recommendations = []
            for neighbor_idx in similar_indices:
                similar_title = self.content_model_data['index_to_title'][neighbor_idx]
                if similar_title == movie_title:
                    continue

                movie_info = next(
                    (m for m in self.content_model_data['movie_data']
                     if m['title'] == similar_title),
                    None
                )
                if not movie_info:
                    continue

                # NOTE(review): this reports 1 - distance for an 'angular'
                # index; confirm against the Annoy docs whether that is the
                # intended similarity scale.
                similarity = 1 - self.annoy_index.get_distance(query_idx, neighbor_idx)

                recommendations.append({
                    'movieId': movie_info['movieId'],
                    'title': movie_info['title'],
                    'genres': movie_info['genres'],
                    'similarity': similarity
                })

            print(f"Generated {len(recommendations)} recommendations")
            return recommendations[:n_recommendations]

        except Exception as e:
            print(f"Error in recommend_by_movie: {str(e)}")
            return []

    def recommend(self, user_id=None, movie_title=None, n_recommendations=10):
        """Recommend by movie title (takes precedence) or by user ID.

        Raises:
            ValueError: when neither argument is provided, or when only
                ``user_id`` is provided but no CF model is loaded.
        """
        if movie_title:
            return self.recommend_by_movie(movie_title, n_recommendations)
        if user_id is not None:  # ``0`` is a valid user ID
            if self.cf_model is None:
                raise ValueError("user_id was provided but no collaborative filtering model is loaded")
            return self.recommend_by_user(user_id, n_recommendations)
        raise ValueError("Either user_id or movie_title must be provided")

    def recommend_by_user(self, user_id, n_recommendations=10):
        """Recommend unseen movies closest to the user's aggregated hybrid profile.

        Returns:
            A list of dicts with ``movieId``, ``title``, ``score`` and
            ``genres``, best first; an empty list when the user is unknown,
            has no watched movies, or any error occurs (the error and its
            traceback are printed).
        """
        print(f"\nDebug - recommend_by_user:")
        print(f"Input user_id: {user_id}")

        try:
            if user_id not in self.user_map:
                print(f"User ID {user_id} not found in user_map")
                return []

            mapped_user_id = self.user_map[user_id]
            print(f"Mapped user_id: {mapped_user_id}")

            # NOTE(review): the column indices of R are compared directly with
            # movies_df['movieId']; that is only right if both use the same ID
            # space - confirm against how R is constructed.
            user_watched_movies = self.movies_df[self.movies_df['movieId'].isin(
                self.cf_model.R[self.cf_model.user_id_map[mapped_user_id]].indices
            )]['movieId'].unique()

            print(f"Found {len(user_watched_movies)} movies watched by user")

            if len(user_watched_movies) == 0:
                print("No watched movies found for user")
                return []

            # Hybrid features for every movie in the catalogue
            all_movies = self.movies_df['movieId'].unique()
            print(f"Total number of movies to process: {len(all_movies)}")

            all_features = {
                movie_id: self.create_hybrid_features(movie_id)
                for movie_id in all_movies
            }

            # User profile = normalised sum of the watched movies' features
            user_profile = np.zeros_like(next(iter(all_features.values())))
            for movie_id in user_watched_movies:
                if movie_id in all_features:
                    user_profile += all_features[movie_id]
            user_profile = self._normalize_features(user_profile)

            # Score every unseen movie; a set makes each membership test O(1)
            watched = set(user_watched_movies)
            scores = {
                movie_id: self._calculate_similarity(user_profile, features)
                for movie_id, features in all_features.items()
                if movie_id not in watched
            }

            print(f"Generated scores for {len(scores)} movies")

            # Keep the top N movies by score
            top_movies = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]

            # Format the result
            recommendations = []
            for movie_id, score in top_movies:
                movie_info = self.movies_df[self.movies_df['movieId'] == movie_id].iloc[0]
                recommendations.append({
                    'movieId': movie_id,
                    'title': movie_info['title'],
                    'score': score,
                    'genres': movie_info['genres']
                })

            print(f"Generated {len(recommendations)} recommendations")
            return recommendations

        except Exception as e:
            print(f"Error in recommend_by_user: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return []

In [None]:
def print_recommendations(recommendations, title="추천 결과"):
    """Pretty-print a list of recommendation dicts under a banner with ``title``.

    Each entry must provide ``title``, ``genres`` and ``movieId``. When present,
    ``similarity`` is shown; otherwise ``score`` is shown if present.
    """
    banner = '=' * 20
    print(f"\n{banner} {title} {banner}")

    for rank, item in enumerate(recommendations, start=1):
        print(f"\n{rank}. {item['title']}")
        print(f"   장르: {item['genres']}")

        # At most one metric line: similarity takes precedence over score.
        metric_line = None
        if 'similarity' in item:
            metric_line = f"   유사도: {item['similarity']:.3f}"
        elif 'score' in item:
            metric_line = f"   추천 점수: {item['score']:.3f}"
        if metric_line is not None:
            print(metric_line)

        print(f"   영화 ID: {item['movieId']}")
        print("-" * 50)

def __getstate__(self):
    """Return a copy of ``self.__dict__`` without the ``annoy_index`` entry.

    NOTE(review): this function is defined at module level, not inside a class
    body. Pickle looks for ``__getstate__`` on the object being pickled, so a
    free function with this name is not used as a pickling hook. The recorded
    run output ("cannot pickle 'annoy.Annoy' object") is consistent with that.
    To take effect it has to be a method of the class being pickled.
    """
    state = self.__dict__.copy()
    # Drop the annoy_index entry
    if 'annoy_index' in state:
        del state['annoy_index']
    return state

def __setstate__(self, state):
    """Update ``self.__dict__`` from ``state`` and call ``self.load_annoy_index()``.

    NOTE(review): like the module-level ``__getstate__`` above it, this is a
    free function rather than a method, so unpickling does not call it. It has
    to be defined on the class being unpickled to act as a hook.
    """
    self.__dict__.update(state)
    # Reload the annoy_index
    self.load_annoy_index()

def save_hybrid_model(hybrid_model, model_dir='/content/hybrid_model'):
    """Pickle ``hybrid_model`` to ``<model_dir>/hybrid_model.pkl``.

    The pickle is written to a temporary sibling file and moved into place
    only after a successful dump, so a failed save neither leaves a truncated
    ``hybrid_model.pkl`` behind nor overwrites a previously saved model.

    Args:
        hybrid_model: object to pickle.
        model_dir: target directory; created if it does not exist.

    Returns:
        True when the model was saved, False when saving failed (the error is
        printed, not raised).
    """
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, 'hybrid_model.pkl')
    tmp_path = model_path + '.tmp'

    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(hybrid_model, f)
        # Atomic rename: readers see either the old file or the complete new one.
        os.replace(tmp_path, model_path)
        print(f"Hybrid model saved in {model_path}")
        return True
    except Exception as e:
        print(f"Error saving hybrid model: {str(e)}")
        # Best-effort cleanup of the partial temp file; the save already failed,
        # so a cleanup error must not mask the message printed above.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        return False

def load_hybrid_model(model_dir='/content/hybrid_model'):
    """Unpickle ``<model_dir>/hybrid_model.pkl`` and reload its Annoy index.

    Returns:
        The restored model, or None when reading, unpickling or the index
        reload fails (the error is printed).
    """
    pkl_file = os.path.join(model_dir, 'hybrid_model.pkl')

    try:
        # pickle can execute arbitrary code: only load files you trust.
        with open(pkl_file, 'rb') as handle:
            restored = pickle.load(handle)
        # The index is not part of the pickled state; load it again.
        restored.load_annoy_index()
    except Exception as err:
        print(f"Error loading hybrid model: {str(err)}")
        return None

    return restored

In [None]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

class CollaborativeFiltering:
    """Container for a biased matrix-factorisation CF model.

    ``predict_for_user`` reads ``user_id_map``, ``b_u``, ``b_d``, ``P`` and
    ``Q``; none of these are assigned in ``__init__``, so they must come from
    elsewhere (e.g. restored state).
    """

    # Hyper-parameter keys copied verbatim onto the instance, in this order.
    _HYPER_PARAM_KEYS = ('K', 'alpha', 'beta', 'iterations', 'verbose', 'batch_size')

    def __init__(self, ratings_df_filtered, hyper_params):
        """Keep the ratings matrix, derive its basic statistics and copy hyper-parameters.

        ``ratings_df_filtered`` must expose ``row``, ``col`` and ``data``.
        """
        self.R = ratings_df_filtered
        self.num_users = len(np.unique(self.R.row))
        self.num_items = len(np.unique(self.R.col))
        for key in self._HYPER_PARAM_KEYS:
            setattr(self, key, hyper_params[key])
        # Global bias: mean of all stored ratings
        self.b = np.mean(self.R.data)

    def predict_for_user(self, user_id):
        """Predicted ratings of every item for ``user_id``.

        Users missing from ``user_id_map`` get the global mean for all items.
        """
        if user_id not in self.user_id_map:
            return np.full(self.num_items, self.b)

        row = self.user_id_map[user_id]
        interaction = np.dot(self.P[row], self.Q.T)
        return self.b + self.b_u[row] + self.b_d + interaction

    def __getstate__(self):
        """Picklable state: all attributes except the ratings matrix ``R``."""
        snapshot = self.__dict__.copy()
        snapshot.pop('R', None)
        return snapshot

    def __setstate__(self, state):
        """Restore attributes; ``R`` is not stored, so it is reset to None."""
        self.__dict__.update(state)
        self.R = None

def load_recommender(model_path='/content/cf_model_v2.pkl'):
    """Load the saved CF bundle and return ``(model, user_map, movie_map, movies_df)``."""
    bundle = joblib.load(model_path)
    wanted_keys = ('model', 'user_map', 'movie_map', 'movies_df')
    return tuple(bundle[key] for key in wanted_keys)

def get_movie_recommendations(user_id, cf_model, user_map, movies_df, top_n=10):
    """Return the ``top_n`` CF recommendations for ``user_id``, best first.

    Args:
        user_id: external user ID; must be a key of ``user_map``.
        cf_model: object providing ``predict_for_user``, ``idx_item_map`` and
            ``item_id_map``.
        user_map: mapping from external user ID to the ID passed to
            ``cf_model.predict_for_user``.
        movies_df: DataFrame with ``movieId``, ``title`` and ``genres`` columns.
        top_n: number of movies to select.

    Returns:
        A list of dicts with ``movieId``, ``title``, ``genres`` and
        ``predicted_rating``, ordered by descending predicted rating. On an
        unknown user or any failure, a dict ``{'error': message}`` instead.
    """
    try:
        # Translate the external user ID
        if user_id not in user_map:
            return {'error': f"사용자 ID {user_id}가 학습 데이터에 없습니다."}

        user_index = user_map[user_id]

        # Predicted rating for every item
        user_predictions = cf_model.predict_for_user(user_index)

        # Pick the top N items, best first
        top_movie_indices = np.argsort(user_predictions)[::-1][:top_n]
        top_movie_ids = [cf_model.idx_item_map[idx] for idx in top_movie_indices]
        rank_of = {movie_id: rank for rank, movie_id in enumerate(top_movie_ids)}

        # Look up movie details, then restore ranking order: the DataFrame
        # filter alone would return rows in movies_df order, not by rating.
        recommended_movies = movies_df[movies_df['movieId'].isin(top_movie_ids)]
        ranked_rows = sorted(
            zip(recommended_movies['movieId'],
                recommended_movies['title'],
                recommended_movies['genres']),
            key=lambda row: rank_of[row[0]],
        )

        # Format the result
        recommendations = []
        for movie_id, movie_title, movie_genres in ranked_rows:
            predicted_rating = user_predictions[cf_model.item_id_map[movie_id]]
            recommendations.append({
                'movieId': movie_id,
                'title': movie_title,
                'genres': movie_genres,
                'predicted_rating': float(predicted_rating)
            })

        return recommendations

    except Exception as e:
        return {'error': f"추천 생성 중 오류 발생: {str(e)}"}

In [None]:
def load_model_data(cf_model_path='/content/model_data/cf_model_v2.pkl',
                   content_model_dir='/content/content_model',
                   ratings_path='/content/rating.csv'):
    """Load the CF model bundle and the content model data.

    CF loading is best-effort: any failure is printed with its traceback and
    the function carries on. Content-model loading failures are printed and
    re-raised.

    Returns:
        ``(content_model_data, cf_model_data)``; ``cf_model_data`` is None when
        the CF bundle itself could not be loaded.
    """
    print("Checking file existence:")
    print(f"CF model exists: {os.path.exists(cf_model_path)}")
    print(f"Content model directory exists: {os.path.exists(content_model_dir)}")

    # --- CF model (joblib) -------------------------------------------------
    cf_model_data = None
    try:
        cf_model_data = joblib.load(cf_model_path)
        print("CF model loaded successfully")

        if not os.path.exists(ratings_path):
            print(f"Ratings file not found at {ratings_path}")
        else:
            ratings = pd.read_csv(ratings_path)

            # Keep only ratings whose user and movie are known to the model
            user_lookup = cf_model_data['user_map']
            known_user = ratings['userId'].isin(user_lookup.keys())
            movie_lookup = cf_model_data['movie_map']
            known_movie = ratings['movieId'].isin(movie_lookup.keys())
            ratings = ratings[known_user & known_movie]

            # Rebuild the ratings matrix R and store it on the model as CSR
            values = ratings['rating'].values
            row_idx = ratings['userId'].map(user_lookup).values
            col_idx = ratings['movieId'].map(movie_lookup).values
            rating_matrix = coo_matrix(
                (values, (row_idx, col_idx)),
                shape=(len(user_lookup), len(movie_lookup))
            )
            cf_model_data['model'].R = rating_matrix.tocsr()
            print("CF model and ratings matrix reconstructed successfully")

            # Debug information
            print(f"Rating matrix shape: {cf_model_data['model'].R.shape}")
            print(f"Number of ratings: {len(ratings)}")

    except Exception as err:
        print(f"Error loading CF model: {str(err)}")
        import traceback
        print(traceback.format_exc())

    # --- Content model (pickle + Annoy index path) -------------------------
    content_model_data = None
    try:
        pickle_path = os.path.join(content_model_dir, 'content_model_data.pkl')
        with open(pickle_path, 'rb') as handle:
            content_model_data = pickle.load(handle)

        # Record where the Annoy index file lives
        annoy_path = os.path.join(content_model_dir, 'movie_similarity.ann')
        print(annoy_path)
        content_model_data['annoy_path'] = annoy_path

        print("Content model loaded successfully")
    except Exception as err:
        print(f"Error loading content model: {str(err)}")
        raise

    return content_model_data, cf_model_data

In [None]:
# Load the model data, build the hybrid recommender and run two smoke tests
try:
    content_model_data, cf_model_data = load_model_data()
    hybrid_model = HybridRecommender(content_model_data, cf_model_data)
    print("Hybrid model initialized successfully")

    save_hybrid_model(hybrid_model)

    # Test: recommend by movie title
    movie_recommendations = hybrid_model.recommend(
        movie_title="Dark Knight Rises, The (2012)",
        n_recommendations=10
    )
    print_recommendations(movie_recommendations, "영화 기반 추천 결과")

    # Recommend by user ID only when a CF model was loaded. The print call
    # stays inside the guard because user_recommendations exists only here.
    if cf_model_data:
        user_recommendations = hybrid_model.recommend(
            user_id=140000,
            n_recommendations=40
        )
        print_recommendations(user_recommendations, "사용자 기반 추천 결과")

except Exception as e:
    print(f"\nError during model initialization: {str(e)}")

Checking file existence:
CF model exists: True
Content model directory exists: True
CF model loaded successfully
CF model and ratings matrix reconstructed successfully
Rating matrix shape: (13649, 19100)
Number of ratings: 1949420
/content/content_model/movie_similarity.ann
Content model loaded successfully
Annoy index loaded successfully

Initialization Debug Info:
Number of movies in movies_df: 27280
Number of items in title_to_index: 27264
Sample movie titles in title_to_index: ['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)']
Hybrid model initialized successfully
Error saving hybrid model: cannot pickle 'annoy.Annoy' object

Debug - recommend_by_movie:
Input movie title: Dark Knight Rises, The (2012)
Available titles: ['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)']
Found movie index: 18312
Successfully extracted features for movie
Found similar indices: [18312, 10169, 12525, 12631, 1