<a href="https://colab.research.google.com/github/unie12/recommendation/blob/main/server_hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
import torch
from sklearn.preprocessing import MinMaxScaler
import os
import pickle
from annoy import AnnoyIndex

class CollaborativeFiltering:
    """Container for a biased latent-factor collaborative-filtering model.

    Only construction, per-user scoring and the pickling hooks are defined
    here.  ``predict_for_user`` also reads ``user_id_map``, ``b_u``, ``b_d``,
    ``P`` and ``Q``, none of which are assigned in ``__init__``; presumably
    they are populated by a training step or restored from a pickle --
    TODO confirm.
    """

    # Hyper-parameter keys copied verbatim onto the instance, in this order.
    _HYPER_PARAM_KEYS = ('K', 'alpha', 'beta', 'iterations', 'verbose', 'batch_size')

    def __init__(self, ratings_df_filtered, hyper_params):
        """Store the ratings object and unpack the hyper-parameters.

        Args:
            ratings_df_filtered: Ratings object exposing ``row``, ``col`` and
                ``data`` attributes (a SciPy COO matrix has this shape of API).
            hyper_params: Mapping that must contain the keys ``K``, ``alpha``,
                ``beta``, ``iterations``, ``verbose`` and ``batch_size``.

        Raises:
            KeyError: If a required hyper-parameter is missing.
        """
        ratings = ratings_df_filtered
        self.R = ratings
        # Counts are taken over the distinct indices that actually occur.
        self.num_users = len(np.unique(ratings.row))
        self.num_items = len(np.unique(ratings.col))
        for key in self._HYPER_PARAM_KEYS:
            setattr(self, key, hyper_params[key])
        # Global bias: mean of all stored rating values.
        self.b = np.mean(ratings.data)

    def predict_for_user(self, user_id):
        """Return predicted scores for every item for ``user_id``.

        Unknown users get a vector of length ``num_items`` filled with the
        global bias ``b``.
        """
        if user_id not in self.user_id_map:
            return np.full(self.num_items, self.b)

        user_idx = self.user_id_map[user_id]
        interaction = np.dot(self.P[user_idx], self.Q.T)
        return self.b + self.b_u[user_idx] + self.b_d + interaction

    def __getstate__(self):
        """Return the instance state for pickling, without the ``R`` entry."""
        state = dict(self.__dict__)
        state.pop('R', None)
        return state

    def __setstate__(self, state):
        """Restore pickled state; ``R`` is always reset to ``None``."""
        vars(self).update(state)
        self.R = None

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import torch
from sklearn.preprocessing import MinMaxScaler
import os
import pickle
from annoy import AnnoyIndex
class HybridRecommender:
    def __init__(self, content_model_data, cf_model_data=None, content_weight=0.5):
        self.content_weight = content_weight
        self.cf_weight = 1 - content_weight
        self.content_model_data = content_model_data

        # TMDB ID와 MovieLens ID 매핑 테이블 초기화
        self.tmdb_to_movieid = {}
        self.movieid_to_tmdb = {}

        self.feature_cache = {}  # 특성 벡터 캐시 추가

        # CF 모델 데이터가 있는 경우
        if cf_model_data is not None:
            self.cf_model = cf_model_data['model']
            self.movies_df = cf_model_data['movies_df']
            self.user_map = cf_model_data['user_map']
            self.movie_map = cf_model_data['movie_map']

            # ID 매핑이 cf_model_data에 포함되어 있는 경우
            if 'tmdb_to_movieid' in cf_model_data and 'movieid_to_tmdb' in cf_model_data:
                # float 형태의 TMDB ID를 문자열로 변환
                self.tmdb_to_movieid = {
                    str(int(float(k))): v
                    for k, v in cf_model_data['tmdb_to_movieid'].items()
                    if pd.notna(k) and pd.notna(v)
                }
                self.movieid_to_tmdb = {
                    k: str(int(float(v)))
                    for k, v in cf_model_data['movieid_to_tmdb'].items()
                    if pd.notna(k) and pd.notna(v)
                }
                print("Using ID mappings from cf_model_data")
                print(f"Number of TMDB to MovieLens mappings: {len(self.tmdb_to_movieid)}")
                print(f"Sample mappings after filtering: {list(self.tmdb_to_movieid.items())[:5]}")

                # 매핑 정보 복사본 저장
                self._tmdb_to_movieid_backup = self.tmdb_to_movieid.copy()
                self._movieid_to_tmdb_backup = self.movieid_to_tmdb.copy()
        else:
            self.cf_model = None

        self.annoy_index = None
        self.load_annoy_index()

    def load_annoy_index(self):
        """Annoy 인덱스 로드"""
        try:
            # features_df에서 차원 가져오기
            if 'features_df' in self.content_model_data:
                vector_dimension = self.content_model_data['features_df'].shape[1]
            else:
                # bert_movie_features.pkl에서 차원 가져오기
                features_path = '/home/ubuntu/model_data/contents/bert_movie_features.pkl'
                if os.path.exists(features_path):
                    with open(features_path, 'rb') as f:
                        features_df = pickle.load(f)
                    vector_dimension = features_df.shape[1]
                else:
                    vector_dimension = 768  # 기본값 사용

            print(f"Using vector dimension: {vector_dimension}")
            self.annoy_index = AnnoyIndex(vector_dimension, 'angular')
                        # content_model_data에서 annoy_path 가져오기
            annoy_path = self.content_model_data.get('annoy_path')
            if not annoy_path:
                # 기본 경로 설정
                annoy_path = '/home/ubuntu/model_data/contents/bert_movie_similarity.ann'

            if not os.path.exists(annoy_path):
                print(f"Annoy index file not found at: {annoy_path}")
                self.annoy_index = None
                return

            try:
                self.annoy_index.load(annoy_path)
                print(f"Annoy index loaded successfully from {annoy_path}")
            except Exception as e:
                print(f"Error loading Annoy index file: {str(e)}")
                self.annoy_index = None

        except Exception as e:
            print(f"Error initializing Annoy index: {str(e)}")
            self.annoy_index = None

    def recommend_realtime(self, user_ratings, n_recommendations=10):
        """실시간 하이브리드 추천"""
        try:
            # 유효한 평점 필터링
            valid_ratings = []
            invalid_ids = []
            for rating in user_ratings:
                tmdb_id = str(rating['tmdb_id'])
                if tmdb_id in self.tmdb_to_movieid:
                    movieid = self.tmdb_to_movieid[tmdb_id]
                    valid_ratings.append({
                        'tmdb_id': tmdb_id,
                        'rating': rating['rating'],
                        'movieId': movieid
                    })
                else:
                    invalid_ids.append(tmdb_id)

            if len(valid_ratings) < 3:
                print("Not enough valid ratings for hybrid recommendation")
                return []

            # 각 입력 영화별 추천 결과 저장
            recommendations_per_movie = {rating['movieId']: [] for rating in valid_ratings}
            min_recommendations_per_movie = max(n_recommendations // len(valid_ratings), 5)

            # 후보 영화 선정 (이미 평가한 영화 제외)
            rated_movie_ids = {r['movieId'] for r in valid_ratings}
            candidate_movies = [
                movieid for movieid in self.tmdb_to_movieid.values()
                if movieid not in rated_movie_ids
            ]

            # 각 입력 영화마다 추천 생성
            for source_rating in valid_ratings:
                source_features = self.create_hybrid_features(source_rating['movieId'])

                # 현재 입력 영화와 후보 영화들의 유사도 계산
                movie_similarities = []
                for candidate_id in candidate_movies:
                    candidate_features = self.create_hybrid_features(candidate_id)
                    similarity = self._calculate_similarity(source_features, candidate_features)
                    if similarity > 0:
                        movie_similarities.append((candidate_id, similarity))

                # 상위 추천 영화 선택
                top_similar = sorted(movie_similarities,
                                  key=lambda x: x[1],
                                  reverse=True)[:min_recommendations_per_movie]
                recommendations_per_movie[source_rating['movieId']].extend(top_similar)

            # 모든 추천 결과 통합
            all_recommendations = []
            seen_movies = set()

            # 각 입력 영화의 추천 결과를 번갈아가며 추가
            while any(recommendations_per_movie.values()):
                for source_id in recommendations_per_movie:
                    if recommendations_per_movie[source_id]:
                        movieid, score = recommendations_per_movie[source_id].pop(0)
                        if movieid not in seen_movies:
                            seen_movies.add(movieid)
                            try:
                                movie_info = self.movies_df[self.movies_df['movieId'] == movieid].iloc[0]
                                source_movie = self.movies_df[
                                    self.movies_df['movieId'] == source_id
                                ].iloc[0]['title']

                                all_recommendations.append({
                                    'tmdbId': str(self.movieid_to_tmdb[movieid]),
                                    'title': movie_info['title'],
                                    'poster_path': "",
                                    'popularity': str(score),
                                    'recommendation_type': 'hybrid',
                                    'recommendedFrom': source_movie,
                                    'similarity': float(score)
                                })
                            except Exception as e:
                                print(f"Error formatting recommendation for movie {movieid}: {str(e)}")
                                continue

                    if len(all_recommendations) >= n_recommendations:
                        break
                if len(all_recommendations) >= n_recommendations:
                    break

            print(f"Generated {len(all_recommendations)} hybrid recommendations")
            return all_recommendations[:n_recommendations]

        except Exception as e:
            print(f"Error in recommend_realtime: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return []


    def recommend_by_content(self, tmdb_id, n_recommendations=10):
        """TMDB ID 기반 컨텐츠 추천"""
        try:
            if str(tmdb_id) in self.tmdb_to_movieid:
                movieid = self.tmdb_to_movieid[str(tmdb_id)]
                movie_title = self.movies_df[self.movies_df['movieId'] == movieid]['title'].iloc[0]
                return self.recommend_by_movie(movie_title, n_recommendations)
            else:
                print(f"TMDB ID {tmdb_id} not found in mapping")
                return []
        except Exception as e:
            print(f"Error in recommend_by_content: {str(e)}")
            return []

    def _load_id_mappings(self):
        """TMDB ID와 MovieLens ID 매핑 로드"""
        links_df = pd.read_csv('link.csv')
        self.tmdb_to_movieid = dict(zip(links_df['tmdbId'], links_df['movieId']))
        self.movieid_to_tmdb = dict(zip(links_df['movieId'], links_df['tmdbId']))

    def get_content_features(self, movie_id):
        """컨텐츠 기반 특성 추출"""
        try:
            # 영화의 BERT 임베딩 가져오기
            movie_title = self.movies_df[self.movies_df['movieId'] == movie_id]['title'].iloc[0]
            idx = self.content_model_data['title_to_index'][movie_title]
            embedding = self.annoy_index.get_item_vector(idx)
            return np.array(embedding)
        except (KeyError, IndexError):
            return np.zeros(768)  # BERT base 차원에 맞춤

    def get_cf_features(self, movie_id):
        """협업 필터링 특성 추출"""
        try:
            item_idx = self.cf_model.item_id_map[movie_id]
            # 잠재 요인과 편향 결합
            cf_features = np.concatenate([
                self.cf_model.Q[item_idx],  # 아이템 잠재 요인
                [self.cf_model.b_d[item_idx]]  # 아이템 편향
            ])
            return cf_features
        except KeyError:
            return np.zeros(self.cf_model.K + 1)

    def create_hybrid_features(self, movie_id):
        """하이브리드 특성 생성 (캐싱 적용)"""
        # 캐시된 특성이 있으면 반환
        if movie_id in self.feature_cache:
            return self.feature_cache[movie_id]

        # 없으면 계산하고 캐시에 저장
        content_features = self.get_content_features(movie_id)
        cf_features = self.get_cf_features(movie_id)

        content_features = self._normalize_features(content_features)
        cf_features = self._normalize_features(cf_features)

        weighted_content = content_features * self.content_weight
        weighted_cf = cf_features * self.cf_weight

        hybrid_features = np.concatenate([
            weighted_content,
            weighted_cf,
            [np.dot(weighted_content, weighted_content.T)],
            [np.dot(weighted_cf, weighted_cf.T)]
        ])

        # 계산된 특성을 캐시에 저장
        self.feature_cache[movie_id] = hybrid_features
        return hybrid_features

    def _normalize_features(self, features):
        """특성 정규화"""
        norm = np.linalg.norm(features)
        return features / norm if norm != 0 else features

    def build_similarity_matrix(self, movie_ids):
        """영화 간 유사도 행렬 생성"""
        n_movies = len(movie_ids)
        similarity_matrix = np.zeros((n_movies, n_movies))

        for i, movie1 in enumerate(movie_ids):
            features1 = self.create_hybrid_features(movie1)
            for j, movie2 in enumerate(movie_ids[i:], i):
                features2 = self.create_hybrid_features(movie2)
                similarity = self._calculate_similarity(features1, features2)
                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

        return similarity_matrix

    def _calculate_similarity(self, features1, features2):
        """코사인 유사도 계산"""
        return np.dot(features1, features2) / (np.linalg.norm(features1) * np.linalg.norm(features2))

    def recommend_by_movie(self, movie_title, n_recommendations=10):
        """영화 제목만으로 추천"""
        print(f"\nDebug - recommend_by_movie:")
        print(f"Input movie title: {movie_title}")
        print(f"Available titles: {list(self.content_model_data['title_to_index'].keys())[:5]}")

        try:
            # 입력 영화의 특성 추출
            if movie_title not in self.content_model_data['title_to_index']:
                print(f"Movie title not found in title_to_index: {movie_title}")
                return []

            idx = self.content_model_data['title_to_index'][movie_title]
            print(f"Found movie index: {idx}")

            query_features = np.array(self.annoy_index.get_item_vector(idx))
            print(f"Successfully extracted features for movie")

            # Annoy를 사용한 유사 영화 검색
            similar_indices = self.annoy_index.get_nns_by_vector(
                query_features, n_recommendations + 1
            )
            print(f"Found similar indices: {similar_indices}")

            recommendations = []
            for idx in similar_indices:
                similar_title = self.content_model_data['index_to_title'][idx]
                if similar_title != movie_title:
                    movie_info = next(
                        (m for m in self.content_model_data['movie_data']
                         if m['title'] == similar_title),
                        None
                    )

                    if movie_info:
                        similarity = 1 - self.annoy_index.get_distance(
                                                        self.content_model_data['title_to_index'][movie_title],
                            idx
                        )

                        recommendations.append({
                            'movieId': movie_info['movieId'],
                            'title': movie_info['title'],
                            'genres': movie_info['genres'],
                            'similarity': similarity
                        })

            print(f"Generated {len(recommendations)} recommendations")
            return recommendations[:n_recommendations]

        except Exception as e:
            print(f"Error in recommend_by_movie: {str(e)}")
            return []

    def recommend(self, user_id=None, movie_title=None, n_recommendations=10):
        """사용자 ID 또는 영화 제목으로 추천"""
        if movie_title:
            return self.recommend_by_movie(movie_title, n_recommendations)
        elif user_id and self.cf_model:
            return self.recommend_by_user(user_id, n_recommendations)
        else:
            raise ValueError("Either user_id or movie_title must be provided")


    def recommend_by_user(self, user_id, n_recommendations=10):
        """하이브리드 추천 생성"""
        print(f"\nDebug - recommend_by_user:")
        print(f"Input user_id: {user_id}")

        try:
            if user_id not in self.user_map:
                print(f"User ID {user_id} not found in user_map")
                return []

            mapped_user_id = self.user_map[user_id]
            print(f"Mapped user_id: {mapped_user_id}")

            user_watched_movies = self.movies_df[self.movies_df['movieId'].isin(
                self.cf_model.R[self.cf_model.user_id_map[mapped_user_id]].indices
            )]['movieId'].unique()

            print(f"Found {len(user_watched_movies)} movies watched by user")

            if len(user_watched_movies) == 0:
                print("No watched movies found for user")
                return []

            # 모든 영화에 대한 하이브리드 특성 생성
            all_movies = self.movies_df['movieId'].unique()
            print(f"Total number of movies to process: {len(all_movies)}")

            all_features = {}
            for movie_id in all_movies:
                all_features[movie_id] = self.create_hybrid_features(movie_id)

            # 사용자 프로필 생성
            user_profile = np.zeros_like(list(all_features.values())[0])
            for movie_id in user_watched_movies:
                if movie_id in all_features:
                    user_profile += all_features[movie_id]
            user_profile = self._normalize_features(user_profile)

            # 추천 점수 계산
            scores = {}
            for movie_id in all_movies:
                if movie_id not in user_watched_movies and movie_id in all_features:
                    similarity = self._calculate_similarity(user_profile, all_features[movie_id])
                    scores[movie_id] = similarity

            print(f"Generated scores for {len(scores)} movies")

            # 상위 N개 영화 선택
            top_movies = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]

            # 결과 포맷팅
            recommendations = []
            for movie_id, score in top_movies:
                movie_info = self.movies_df[self.movies_df['movieId'] == movie_id].iloc[0]
                recommendations.append({
                    'movieId': movie_id,
                    'title': movie_info['title'],
                    'score': score,
                    'genres': movie_info['genres']
                })

            print(f"Generated {len(recommendations)} recommendations")
            return recommendations

        except Exception as e:
            print(f"Error in recommend_by_user: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return []

    def update_user_ratings(self, user_ratings):
        """
        실시간 사용자 평가 데이터 처리
        user_ratings: List[Dict] - [{'tmdb_id': str, 'rating': float}, ...]
        """
        # TMDB ID를 MovieLens ID로 변환
        movieids = []
        ratings = []

        for rating in user_ratings:
            tmdb_id = rating['tmdb_id']
            if tmdb_id in self.tmdb_to_movieid:
                movieids.append(self.tmdb_to_movieid[tmdb_id])
                ratings.append(rating['rating'])

        if not movieids:
            return None

        # 임시 사용자 프로필 생성
        temp_user_vector = np.zeros(self.cf_model.K)

        # 평가된 영화들의 잠재 요인 평균으로 사용자 프로필 생성
        for movieid, rating in zip(movieids, ratings):
            if movieid in self.cf_model.item_id_map:
                item_idx = self.cf_model.item_id_map[movieid]
                temp_user_vector += self.cf_model.Q[item_idx] * (rating - self.cf_model.b - self.cf_model.b_d[item_idx])

        if len(movieids) > 0:
            temp_user_vector /= len(movieids)

        return temp_user_vector


import sys

# Publish both classes as attributes of the ``__main__`` module.  Presumably
# this lets pickles that reference ``__main__.CollaborativeFiltering`` /
# ``__main__.HybridRecommender`` be loaded -- TODO confirm.
setattr(sys.modules['__main__'], 'CollaborativeFiltering', CollaborativeFiltering)
setattr(sys.modules['__main__'], 'HybridRecommender', HybridRecommender)