<a href="https://colab.research.google.com/github/unie12/recommendation/blob/main/content_recom_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pydantic import BaseModel
from typing import List
import pandas as pd
from annoy import AnnoyIndex
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import os
import logging

# Logging configuration: set the root logger to INFO and create the
# module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Request/Response model definitions
class RecommendRequest(BaseModel):
    # Request body of POST /recommend.
    # TMDB movie IDs, sent as strings; get_recommendations() converts each
    # one with int().
    tmdb_ids: List[str]

# Application object that the startup hook and the /recommend route attach to.
# NOTE(review): FastAPI (and HTTPException, used further down) are not
# imported in the visible part of this file — confirm the import exists.
app = FastAPI()

# Initialize the BERT model and tokenizer (both are read by get_bert_embedding)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Module-level state; stays None until startup_event() has loaded the data
top_50k_movies = None
annoy_index = None
features_df = None

@app.on_event("startup")
async def startup_event():
    """Load the movie table, the feature table and the Annoy index.

    Registered as the application's "startup" handler. It verifies that the
    three artefact files exist, unpickles the two DataFrames, creates an
    angular ``AnnoyIndex`` whose dimensionality is the column count of the
    feature table, loads the saved index into it, and publishes everything
    through the module-level globals ``top_50k_movies``, ``features_df`` and
    ``annoy_index``.

    Raises:
        FileNotFoundError: If one of the artefact files is missing.
        Exception: Any other loading error is logged with its traceback and
            re-raised unchanged.
    """
    global top_50k_movies, annoy_index, features_df
    try:
        # Absolute location of the model artefacts; each path is built once
        # and reused for both the existence check and the actual load.
        base_path = '/home/ubuntu/model_data/contents'
        movies_path = os.path.join(base_path, 'bert_top_movies.pkl')
        features_path = os.path.join(base_path, 'bert_movie_features.pkl')
        index_path = os.path.join(base_path, 'bert_movie_similarity.ann')

        # Fail fast with an explicit message when an artefact is missing
        for file_path in (movies_path, features_path, index_path):
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")
            logger.info(f"Found file: {file_path}")

        # Load the data.
        # NOTE(security): unpickling can execute arbitrary code — these files
        # must only ever come from a trusted source.
        top_50k_movies = pd.read_pickle(movies_path)
        features_df = pd.read_pickle(features_path)

        logger.info(f"Loaded movies dataset with shape: {top_50k_movies.shape}")
        logger.info(f"Available movie IDs range: {top_50k_movies['id'].min()} - {top_50k_movies['id'].max()}")

        # The index dimensionality is taken from the feature table's column count
        n_features = features_df.shape[1]
        annoy_index = AnnoyIndex(n_features, 'angular')
        annoy_index.load(index_path)

        logger.info("Model loaded successfully")
    except Exception as e:
        # logger.exception keeps the traceback in the log; the bare raise
        # re-raises the original error without adding an extra frame.
        logger.exception(f"Error loading model: {str(e)}")
        raise

class MovieRecommendDTO(BaseModel):
    # One item of the /recommend response; every field is a string.
    # TMDB ID of the recommended movie
    tmdbId: str
    # Movie title
    title: str
    # get_recommendations() currently always fills this with ""
    poster_path: str
    # get_recommendations() overwrites this with the similarity score
    # (as a string) before the DTO is built
    popularity: str

# BERT embedding helper
def get_bert_embedding(text):
    """Embed *text* with BERT by averaging the final hidden states.

    The input is tokenized (truncated to at most 512 tokens), passed through
    ``bert_model`` with gradient tracking disabled, and the last hidden state
    is averaged over dimension 1.

    NOTE(review): the average covers every position along dimension 1, so if
    a list of texts is passed, padded positions would be averaged in as
    well — confirm callers only pass a single string.

    Returns:
        The pooled tensor after ``squeeze()``, converted to a NumPy array.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

def get_recommendations(movie_ids, n=30):
    """Return up to *n* movies similar to the movies identified by *movie_ids*.

    Every requested movie that exists in ``top_50k_movies`` is used to query
    the Annoy index for ``n * 3`` neighbours. The requested movies themselves
    are skipped, each neighbour gets the score ``1 / (1 + distance)``, and the
    best ``max(n // len(movie_ids), 5)`` neighbours per requested movie are
    pooled. A movie reached from several requested movies is kept once, with
    its highest score. The pool is sorted by score (descending) and cut to
    *n* entries.

    Args:
        movie_ids: TMDB IDs as strings that ``int()`` can parse.
        n: Maximum number of recommendations to return.

    Returns:
        A list of dicts with the keys ``tmdbId``, ``title``, ``poster_path``,
        ``popularity`` and ``release_date``. ``popularity`` holds the
        similarity score as a string, not the dataset's popularity value.

    Raises:
        ValueError: If an entry of *movie_ids* cannot be converted by ``int``.
        HTTPException: 404 if none of the requested movies is in the dataset
            or if no recommendation could be produced.
    """
    logger.info(f"Received request for movie IDs: {movie_ids}")
    numeric_ids = [int(id_) for id_ in movie_ids]
    logger.info(f"Converted to numeric IDs: {numeric_ids}")

    # Use row *positions*, not index labels: neighbours returned by Annoy are
    # resolved with .iloc below, so the query item must be positional as well.
    # On a default RangeIndex positions and labels coincide.
    is_requested = top_50k_movies['id'].isin(numeric_ids).to_numpy()
    matching_positions = np.flatnonzero(is_requested).tolist()
    logger.info(f"Found {len(matching_positions)} matching movies")

    if not matching_positions:
        raise HTTPException(status_code=404, detail="No movies found")

    # Best score seen per recommended movie; a dict (rather than a set of
    # (id, score) pairs) guarantees each movie appears at most once.
    best_scores = {}
    movie_info = {}
    # Exclude the requested movies by position so that ID formatting such as
    # "0123" vs "123" cannot let an input slip into its own recommendations.
    excluded_positions = set(matching_positions)

    # Minimum number of recommendations contributed by each requested movie
    min_recommendations_per_movie = max(n // len(movie_ids), 5)

    # Process every requested movie independently
    for position in matching_positions:
        try:
            # Look up similar movies
            indices, distances = annoy_index.get_nns_by_item(
                position, n * 3, include_distances=True
            )

            # Candidates for the current requested movie
            current_recommendations = []

            for similar_idx, distance in zip(indices, distances):
                if similar_idx in excluded_positions:
                    continue

                similar_movie = top_50k_movies.iloc[similar_idx]
                similar_id = str(similar_movie['id'])

                similarity_score = 1 / (1 + distance)
                current_recommendations.append((similar_id, similarity_score))

                # Remember the movie details the first time a movie is seen
                if similar_id not in movie_info:
                    movie_info[similar_id] = {
                        'tmdbId': similar_id,
                        'title': str(similar_movie['title']),
                        'poster_path': "",
                        'popularity': str(similar_movie['popularity']),
                        'release_date': str(similar_movie.get('release_date', ''))
                    }

            # Keep only the top candidates of this requested movie
            top_recommendations = sorted(
                current_recommendations,
                key=lambda x: x[1],
                reverse=True
            )[:min_recommendations_per_movie]

            # Merge into the pool, keeping the highest score per movie
            for movie_id, score in top_recommendations:
                if movie_id not in best_scores or score > best_scores[movie_id]:
                    best_scores[movie_id] = score

        except Exception as e:
            # Best effort: one failing movie must not abort the whole request,
            # but keep the traceback so the failure can be diagnosed.
            logger.exception(f"Error processing movie: {e}")
            continue

    if not best_scores:
        raise HTTPException(status_code=404, detail="No recommendations found")

    # Build the final ranking
    sorted_recommendations = sorted(
        best_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )[:n]

    # Convert to the response format
    result = []
    for movie_id, score in sorted_recommendations:
        movie_data = movie_info[movie_id].copy()
        # The 'popularity' field is reused to carry the similarity score
        movie_data['popularity'] = str(score)
        result.append(movie_data)

    logger.info(f"Returning {len(result)} recommendations")
    return result

@app.post("/recommend", response_model=List[MovieRecommendDTO])
async def recommend(request: RecommendRequest):
    """Handle POST /recommend and return movies similar to the given IDs.

    The number of requested recommendations is ``max(30, 10 * len(tmdb_ids))``.

    Args:
        request: Body carrying the list of TMDB IDs.

    Returns:
        A list of ``MovieRecommendDTO`` built from the recommendation dicts.

    Raises:
        HTTPException: 400 if ``tmdb_ids`` is empty; 404 if nothing could be
            recommended (including 404s raised by ``get_recommendations``);
            500 for any unexpected error.
    """
    if not request.tmdb_ids:
        raise HTTPException(status_code=400, detail="tmdb_ids list is required")

    try:
        # Scale the number of recommendations with the number of input movies
        n = max(30, 10 * len(request.tmdb_ids))
        recommendations = get_recommendations(request.tmdb_ids, n=n)

        if not recommendations:
            raise HTTPException(status_code=404, detail="No recommendations found")

        return [MovieRecommendDTO(**movie) for movie in recommendations]

    except HTTPException:
        # HTTPException is itself an Exception: without this clause the
        # deliberate 404 responses would be rewritten into 500s below.
        raise
    except Exception as e:
        logger.exception(f"Recommendation error: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error") from e
