In [None]:

# 2) Install PyTorch 2.0.1 built against CUDA 11.8 (torch / torchvision / torchaudio
#    are pinned to matching +cu118 wheels from the PyTorch wheel index)
!pip install --no-cache-dir -q \
    torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 \
    --index-url https://download.pytorch.org/whl/cu118

# 3) Install the remaining libraries
#    NOTE(review): these packages are not version-pinned, so a re-run may resolve
#    to different releases than the ones this notebook was developed with.
!pip install --no-cache-dir -q \
    sentence-transformers pandas tqdm aiohttp python-dotenv scikit-learn xgboost shap


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m245.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m180.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m186.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m275.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.3/132.3 kB[0m [31m281.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lit (setup.py) ... [?25l[?25hdone


In [None]:
# (Optional) Example: mount Google Drive so files under /content/drive are readable.
# Colab-only: `google.colab` is not importable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Sanity check of the runtime: report whether torch can see a CUDA device and
# which torch build is actually installed.
import torch
print("CUDA available:", torch.cuda.is_available())
print("Torch version   :", torch.__version__)

import pandas as pd
import numpy as np
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from datetime import datetime
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


CUDA available: True
Torch version   : 2.0.1+cu118




In [None]:
CSV_PATH = "/content/drive/MyDrive/movies_2015_plus_full_filtered.csv"
df = pd.read_csv(CSV_PATH)

# Normalise the text columns: missing values become "" and every cell is a str,
# so the ","-splitting below never has to deal with NaN.
for col in ("overview", "directors", "cast_top5", "genres", "release_date"):
    df[col] = df[col].fillna("").astype(str)

# Director / actor vocabularies: split each comma-separated cell, strip the
# pieces, drop empties, de-duplicate, and sort so the index order is stable.
all_directors = sorted(
    {piece.strip() for cell in df["directors"] for piece in cell.split(",") if piece.strip()}
)
all_actors = sorted(
    {piece.strip() for cell in df["cast_top5"] for piece in cell.split(",") if piece.strip()}
)
director2idx = dict(zip(all_directors, range(len(all_directors))))
actor2idx    = dict(zip(all_actors, range(len(all_actors))))

# Genre vocabulary, built the same way.
all_genres = sorted(
    {piece.strip() for cell in df["genres"] for piece in cell.split(",") if piece.strip()}
)
genre2idx = dict(zip(all_genres, range(len(all_genres))))
# Release date → (day_norm, year_idx)
def parse_release_date(dt_str):
    """Convert a ``YYYY-MM-DD`` release date into ``(day_norm, year_idx)``.

    Args:
        dt_str: Release date text in ``%Y-%m-%d`` format. Leading/trailing
            whitespace is ignored. Empty, missing or unparseable values are
            tolerated.

    Returns:
        A ``(day_norm, year_idx)`` tuple where

        * ``day_norm`` is the position of the date inside its year, scaled so
          that Jan 1 is ``0.0`` and Dec 31 is ``1.0`` (leap years included);
        * ``year_idx`` is ``year - 2014`` clamped at 0 (2015 → 1, 2016 → 2, ...).

        ``(0.0, 0)`` is returned when the input is empty or cannot be parsed.
    """
    if not dt_str:
        return 0.0, 0
    if isinstance(dt_str, str):
        # Stray whitespace would otherwise make strptime reject a valid date.
        dt_str = dt_str.strip()
    try:
        dt = datetime.strptime(dt_str, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return 0.0, 0

    year_idx = max(dt.year - 2014, 0)  # 2015→1, 2016→2,...
    # Day-of-year of Dec 31 is the length of that year (365 or 366).
    days_in_year = datetime(dt.year, 12, 31).timetuple().tm_yday
    day_norm = (dt.timetuple().tm_yday - 1) / (days_in_year - 1)
    return day_norm, year_idx

# Parse every release date once, then split the (day_norm, year_idx) pairs
# into the two per-row feature columns.
parsed_dates = [parse_release_date(raw_date) for raw_date in df["release_date"]]
day_norm_list = [pair[0] for pair in parsed_dates]
year_idx_list = [pair[1] for pair in parsed_dates]
df["day_norm"] = day_norm_list
df["year_idx"] = year_idx_list


In [None]:
# Encode every movie overview with all-mpnet-base-v2, on GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
overview_encoder = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2", device=device
)
overview_encoder.max_seq_length = 384

batch_size = 32
overview_texts = df["overview"]
overview_embeddings = []
for start in tqdm(range(0, len(df), batch_size), desc="Embedding overview"):
    chunk = overview_texts.iloc[start : start + batch_size].tolist()
    # Each chunk is encoded as a single model batch; the outer tqdm bar is the
    # only progress display.
    overview_embeddings.append(
        overview_encoder.encode(
            chunk,
            batch_size=len(chunk),
            convert_to_numpy=True,
            show_progress_bar=False,
        )
    )

# Stack the per-chunk arrays into one matrix, one row per movie.
overview_matrix = np.vstack(overview_embeddings)
print("Overview matrix shape:", overview_matrix.shape)  # (n_movies, 768)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding overview: 100%|██████████| 32/32 [00:08<00:00,  3.75it/s]

Overview matrix shape: (1019, 768)





In [None]:
# Cell 5) Reduce the overview embeddings 768 → 256 dims with Incremental PCA (pca_batch_size=256)
n_components = 256
pca_batch_size = 256  # PCA-only batch size: must satisfy n_components ≤ pca_batch_size

ipca = IncrementalPCA(n_components=n_components, batch_size=pca_batch_size)
n_rows = overview_matrix.shape[0]

# 5-1) partial_fit, one chunk of rows at a time
# NOTE(review): when n_rows is not a multiple of pca_batch_size the final chunk
# holds fewer than n_components rows — confirm the installed scikit-learn
# accepts that in partial_fit.
for start in tqdm(range(0, n_rows, pca_batch_size), desc="IPCA fit"):
    ipca.partial_fit(overview_matrix[start : start + pca_batch_size])

# 5-2) transform, written chunk by chunk into a preallocated float32 array
overview_reduced = np.zeros((n_rows, n_components), dtype=np.float32)
for start in tqdm(range(0, n_rows, pca_batch_size), desc="IPCA transform"):
    stop = start + pca_batch_size
    overview_reduced[start:stop] = ipca.transform(overview_matrix[start:stop])

print("Reduced overview shape:", overview_reduced.shape)  # (n_movies, 256)


IPCA fit: 100%|██████████| 4/4 [00:02<00:00,  1.91it/s]
IPCA transform: 100%|██████████| 4/4 [00:00<00:00, 90.93it/s]

Reduced overview shape: (1019, 256)





In [None]:
# Cell 6) Combine the reduced overview vectors with metadata
#         (directors, cast, genres, release date)
EMBED_DIM = 32  # width of the director / actor embedding vectors

num_directors = len(all_directors)
num_actors    = len(all_actors)

# The embedding tables below are randomly initialised and are not trained in
# this cell. Without a fixed seed the 2 * EMBED_DIM director/actor columns
# change on every run, so the saved feature matrix would not be reproducible.
# Uses SEED from the notebook namespace if defined, otherwise 42.
torch.manual_seed(globals().get("SEED", 42))
director_emb_layer = nn.Embedding(num_directors, EMBED_DIM).to(device)
actor_emb_layer    = nn.Embedding(num_actors, EMBED_DIM).to(device)


def _mean_name_embedding(raw_names, name2idx, emb_layer):
    """Average the embedding rows of every known name in a comma-separated string.

    Args:
        raw_names: Comma-separated names (e.g. one ``directors`` cell).
        name2idx: Mapping from name to its row index in ``emb_layer``.
        emb_layer: ``nn.Embedding`` holding one row per known name.

    Returns:
        1-D float32 numpy array of length ``emb_layer.embedding_dim``; all
        zeros when the string contains no name present in ``name2idx``.
    """
    stripped = (piece.strip() for piece in raw_names.split(","))
    idxs = [name2idx[name] for name in stripped if name in name2idx]
    if not idxs:
        return np.zeros(emb_layer.embedding_dim, dtype=np.float32)
    with torch.no_grad():
        idx_tensor = torch.tensor(idxs, dtype=torch.long, device=emb_layer.weight.device)
        return emb_layer(idx_tensor).mean(dim=0).cpu().numpy()


# Column layout: [overview PCA | director emb | actor emb | genre multi-hot | day_norm, year_idx]
feature_dim = n_components + EMBED_DIM + EMBED_DIM + len(all_genres) + 2
assert overview_reduced.shape == (len(df), n_components), (
    "overview_reduced is out of sync with df / n_components"
)

final_feature_matrix = np.zeros((len(df), feature_dim), dtype=np.float32)
embed_batch_size = globals().get("EMBED_BATCH_SIZE", 32)  # defaults to 32 when not defined
for i in tqdm(range(0, len(df), embed_batch_size), desc="Combine features"):
    batch_slice = slice(i, min(i + embed_batch_size, len(df)))
    meta = df.iloc[batch_slice]

    other_batch_list = []
    # Zipping the needed columns avoids building a Series per row (iterrows).
    for directors, cast, genres, dn, yi in zip(
        meta["directors"], meta["cast_top5"], meta["genres"],
        meta["day_norm"], meta["year_idx"],
    ):
        v_dirs   = _mean_name_embedding(directors, director2idx, director_emb_layer)
        v_actors = _mean_name_embedding(cast, actor2idx, actor_emb_layer)

        # Multi-hot genre indicator.
        v_genre = np.zeros(len(all_genres), dtype=np.float32)
        for g in genres.split(","):
            genre = g.strip()
            if genre in genre2idx:
                v_genre[genre2idx[genre]] = 1.0

        other_batch_list.append(
            np.concatenate(
                [v_dirs, v_actors, v_genre, np.array([dn, yi], dtype=np.float32)], axis=0
            )
        )

    other_batch = np.vstack(other_batch_list)  # (batch, 32+32+#genres+2)
    final_feature_matrix[batch_slice] = np.hstack(
        [overview_reduced[batch_slice], other_batch]
    )

print("Final feature matrix shape:", final_feature_matrix.shape)


Combine features: 100%|██████████| 32/32 [00:00<00:00, 59.20it/s]

Final feature matrix shape: (1019, 341)





In [None]:
# Cell 7) Save the final feature matrix
# Written to the current working directory as a .npy file; the path is relative.
np.save("movie_feature_matrix_reduced.npy", final_feature_matrix)
print("Saved reduced embeddings as movie_feature_matrix_reduced.npy")


Saved reduced embeddings as movie_feature_matrix_reduced.npy


In [None]:
# --- Paths (edit if needed) ---------------------------------------
CSV_PATH   = "/content/drive/MyDrive/movies_2015_plus_full_filtered.csv"      # source metadata
EMB_NPY    = "movie_feature_matrix_reduced.npy"         # saved feature matrix
OUT_CSV    = "movie_embedding_lookup.csv"               # output CSV
# -----------------------------------------------------------------

import numpy as np
import pandas as pd

# Fixed widths of the non-PCA column groups (the column-name builder below
# labels 32 director columns, 32 actor columns and 2 date columns).
NAME_EMB_DIM = 32
N_DATE_COLS  = 2

# 1) Load metadata and the feature matrix
df_meta = pd.read_csv(CSV_PATH)
emb_mat = np.load(EMB_NPY)            # shape: (n_movies, d)

# Rows are matched to metadata purely by position, so a length mismatch would
# otherwise be padded with NaN by the concat below instead of raising.
if emb_mat.shape[0] != len(df_meta):
    raise ValueError(
        f"Row mismatch: {EMB_NPY} has {emb_mat.shape[0]} rows "
        f"but {CSV_PATH} has {len(df_meta)}"
    )

# 2) Rebuild the sorted genre list from the metadata
all_genres = sorted(
    {
        g.strip()
        for txt in df_meta["genres"].fillna("").astype(str)
        for g in txt.split(",")
        if g.strip()
    }
)

# Whatever width is left after the fixed groups belongs to the overview PCA
# block, so the PCA dimension does not need to be hard-coded here.
n_pca = emb_mat.shape[1] - 2 * NAME_EMB_DIM - len(all_genres) - N_DATE_COLS
if n_pca < 0:
    raise ValueError(
        f"{EMB_NPY} has only {emb_mat.shape[1]} columns, fewer than the "
        f"{2 * NAME_EMB_DIM + len(all_genres) + N_DATE_COLS} metadata columns expected"
    )

# 3) Build column names (same order as the matrix: PCA → director → actor → genres → date)
cols = (
    [f"overview_pca_{i+1}" for i in range(n_pca)]
  + [f"director_emb_{i}"    for i in range(NAME_EMB_DIM)]
  + [f"actor_emb_{i}"       for i in range(NAME_EMB_DIM)]
  + [f"genre_{g}"           for g in all_genres]
  + ["day_norm", "year_idx"]                          # last two columns
)

# 4) Feature DataFrame
df_emb = pd.DataFrame(emb_mat, columns=cols)

# 5) Prepend identifier columns (only those present in the metadata)
id_cols = [c for c in ["tmdb_id", "title", "release_date"] if c in df_meta.columns]
df_out  = pd.concat([df_meta[id_cols].reset_index(drop=True), df_emb], axis=1)

# 6) Save as CSV
df_out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"✅  Saved → {OUT_CSV}  (rows={df_out.shape[0]:,}, cols={df_out.shape[1]:,})")


✅  Saved → movie_embedding_lookup.csv  (rows=1,019, cols=344)
