In [6]:
# !pip install git+https://github.com/openai/CLIP.git

In [None]:
import os
import zipfile
from PIL import Image, ImageDraw

# 이미지 디렉토리 생성
image_dir = "/mnt/data/images"
os.makedirs(image_dir, exist_ok=True)

# 더미 이미지 생성 함수
def create_dummy_image(path, label):
    img = Image.new('RGB', (256, 256), color=(255, 255, 255))
    d = ImageDraw.Draw(img)
    d.text((10, 120), label, fill=(0, 0, 0))
    img.save(path)

# 예제용 이미지 파일 생성
image_names = [
    "earbuds.jpg", "case.jpg", "speaker.jpg", "charger.jpg",
    "headphones.jpg", "watch.jpg", "mouse.jpg", "stand.jpg",
    "camera.jpg", "tracker.jpg"
]

for name in image_names:
    create_dummy_image(os.path.join(image_dir, name), name.split(".")[0])

In [10]:
# [1] 멀티모달 추천 시스템 예제
# 목표: 이미지 + 리뷰(텍스트)를 이용해서 아이템 임베딩을 만들고, 유사 아이템 추천

# 예제 구성
# 데이터: 상품 이미지 URL, 텍스트 리뷰, 상품 ID

# 사용 기술: CLIP (이미지 + 텍스트 임베딩), FAISS (유사도 기반 검색)

import torch
import clip
from PIL import Image
import pandas as pd

# 1. CLIP 모델 불러오기
model, preprocess = clip.load("ViT-B/32", device="cuda")

# 2. 데이터셋 로딩 (상품 정보 + 리뷰 + 이미지 URL)
df = pd.read_csv("/content/drive/MyDrive/1recsys/dataset/product_data.csv")

# 이미지 경로 앞에 드라이브 디렉토리 붙이기
image_path = "/content/drive/MyDrive/1recsys/dataset"

# 3. 이미지 임베딩
def get_image_embedding(image_path):
    image = preprocess(Image.open(image_path)).unsqueeze(0).to("cuda")
    with torch.no_grad():
        img_emb = model.encode_image(image)
    return img_emb / img_emb.norm()

# 4. 텍스트 임베딩
def get_text_embedding(text):
    text_tokens = clip.tokenize([text]).to("cuda")
    with torch.no_grad():
        text_emb = model.encode_text(text_tokens)
    return text_emb / text_emb.norm()

# 5. 멀티모달 임베딩 조합
df["image_embedding"] = image_path + df["image_path"].apply(get_image_embedding)
df["text_embedding"] = df["review"].apply(get_text_embedding)
df["multi_embedding"] = df.apply(lambda x: 0.5 * x.image_embedding + 0.5 * x.text_embedding, axis=1)

# 6. FAISS로 유사 아이템 추천
import faiss
import numpy as np

embedding_matrix = np.vstack(df["multi_embedding"].to_list()).astype("float32")
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

def recommend_similar_items(item_idx, top_k=5):
    D, I = index.search(embedding_matrix[item_idx:item_idx+1], top_k + 1)
    return I[0][1:]

similar_ids = recommend_similar_items(item_idx=10)
print("추천 상품 ID들:", df.iloc[similar_ids]["product_id"].tolist())

FileNotFoundError: [Errno 2] No such file or directory: 'images/earbuds.jpg'

In [5]:
# [2] Cold-start 대응: 임베딩 + 클러스터링 기반 추천
# 목표: 상품 리뷰로부터 텍스트 임베딩 생성 → 군집화 → cold-start 아이템을 클러스터 기반으로 추천

# 예제 구성
# 데이터: 상품 리뷰, 아이템 ID

# 사용 기술: Sentence-BERT, KMeans, cosine similarity

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. 리뷰 임베딩 생성
model = SentenceTransformer('all-MiniLM-L6-v2')
df["text_embedding"] = df["review"].apply(lambda x: model.encode(x))

# 2. KMeans 클러스터링
embeddings = np.vstack(df["text_embedding"].to_list())
kmeans = KMeans(n_clusters=10, random_state=42).fit(embeddings)
df["cluster"] = kmeans.labels_

# 3. cold-start 아이템 처리
# 새 아이템 리뷰 → 임베딩 생성 → 가장 가까운 클러스터에 배정
def assign_cluster(new_review):
    new_emb = model.encode(new_review)
    cluster_id = kmeans.predict([new_emb])[0]
    return cluster_id

new_item_review = "Great battery life and nice camera"
cluster_id = assign_cluster(new_item_review)

# 4. 클러스터 내 인기 아이템 추천
popular_items = df[df["cluster"] == cluster_id].groupby("product_id").size().sort_values(ascending=False)
print("추천 아이템:", popular_items.index[:5].tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ValueError: Buffer dtype mismatch, expected 'const double' but got 'float'

In [None]:
import streamlit as st
import pandas as pd
import torch
import clip
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
import faiss

st.title("멀티모달 추천 시스템 데모")

# Load data
df = pd.read_csv("sample_multimodal_products.csv")

# Load models
clip_model, clip_preprocess = clip.load("ViT-B/32")
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Get embeddings
def get_image_embedding(path):
    try:
        image = clip_preprocess(Image.open(path)).unsqueeze(0)
        with torch.no_grad():
            img_emb = clip_model.encode_image(image)
        return img_emb[0].numpy()
    except:
        return np.zeros(512)

def get_text_embedding(text):
    return sbert_model.encode(text)

st.subheader("전체 상품 목록")
st.dataframe(df[["product_id", "product_name", "review"]])

# Generate embeddings (for demo: simple method)
st.subheader("임베딩 및 추천 결과")

if st.button("임베딩 생성 및 추천 보기"):
    with st.spinner("임베딩 및 추천 계산 중..."):
        df["img_emb"] = df["image_path"].apply(get_image_embedding)
        df["txt_emb"] = df["review"].apply(get_text_embedding)
        df["multi_emb"] = df.apply(lambda x: 0.5 * x.img_emb + 0.5 * x.txt_emb, axis=1)

        emb_matrix = np.vstack(df["multi_emb"].to_list()).astype("float32")
        index = faiss.IndexFlatL2(emb_matrix.shape[1])
        index.add(emb_matrix)

        target_idx = 0  # 첫 번째 상품
        D, I = index.search(emb_matrix[target_idx:target_idx+1], 6)

        st.markdown(f"**기준 상품:** {df.iloc[target_idx]['product_name']}")
        st.markdown("**리뷰:** " + df.iloc[target_idx]['review'])

        st.markdown("**추천 상품 목록:**")
        for i in I[0][1:]:
            st.markdown(f"- {df.iloc[i]['product_name']} ({df.iloc[i]['review']})")

# Cold-start clustering
st.subheader(" Cold-Start 상품 클러스터링")

new_review = st.text_input("새 상품 리뷰 입력:", "Amazing audio quality and great fit")

if st.button("클러스터링 기반 추천"):
    with st.spinner("리뷰 임베딩 및 클러스터링 중..."):
        txt_embs = np.vstack(df["txt_emb"].to_list()) if "txt_emb" in df else np.vstack(df["review"].apply(get_text_embedding).to_list())
        kmeans = KMeans(n_clusters=3, random_state=42).fit(txt_embs)
        df["cluster"] = kmeans.labels_

        new_emb = sbert_model.encode(new_review)
        cluster_id = kmeans.predict([new_emb])[0]

        cluster_items = df[df["cluster"] == cluster_id]
        st.markdown(f"**추천할 클러스터 (ID: {cluster_id}) 내 인기 상품:**")
        for _, row in cluster_items.iterrows():
            st.markdown(f"- {row['product_name']} ({row['review']})")


ModuleNotFoundError: No module named 'streamlit'

In [None]:
# GMM 기반 추천 예제 코드

from sentence_transformers import SentenceTransformer
from sklearn.mixture import GaussianMixture
import pandas as pd
import numpy as np

# 1. 데이터 불러오기
df = pd.read_csv("sample_multimodal_products.csv")

# 2. 텍스트 임베딩
model = SentenceTransformer('all-MiniLM-L6-v2')
df['embedding'] = df['review'].apply(lambda x: model.encode(x))

X = np.vstack(df['embedding'].to_list())

# 3. GMM 클러스터링
gmm = GaussianMixture(n_components=3, random_state=42)
df['cluster'] = gmm.fit_predict(X)

# 4. Cold-start 아이템 리뷰 입력
new_review = "Amazing sound quality and stylish design"
new_emb = model.encode(new_review).reshape(1, -1)

# 5. 클러스터 확률 예측
probs = gmm.predict_proba(new_emb)[0]

# 6. 상위 확률 클러스터에서 추천
top_cluster = np.argmax(probs)
recommended_items = df[df['cluster'] == top_cluster].sample(n=3)

print(f"GMM 추천 (Top cluster={top_cluster}, 확률={probs[top_cluster]:.2f})")
for _, row in recommended_items.iterrows():
    print(f"- {row['product_name']} | 리뷰: {row['review']}")

# 장점
# 단일 클러스터 할당이 아닌 "확률 기반 추천" 가능
