# Import Library

In [1]:
import faiss
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

from utils.domain import fashion
from utils.item_generator import generate_items
from utils.clickstream_generator import generate_clickstream_from_metadata

# Dataset

## Item Metadata

In [6]:
# Fixed scratch directory that holds every artefact produced by this notebook.
# (Plain string literal: the previous f-string had no placeholders.)
output_path = Path("/tmp/recsys/dataset/fashion/").resolve()
output_path.mkdir(parents=True, exist_ok=True)
item_metadata_path = output_path / "item_metadata.parquet"

# Generate 10,000 items from `fashion.Fashion` and persist them as Parquet so
# later cells can read the catalogue back from `item_metadata_path`.
df = generate_items(fashion.Fashion, num_items=10000)
df.to_parquet(item_metadata_path)

## Text Vector

### SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer

# Baseline encoder: turn every product title into a normalised float32 vector.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.array(
    model.encode(df["title"].tolist(), normalize_embeddings=True)
).astype(np.float32)

# Flat inner-product index over the title vectors.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Keep the vectors next to the item metadata as plain Python lists.
df["text_vector"] = embeddings.tolist()


def find_similar_products(query_text: str, top_k: int = 5) -> None:
    """Print the catalogue titles most similar to ``query_text``.

    The query is encoded with the notebook-level ``model`` and looked up in the
    notebook-level FAISS ``index``; titles are read from the notebook-level
    ``df`` by row position.

    NOTE: ``df`` must still be the item-metadata frame the index was built
    from. A later cell rebinds ``df`` to the clickstream data, after which
    this function no longer works.

    Args:
        query_text: Product name to search for.
        top_k: Maximum number of neighbours to print.
    """
    query_vec = model.encode([query_text], normalize_embeddings=True).astype("float32")
    scores, indices = index.search(query_vec, top_k)

    print(f"\n[🔍 입력 상품명] {query_text}\n")
    print("[📌 유사한 상품]")

    # FAISS pads the result with index -1 when the index holds fewer than
    # top_k vectors; `iloc[-1]` would silently report the last row as a hit.
    hits = [(idx, score) for idx, score in zip(indices[0], scores[0]) if idx >= 0]

    # Look the column up once instead of materialising a full row per hit.
    titles = df["title"]
    for rank, (idx, sim_score) in enumerate(hits, start=1):
        print(f"{rank}. {titles.iloc[idx]} (score: {sim_score:.4f})")


# Try the baseline search with two sample Korean product queries.
for sample_query in ("화이트 반팔 티셔츠", "여름 린넨 원피스"):
    find_similar_products(sample_query)

### e5-base

In [None]:
# Summary: to improve embedding quality for Korean product titles, switch to a
# multilingual, retrieval-oriented model (E5). Indexed documents get the
# "passage: " prefix and search queries get the "query: " prefix.
from sentence_transformers import SentenceTransformer
import faiss

# Retrieval-oriented embedding model with strong multilingual (incl. Korean) quality.
# Reference: intfloat/multilingual-e5-base (documents: "passage: ", queries: "query: ")
model = SentenceTransformer("intfloat/multilingual-e5-base")

# Index side: prepend the passage prefix, as the E5 model guide prescribes.
corpus_texts = ["passage: " + title for title in df["title"].tolist()]
embeddings = np.array(
    model.encode(corpus_texts, normalize_embeddings=True)
).astype(np.float32)

# Flat inner-product index over the prefixed title vectors.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Overwrite the stored vectors with the E5 ones, as plain Python lists.
df["text_vector"] = embeddings.tolist()


def find_similar_products(query_text: str, top_k: int = 5) -> None:
    """Print the catalogue titles most similar to ``query_text`` (E5 variant).

    The query is prefixed with ``"query: "``, encoded with the notebook-level
    ``model`` and looked up in the notebook-level FAISS ``index``; titles are
    read from the notebook-level ``df`` by row position.

    NOTE: ``df`` must still be the item-metadata frame the index was built
    from. A later cell rebinds ``df`` to the clickstream data, after which
    this function no longer works.

    Args:
        query_text: Product name to search for (without the E5 prefix).
        top_k: Maximum number of neighbours to print.
    """
    # E5 expects search queries to carry the "query: " prefix.
    query_vec = model.encode(
        ["query: " + query_text], normalize_embeddings=True
    ).astype("float32")
    scores, indices = index.search(query_vec, top_k)

    print(f"\n[🔍 입력 상품명] {query_text}\n")
    print("[📌 유사한 상품]")

    # FAISS pads the result with index -1 when the index holds fewer than
    # top_k vectors; `iloc[-1]` would silently report the last row as a hit.
    hits = [(idx, score) for idx, score in zip(indices[0], scores[0]) if idx >= 0]

    # Look the column up once instead of materialising a full row per hit.
    titles = df["title"]
    for rank, (idx, sim_score) in enumerate(hits, start=1):
        print(f"{rank}. {titles.iloc[idx]} (score: {sim_score:.4f})")


# Run the same two sample queries as the MiniLM cell so the results of the two
# models can be compared side by side.
find_similar_products("화이트 반팔 티셔츠")
# Typo fix: "린넬" -> "린넨" (linen), matching the query used with the MiniLM model.
find_similar_products("여름 린넨 원피스")

## Image Vector

## Clickstream Dataset

In [None]:
# Build the clickstream dataset from the saved item metadata. The meaning of
# each option is defined by the project helper; the grouping below is only
# for readability.
generate_clickstream_from_metadata(
    # Input catalogue and output location.
    item_metadata_path=item_metadata_path,
    save_path=output_path,
    # Fixed seed and start date for the generated events.
    seed=42,
    start_date=datetime(year=2025, month=9, day=1),
    # Scale of the generated data.
    total_users=10_000,
    users_per_partition=1_000,
    n_sessions_per_user=5,
    # Action mix: weights line up positionally with `actions`
    # (presumably sampling probabilities; confirm in the helper).
    actions=["click", "wishlist", "cart", "purchase"],
    action_weights=[0.8, 0.1, 0.05, 0.05],
    # Item attributes handed to the helper as similarity keys.
    similarity_keys=["category", "style", "color"],
)

# Load Dataset

In [None]:
# Read the clickstream data back from the output directory.
# NOTE: this rebinds `df`, replacing the item-metadata frame used earlier.
df = pd.read_parquet(output_path / "clickstream")

In [None]:
# Show the (rows, columns) shape of the clickstream frame.
df.shape

In [None]:
# Preview the first rows of the clickstream frame.
df.head()

# Item Frequency

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of per-item row counts: x is how many clickstream rows an item_id
# has, y is how many items share that count (one bin per integer count).
plt.figure(figsize=(10, 5), dpi=100)
sns.histplot(x=df["item_id"].value_counts(), binwidth=1).set(
    title="Item Frequency",
    xlabel="Count",
    ylabel="Frequency",
)
plt.show()

# User Frequency

In [None]:
# NOTE(review): this cell only previews the clickstream frame again; no
# user-frequency analysis is computed under this heading yet.
df.head()