In [3]:
import ast
import json
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from surprise import SVD, Dataset, Reader


In [35]:
# Load the three MovieLens CSV tables from the local data/ directory.
# Columns noted by the original author:
#   movies  -> movieId, title, genres
#   ratings -> userId, movieId, rating
#   tags    -> userId, movieId, tag
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in ("data/movies.csv", "data/ratings.csv", "data/tags.csv")
)
print(movies.columns)


Index(['movieId', 'title', 'genres'], dtype='object')


In [5]:
# -------------------------------
# 3. Preprocess tags
# -------------------------------
# Drop missing tags first: astype(str) would otherwise turn NaN into the literal
# string "nan", which passes the length filter below and could end up as a tag.
tags = tags.dropna(subset=['tag'])
# .assign builds a new frame, so re-running this cell never writes into a filtered slice.
tags = tags.assign(tag=tags['tag'].astype(str).str.lower().str.strip())
tags = tags[tags['tag'].str.len() > 2]  # drop very short tags (2 characters or fewer)

# Filter rare tags: keep only tags that occur at least min_freq times overall.
tag_counts = tags['tag'].value_counts()
min_freq = 5
valid_tags = set(tag_counts[tag_counts >= min_freq].index)
tags = tags[tags['tag'].isin(valid_tags)]
# Aggregate tags per movie (top-N)
def top_tags_for_movie(df, top_n=10):
    """Return the most frequent tags of one movie as a single space-separated string.

    Args:
        df: Mapping-like object (e.g. a DataFrame group) whose ``'tag'`` entry is an
            iterable of tag strings.
        top_n: Maximum number of distinct tags to keep, most frequent first.

    Returns:
        The kept tags joined by single spaces; an empty string when there are no tags.
    """
    tag_frequency = Counter(df['tag'])
    ranked = tag_frequency.most_common(top_n)
    return " ".join(tag for tag, _count in ranked)

# Select the 'tag' column explicitly so apply() receives only that column and does not
# operate on the grouping column (presumably what triggered the pandas warning seen
# in this cell's output).
movie_tags = tags.groupby("movieId")[["tag"]].apply(top_tags_for_movie).reset_index()
movie_tags.columns = ["movieId", "tags_text"]


  movie_tags = tags.groupby("movieId").apply(top_tags_for_movie).reset_index()


In [6]:
# -------------------------------
# 4. Tags Embedding (SBERT)
# -------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")

# Left-join the per-movie tag text onto the movie table; movies without any tag text
# get an empty string so that every row can still be encoded.
movie_embeddings = (
    movies
    .merge(movie_tags, on="movieId", how="left")
    .assign(tags_text=lambda frame: frame["tags_text"].fillna(""))
)
# One sentence embedding per row, stored in the frame as a plain Python list.
movie_embeddings["embedding"] = model.encode(
    movie_embeddings["tags_text"].tolist(),
    show_progress_bar=True,
).tolist()


Batches: 100%|██████████| 2738/2738 [03:31<00:00, 12.97it/s]


In [7]:
# -------------------------------
# 5. Collaborative Filtering (SVD Surprise)
# -------------------------------
# Drop users/movies that have too few ratings.
min_user_ratings = 5
min_movie_ratings = 10

# transform("size") gives every row the size of its group, so the filter is a single
# vectorised boolean mask instead of one Python lambda call per user / per movie.
user_counts = ratings.groupby("userId")["userId"].transform("size")
filtered_ratings = ratings[user_counts >= min_user_ratings]
# NOTE: the movie filter runs on the already user-filtered frame; after it, some users
# may again have fewer than min_user_ratings ratings (same as the original logic).
movie_counts = filtered_ratings.groupby("movieId")["movieId"].transform("size")
filtered_ratings = filtered_ratings[movie_counts >= min_movie_ratings]

# Train Surprise SVD on the full filtered rating set.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(filtered_ratings[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

svd = SVD(n_factors=64, random_state=42)
svd.fit(trainset)

# Extract movie embeddings (rows of the qi matrix), keyed by the raw movieId.
movie_cf_embeddings = {
    trainset.to_raw_iid(inner_id): svd.qi[inner_id]
    for inner_id in trainset.all_items()
}

cf_df = pd.DataFrame([
    {"movieId": int(raw_id), "cf_embedding": factors}
    for raw_id, factors in movie_cf_embeddings.items()
])


In [36]:
# -------------------------------
# 6. Encode genres (one-hot)
# -------------------------------
# This cell replaces `movies` with its one-hot form (movieId, title, one column per genre).
# The guard makes a second run a no-op instead of a KeyError on the 'genres' column,
# which no longer exists once the cell has run.
if 'genres' in movies.columns:
    # Pipe-separated genre string -> list of genre names; non-string values -> empty list.
    genre_lists = movies['genres'].apply(lambda g: g.split('|') if isinstance(g, str) else [])
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(genre_lists)
    # Reuse the movies index so the column-wise concat aligns row-for-row even when
    # `movies` does not carry a default RangeIndex.
    genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies.index)
    movies = pd.concat([movies[['movieId', 'title']], genre_df], axis=1)


In [131]:
# Show the fitted genre vocabulary and how many one-hot dimensions it contributes.
print("Genres:", mlb.classes_)
print("Number of dimensions:", mlb.classes_.shape[0])

Genres: ['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']
Number of dimensions: 20


In [37]:
# -------------------------------
# 7. Merge all sources → Hybrid Embedding
# -------------------------------
genre_cols = [col for col in movies.columns if col not in ['movieId', 'title']]
# Inner join: only movies that have a collaborative-filtering vector are kept.
hybrid_df = movie_embeddings.merge(cf_df, on="movieId", how="inner")
# Bring in only the columns hybrid_df does not already have. movie_embeddings is built
# from `movies`, so on a clean top-to-bottom run it already carries 'title'; merging
# 'title' a second time would produce 'title_x'/'title_y' and break every later
# hybrid_df['title'] lookup.
new_cols = [col for col in ['title'] + genre_cols if col not in hybrid_df.columns]
hybrid_df = hybrid_df.merge(movies[['movieId'] + new_cols], on='movieId', how='left')

def build_hybrid(row, genre_columns=None):
    """Concatenate one movie's CF, tag and genre features into a single float32 vector.

    Args:
        row: A pandas Series holding 'cf_embedding', 'embedding' and one entry per
            genre column.
        genre_columns: Labels of the genre one-hot entries to read from ``row``.
            Defaults to the fitted ``mlb.classes_``.

    Returns:
        1-D ``np.float32`` array laid out as [cf_embedding, embedding, genre flags].
    """
    if genre_columns is None:
        genre_columns = mlb.classes_
    # Cast every part explicitly: the row slice is object-dtype, and concatenating it
    # as-is would make the whole result an object array.
    cf_vec = np.asarray(row['cf_embedding'], dtype=np.float32)
    tag_vec = np.asarray(row['embedding'], dtype=np.float32)
    genre_vec = row[list(genre_columns)].to_numpy(dtype=np.float32)
    return np.concatenate([cf_vec, tag_vec, genre_vec])

# Build one concatenated vector per movie (row-wise apply), then report the length of
# the first vector as a quick sanity check.
hybrid_df['hybrid_embedding'] = hybrid_df.apply(build_hybrid, axis="columns")
print("Hybrid embedding dimension:", len(hybrid_df['hybrid_embedding'].iat[0]))


Hybrid embedding dimension: 468


In [47]:
# Inspect which columns the merged hybrid frame ended up with.
print(hybrid_df.columns)

Index(['movieId', 'genres', 'tags_text', 'embedding', 'cf_embedding',
       '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western', 'hybrid_embedding', 'title'],
      dtype='object')


In [42]:
# -------------------------------
# 8. Connect Milvus + Collection
# -------------------------------
connections.connect(alias="default", host="localhost", port="19530")

fields = [
    FieldSchema(name="movieId", dtype=DataType.INT64, is_primary=True, auto_id=False),
    # Vector size is taken from the data so the schema follows the feature layout.
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=len(hybrid_df['hybrid_embedding'].iloc[0]))
]
schema = CollectionSchema(fields, description="Hybrid MovieLens Embeddings")

collection_name = "movielens_hybrid"
# Start from a clean slate: drop a leftover collection explicitly rather than relying
# on a bare `except` around Collection(...), which would also hide unrelated errors.
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)
collection = Collection(name=collection_name, schema=schema, using="default", shards_num=1)


In [107]:
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, utility
import numpy as np

# -------------------------------
# 1. Drop the old collection if it exists
# -------------------------------
if utility.has_collection("movielens_hybrid"):
    utility.drop_collection("movielens_hybrid")
    print("Existing collection dropped")

# -------------------------------
# 2. Define schema
# -------------------------------
# Derive the vector size from the data instead of hard-coding it, so the schema stays
# correct if the number of CF factors, the text model or the genre list changes.
embedding_dim = len(hybrid_df['hybrid_embedding'].iloc[0])
fields = [
    FieldSchema(name="movieId", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=200),  # assumes every title fits in 200 — TODO confirm
    FieldSchema(name="hybrid_embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)
]

schema = CollectionSchema(fields, description="MovieLens hybrid embeddings")

collection = Collection(name="movielens_hybrid", schema=schema)
print("New collection created")

# -------------------------------
# 3. Prepare entities
# -------------------------------
# Column-oriented payload: one list per schema field, in schema order.
ids = hybrid_df["movieId"].astype(int).tolist()
titles = hybrid_df["title"].astype(str).tolist()

# Scale each embedding to unit length before it is stored.
def normalize(vec):
    """Return ``vec`` as a float32 array scaled to unit L2 norm.

    Args:
        vec: Array-like of numbers.

    Returns:
        A new ``np.float32`` array. A vector whose norm is zero is returned
        unscaled rather than divided by zero.
    """
    arr = np.array(vec, dtype=np.float32)
    length = np.linalg.norm(arr)
    if length > 0:
        return arr / length
    return arr

# normalize() already returns unit-length rows and leaves all-zero rows untouched, so
# the matrix is NOT divided by its row norms a second time: that would be redundant for
# non-zero rows and would turn any all-zero row into NaN (0 / 0).
embeddings = np.vstack(
    hybrid_df['hybrid_embedding'].apply(normalize).values
).astype(np.float32)
entities = [ids, titles, embeddings]

# -------------------------------
# 4. Insert embeddings
# -------------------------------
collection.insert(entities)
print(f"Inserted {len(ids)} embeddings")

# -------------------------------
# 5. Create index
# -------------------------------
index_params = {
    "metric_type": "COSINE",
    "index_type": "HNSW",
    "params": {"M": 8, "efConstruction": 64}
}

collection.create_index("hybrid_embedding", index_params)
print("Index created")
print(collection.indexes)
print(collection.schema)

# -------------------------------
# 6. Load collection
# -------------------------------
collection.load()
print("Collection loaded and ready for search")



Existing collection dropped
New collection created
Inserted 31961 embeddings
Index created
[<pymilvus.orm.index.Index object at 0x000002B107A67110>]
{'auto_id': False, 'description': 'MovieLens hybrid embeddings', 'fields': [{'name': 'movieId', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 200}}, {'name': 'hybrid_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 468}}], 'enable_dynamic_field': False}
Collection loaded and ready for search


In [134]:
# -------------------------------
# Full pipeline: NL query -> LLM parse -> Milvus search
# -------------------------------
import google.generativeai as genai
import json
from pymilvus import connections
import numpy as np

# --- 1. Configure Google Gemini ---
# The API key is read from the environment so it never lands in the notebook or its
# history. Any key that was previously committed in this cell should be rotated.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=gemini_api_key)

# --- 2. Extract structured preferences from natural-language input ---
def extract_preferences(user_input: str):
    """Ask Gemini to turn a free-text movie request into a small preference dict.

    Args:
        user_input: The user's request in natural language.

    Returns:
        ``{"film_reference": <str or None>, "genres": <list of str>}``.

    Raises:
        ValueError: If the model's reply cannot be parsed as a dict, either as JSON
            or as a Python literal.
    """
    prompt = f"""
    Ambil informasi dari query user soal film.
    Output HARUS berupa Python dict (bukan penjelasan).
    
    Keys:
    - film_reference (string atau None)
    - genres (list of string)
    
    Contoh:
    Input: Aku pengen film kayak Avatar tapi lebih adventure
    Output: {{'film_reference': 'Avatar', 'genres': ['Adventure']}}
    
    Query: "{user_input}"
    """

    llm = genai.GenerativeModel("gemini-2.5-flash")
    response = llm.generate_content(prompt)

    raw = response.candidates[0].content.parts[0].text.strip()

    # Strip a surrounding Markdown code fence (```python ... ```), if present.
    raw = re.sub(r"^```[a-zA-Z]*\n?", "", raw)
    raw = re.sub(r"```$", "", raw).strip()

    # The prompt asks for a Python dict, so the reply may use single quotes and None.
    # Try strict JSON first, then fall back to a safe Python-literal parse. Blindly
    # replacing ' with " would corrupt titles that contain an apostrophe and still
    # fail on None.
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError) as exc:
            raise ValueError(f"Could not parse model output as a dict: {raw!r}") from exc

    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a dict from the model, got: {raw!r}")

    # Normalise genres to a list: missing/None -> [], a bare string -> one-element list.
    genres = parsed.get("genres") or []
    if isinstance(genres, str):
        genres = [genres]

    # Return only the two keys downstream code relies on.
    user_query = {
        "film_reference": parsed.get("film_reference"),
        "genres": list(genres),
    }

    return user_query

# --- 3. Simulated user input (natural language) ---
user_input = "Aku suka film mirip Armageddon, yang Science Fiction lah genrenya"
user_query = extract_preferences(user_input)
print("User Input:", user_input)

# --- 4. Milvus search driven by user_query ---
top_k = 5

# 4.1 Look up the reference movie's embedding
film_reference = user_query['film_reference']
if not isinstance(film_reference, str) or not film_reference.strip():
    # Without this guard a None reference would surface as an opaque TypeError below.
    raise ValueError("No reference movie could be extracted from the user input.")

# regex=False: the title comes from the LLM and may contain characters such as '(',
# '?' or '+' that must be matched literally. na=False treats missing titles as
# non-matches instead of propagating NaN into the boolean mask.
title_matches = hybrid_df['title'].str.contains(film_reference, case=False, regex=False, na=False)
ref_movie = hybrid_df[title_matches]
if ref_movie.empty:
    raise ValueError(f"Reference movie '{user_query['film_reference']}' not found in dataset.")

# When several titles match, the first one in hybrid_df order is used as the reference.
query_vector = np.array(ref_movie['hybrid_embedding'].values[0], dtype=np.float32).reshape(1, -1)
ref_movie_id = ref_movie['movieId'].values[0]
# Unit-normalise the query, matching how the stored vectors were prepared.
query_vector = query_vector / np.linalg.norm(query_vector, axis=1, keepdims=True)

# 4.2 Run search (ask for extra hits so the reference movie can be dropped afterwards)
results = collection.search(
    data=query_vector,
    anns_field="hybrid_embedding",
    param={"metric_type": "COSINE", "params": {"ef": 64}},
    limit=top_k + 10
)

# 4.3 Genre bonus function
def genre_bonus(movie_id):
    """Return the fraction of the user's requested genres that a movie has.

    Reads the module-level ``hybrid_df``, ``mlb`` and ``user_query``.

    Args:
        movie_id: The movieId to look up in ``hybrid_df``.

    Returns:
        0 when none of the requested genres is a known genre column; otherwise the
        share (0.0-1.0) of the matched requested genres that are set for this movie.
    """
    genre_flags = hybrid_df.loc[hybrid_df['movieId'] == movie_id, mlb.classes_].values[0]
    requested_positions = []
    for position, genre_name in enumerate(mlb.classes_):
        if genre_name in user_query['genres']:
            requested_positions.append(position)
    if len(requested_positions) == 0:
        return 0
    matched = genre_flags[requested_positions].sum()
    return matched / len(requested_positions)

# 4.4 Collect hits, skipping the reference movie itself.
# NOTE: genre_bonus is not applied here and no re-sorting is done; hits are kept in
# the order the search returned them, with the raw distance reported as "similarity".
final_results = [
    (
        hybrid_df.loc[hybrid_df['movieId'] == hit.id, 'title'].values[0],
        hit.distance,
        hit.distance,
    )
    for hits in results
    for hit in hits
    if hit.id != ref_movie_id
]

print(f"\nTop-{top_k} recommended movies:")
for rank, (title, sim, dist) in enumerate(final_results[:top_k], start=1):
    print(f"{rank}. {title} (similarity: {sim:.4f})")




User Input: Aku suka film mirip Armageddon, yang Science Fiction lah genrenya

Top-5 recommended movies:
1. Independence Day (a.k.a. ID4) (1996) (similarity: 0.7494)
2. Saint, The (1997) (similarity: 0.7427)
3. Twister (1996) (similarity: 0.7040)
4. Godzilla (1998) (similarity: 0.6932)
5. Speed 2: Cruise Control (1997) (similarity: 0.6826)
