In [42]:
!pip -q install -U sentence-transformers joblib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [43]:
import os, re, json, glob
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from scipy import sparse
import matplotlib.pyplot as plt

# Optional (heatmap)
import seaborn as sns

# Embeddings
from sentence_transformers import SentenceTransformer

In [44]:
DATA_DIR = "/kaggle/input"
dataset_folder = None
for p in glob.glob(os.path.join(DATA_DIR, "*top-15000*")):
    dataset_folder = p
    break

if dataset_folder is None:
    raise FileNotFoundError("Không tìm thấy folder dataset top-15000 trong /kaggle/input")

csvs = glob.glob(os.path.join(dataset_folder, "*.csv"))
if not csvs:
    # đôi khi dataset có folder con
    csvs = glob.glob(os.path.join(dataset_folder, "**", "*.csv"), recursive=True)

if not csvs:
    raise FileNotFoundError("Không tìm thấy file .csv trong dataset folder.")

csv_path = csvs[0]
print("Using CSV:", csv_path)

df_raw = pd.read_csv(csv_path)
print(df_raw.shape)
df_raw.head(3)

Using CSV: /kaggle/input/top-15000-ranked-anime-dataset-update-to-32025/top_15000_anime.csv
(15000, 24)


Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,...,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
0,52991,https://myanimelist.net/anime/52991/Sousou_no_...,https://cdn.myanimelist.net/images/anime/1015/...,Sousou no Frieren,Frieren: Beyond Journey's End,葬送のフリーレン,9.29,"Adventure, Drama, Fantasy",,Shounen,...,"Aniplex, Dentsu, Shogakukan-Shueisha Productio...",Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,1.0,128,76513,734207.0,1225468
1,5114,https://myanimelist.net/anime/5114/Fullmetal_A...,https://cdn.myanimelist.net/images/anime/1208/...,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,9.1,"Action, Adventure, Drama, Fantasy",Military,Shounen,...,"Aniplex, Square Enix, Mainichi Broadcasting Sy...",Bones,Manga,24 min per ep,R - 17+ (violence & profanity),2.0,3,236798,2249670.0,3577489
2,9253,https://myanimelist.net/anime/9253/Steins_Gate,https://cdn.myanimelist.net/images/anime/1935/...,Steins;Gate,Steins;Gate,STEINS;GATE,9.07,"Drama, Sci-Fi, Suspense","Psychological, Time Travel",,...,"Frontier Works, Media Factory, Kadokawa Shoten...",White Fox,Visual novel,24 min per ep,PG-13 - Teens 13 or older,3.0,14,198296,1483605.0,2737980


In [45]:
def norm_col(c: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", str(c).strip().lower())

df = df_raw.copy()
df.columns = [norm_col(c) for c in df.columns]

def pick_col(cols, candidates):
    for c in candidates:
        if c in cols:
            return c
    # fallback: find substring
    for c in cols:
        for key in candidates:
            if key in c:
                return c
    return None

cols = set(df.columns)

c_id   = pick_col(cols, ["anime_id","mal_id","id"])
c_title= pick_col(cols, ["title","name","anime_title"])
c_syn  = pick_col(cols, ["synopsis","description","sypnosis","overview"])
c_gen  = pick_col(cols, ["genres","genre"])
c_type = pick_col(cols, ["type","format"])
c_eps  = pick_col(cols, ["episodes","eps"])
c_score= pick_col(cols, ["score","rating","scored"])
c_scby = pick_col(cols, ["scored_by","score_count","users_scored","members_scored","scorers"])
c_mem  = pick_col(cols, ["members","member","favorites","favourites"])
c_pop  = pick_col(cols, ["popularity","popular"])
c_rank = pick_col(cols, ["rank","ranking"])
c_seas = pick_col(cols, ["season","premiered"])
c_stat = pick_col(cols, ["status"])
c_stu  = pick_col(cols, ["studios","studio","producer"])
c_src  = pick_col(cols, ["source"])
c_dur  = pick_col(cols, ["duration"])
c_img  = pick_col(cols, ["image_url","img_url","image","picture","poster","cover","imageurl"])
c_url  = pick_col(cols, ["url","link"])

print("Mapped columns:")
for k,v in {
    "id":c_id, "title":c_title, "synopsis":c_syn, "genres":c_gen, "type":c_type,
    "episodes":c_eps, "score":c_score, "scored_by":c_scby, "members":c_mem,
    "popularity":c_pop, "rank":c_rank,  "season":c_seas,
    "status":c_stat, "studios":c_stu, "source":c_src, "duration":c_dur,
    "image_url":c_img, "url":c_url
}.items():
    print(f"- {k:10s}: {v}")

Mapped columns:
- id        : anime_id
- title     : name
- synopsis  : synopsis
- genres    : genres
- type      : type
- episodes  : episodes
- score     : score
- scored_by : scored_by
- members   : members
- popularity: popularity
- rank      : rank
- season    : premiered
- status    : None
- studios   : studios
- source    : source
- duration  : duration
- image_url : image_url
- url       : image_url


In [46]:
print(sorted(
    set(
        g.strip()
        for s in df["genres"].dropna()
        for g in s.split(",")
    )
))

['Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love', 'Comedy', 'Drama', 'Ecchi', 'Erotica', 'Fantasy', 'Girls Love', 'Gourmet', 'Hentai', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural', 'Suspense']


In [47]:
before = len(df)
df = df[~df["genres"].str.contains("Hentai|Boys Love|Girls Love|Ecchi", case=False, na=False)]
print(f"Removed {before - len(df)} hentai items")

Removed 2440 hentai items


In [48]:
def safe_str(x):
    if pd.isna(x):
        return ""
    return str(x).strip()

def safe_num(x):
    try:
        if pd.isna(x): return np.nan
        return float(x)
    except:
        return np.nan

# Build canonical frame
out = pd.DataFrame()
out["anime_id"] = df[c_id] if c_id else np.arange(len(df))
out["title"] = df[c_title].apply(safe_str) if c_title else ""
out["synopsis"] = df[c_syn].apply(safe_str) if c_syn else ""
out["genres"] = df[c_gen].apply(safe_str) if c_gen else ""
out["type"] = df[c_type].apply(safe_str) if c_type else ""
out["episodes"] = df[c_eps].apply(safe_num) if c_eps else np.nan
out["score"] = df[c_score].apply(safe_num) if c_score else np.nan
out["scored_by"] = df[c_scby].apply(safe_num) if c_scby else np.nan
out["members"] = df[c_mem].apply(safe_num) if c_mem else np.nan
out["popularity"] = df[c_pop].apply(safe_num) if c_pop else np.nan
out["rank"] = df[c_rank].apply(safe_num) if c_rank else np.nan
out["season"] = df[c_seas].apply(safe_str) if c_seas else ""
out["status"] = df[c_stat].apply(safe_str) if c_stat else ""
out["studios"] = df[c_stu].apply(safe_str) if c_stu else ""
out["source"] = df[c_src].apply(safe_str) if c_src else ""
out["duration"] = df[c_dur].apply(safe_str) if c_dur else ""
out["image_url"] = df[c_img].apply(safe_str) if c_img else ""
out["url"] = df[c_url].apply(safe_str) if c_url else ""

# Missing values handling
out["title"] = out["title"].replace("", np.nan)
out = out.dropna(subset=["title"]).copy()

# Normalize text
for col in ["title","synopsis","genres","type","season","status","studios","source"]:
    out[col] = out[col].fillna("").astype(str).str.strip()

# Remove duplicates (title)
out = out.drop_duplicates(subset=["title"]).reset_index(drop=True)
ban = ["Hentai", "Boys Love", "Girls Love", "Ecchi"]
out = out[~out["genres"].str.contains("|".join(map(re.escape, ban)), case=False, na=False)].reset_index(drop=True)


# Outlier handling (clip numeric)
for col in ["score","scored_by","members","popularity","rank","episodes"]:
    if col in out.columns:
        x = out[col].astype(float)
        lo, hi = np.nanpercentile(x, 1), np.nanpercentile(x, 99)
        out[col] = x.clip(lo, hi)

# Score filled
out["score_filled"] = out["score"].fillna(out["score"].median())

# Weighted score (IMDB-style)
C = out["score_filled"].mean()
v = out["scored_by"].fillna(0.0)
m = np.nanpercentile(v, 90)  # threshold
R = out["score_filled"]
out["weighted_score"] = (v/(v+m))*R + (m/(v+m))*C
# Popularity value (dùng số lượng, không dùng rank)
out["pop_value"] = np.log1p(out["members"].fillna(0.0))

# Quality × Popularity (giúp top khác nhau rõ rệt)
out["quality_pop_score"] = out["score_filled"] * out["pop_value"]

# Combined text for vectorization
out["combined_text"] = (
    out["title"] + " " +
    out["synopsis"] + " " +
    out["genres"] + " " +
    out["studios"] + " " +
    out["type"] + " " +
    out["source"] + " " +
    out["season"] + " " +
    out["status"]
).str.replace(r"\s+", " ", regex=True).str.strip()

# Percentile-based normalization
out["score_pct"] = out["score_filled"].rank(pct=True)
out["pop_pct"] = out["members"].rank(pct=True)

# Balanced trade-off
out["quality_pop_pct"] = 0.5 * out["score_pct"] + 0.5 * out["pop_pct"]

out.head(3)

Unnamed: 0,anime_id,title,synopsis,genres,type,episodes,score,scored_by,members,popularity,...,image_url,url,score_filled,weighted_score,pop_value,quality_pop_score,combined_text,score_pct,pop_pct,quality_pop_pct
0,52991,Sousou no Frieren,During their decade-long quest to defeat the D...,"Adventure, Drama, Fantasy",TV,28.0,8.55,684444.12,1207507.04,132.56,...,https://cdn.myanimelist.net/images/anime/1015/...,https://cdn.myanimelist.net/images/anime/1015/...,8.55,8.339416,14.004069,119.734793,Sousou no Frieren During their decade-long que...,0.994903,0.995023,0.994963
1,5114,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,"Action, Adventure, Drama, Fantasy",TV,64.0,8.55,684444.12,1207507.04,132.56,...,https://cdn.myanimelist.net/images/anime/1208/...,https://cdn.myanimelist.net/images/anime/1208/...,8.55,8.339416,14.004069,119.734793,Fullmetal Alchemist: Brotherhood After a horri...,0.994903,0.995023,0.994963
2,9253,Steins;Gate,Eccentric scientist Rintarou Okabe has a never...,"Drama, Sci-Fi, Suspense",TV,24.0,8.55,684444.12,1207507.04,132.56,...,https://cdn.myanimelist.net/images/anime/1935/...,https://cdn.myanimelist.net/images/anime/1935/...,8.55,8.339416,14.004069,119.734793,Steins;Gate Eccentric scientist Rintarou Okabe...,0.994903,0.995023,0.994963


In [49]:
os.makedirs("artifacts/figures", exist_ok=True)

# 1) Score distribution (hist)
plt.figure()
out["score_filled"].plot(kind="hist", bins=30)
plt.title("Score Distribution")
plt.xlabel("score")
plt.ylabel("count")
plt.tight_layout()
plt.savefig("artifacts/figures/score_distribution.png", dpi=160)
plt.close()

# 2) Top genres frequency (barh)
def explode_genres(s):
    if not s:
        return []
    # nhiều dataset dùng "," hoặc "|" hoặc ";"
    s = str(s)
    if "|" in s: parts = s.split("|")
    elif ";" in s: parts = s.split(";")
    else: parts = s.split(",")
    return [g.strip() for g in parts if g.strip()]

genre_series = out["genres"].apply(explode_genres).explode()
genre_counts = genre_series.value_counts().head(20)

plt.figure()
genre_counts.sort_values().plot(kind="barh")
plt.title("Top 20 Genres Frequency")
plt.xlabel("count")
plt.tight_layout()
plt.savefig("artifacts/figures/top_genres.png", dpi=160)
plt.close()

# 3) Correlation heatmap
num_cols = [c for c in ["score_filled","scored_by","members","popularity","rank","episodes","weighted_score"] if c in out.columns]
corr = out[num_cols].corr()

plt.figure(figsize=(10, 7))
sns.heatmap(corr, annot=True, fmt=".2f", square=True, linewidths=0.5)
plt.title("Correlation Heatmap (Numeric Features)")
plt.tight_layout()
plt.savefig("artifacts/figures/correlation_heatmap.png", dpi=180)
plt.close()

print("✅ Saved EDA figures to artifacts/figures/")
print("Numeric cols used:", num_cols)

✅ Saved EDA figures to artifacts/figures/
Numeric cols used: ['score_filled', 'scored_by', 'members', 'popularity', 'rank', 'episodes', 'weighted_score']


In [57]:
# Niche gems: high quality, low popularity
niche = out[(out["score_pct"] > 0.75) & (out["pop_pct"] < 0.25)]

def explode_genres(s):
    if not s: return []
    if "|" in s: parts = s.split("|")
    elif ";" in s: parts = s.split(";")
    else: parts = s.split(",")
    return [g.strip() for g in parts if g.strip()]

genre_counts = niche["genres"].apply(explode_genres).explode().value_counts().head(10)

plt.figure(figsize=(7, 4))
genre_counts.sort_values().plot(kind="barh")
plt.title("Top Genres among Niche High-Quality Anime")
plt.xlabel("count")
plt.tight_layout()
plt.savefig("artifacts/figures/niche_genres.png", dpi=160)
plt.close()

In [56]:
# ================================
# Quality–Popularity Map
# ================================

# Percentile normalization
out["score_pct"] = out["score_filled"].rank(pct=True)
out["pop_pct"] = out["members"].rank(pct=True)

plt.figure(figsize=(8, 8))

plt.scatter(
    out["pop_pct"],
    out["score_pct"],
    alpha=0.25,
    s=10
)

# Median lines (chia 4 vùng)
plt.axhline(0.5, linestyle="--", linewidth=1)
plt.axvline(0.5, linestyle="--", linewidth=1)

# Annotate quadrants
plt.text(0.02, 0.95, "Niche Gems\n(high quality,\nlow popularity)", fontsize=10, va="top")
plt.text(0.65, 0.95, "Mainstream Hits\n(high quality,\nhigh popularity)", fontsize=10, va="top")
plt.text(0.02, 0.05, "Low Impact\n(low quality,\nlow popularity)", fontsize=10, va="bottom")
plt.text(0.65, 0.05, "Overhyped\n(low quality,\nhigh popularity)", fontsize=10, va="bottom")

plt.title("Quality–Popularity Map (Percentile-based)")
plt.xlabel("Popularity Percentile (members)")
plt.ylabel("Quality Percentile (score)")
plt.tight_layout()
plt.savefig("artifacts/figures/quality_popularity_map.png", dpi=160)
plt.close()

In [50]:
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.9,
    stop_words="english"
)

tfidf_matrix = tfidf.fit_transform(out["combined_text"])
print("TFIDF shape:", tfidf_matrix.shape)

os.makedirs("artifacts", exist_ok=True)
sparse.save_npz("artifacts/tfidf_matrix.npz", tfidf_matrix)

import joblib
joblib.dump(tfidf, "artifacts/tfidf_vectorizer.joblib")

TFIDF shape: (12557, 50000)


['artifacts/tfidf_vectorizer.joblib']

In [51]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
sbert = SentenceTransformer(model_name)

# encode
emb = sbert.encode(
    out["combined_text"].tolist(),
    batch_size=128,
    show_progress_bar=True,
    normalize_embeddings=True
).astype(np.float32)

print("Embeddings:", emb.shape, emb.dtype)

np.save("artifacts/sbert_embeddings.npy", emb)

with open("artifacts/embedding_meta.json", "w", encoding="utf-8") as f:
    json.dump({"model_name": model_name, "dim": int(emb.shape[1])}, f, ensure_ascii=False, indent=2)

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

Embeddings: (12557, 384) float32


In [52]:
# -------------------------
# A) Predict score (RMSE/MAE) - regression proxy
# -------------------------
feat_cols = [c for c in ["scored_by","members","popularity","rank","episodes"] if c in out.columns]
X = out[feat_cols].fillna(0.0).astype(float)
y = out["score_filled"].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

reg = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=2.0, random_state=42))
])
reg.fit(X_train, y_train)
pred = reg.predict(X_test)

rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
mae = float(mean_absolute_error(y_test, pred))

print("RMSE:", rmse, "MAE:", mae)

# -------------------------
# B) Ranking eval (Precision@K / Recall@K) using pseudo-relevance
#   Relevant = share >=1 genre with query item
# -------------------------
def get_genre_set(s):
    return set(explode_genres(s))

genre_sets = out["genres"].apply(get_genre_set).tolist()

def precision_recall_at_k(reco_indices, rel_set, k):
    topk = reco_indices[:k]
    hit = sum(1 for i in topk if i in rel_set)
    prec = hit / max(k, 1)
    rec = hit / max(len(rel_set), 1)
    return prec, rec

def topk_tfidf(idx, k=20):
    q = tfidf_matrix[idx]
    sims = (tfidf_matrix @ q.T).toarray().ravel()  # dot product
    sims[idx] = -1
    return np.argsort(-sims)[:k]

def topk_sbert(idx, k=20):
    q = emb[idx]
    sims = emb @ q  # cosine because normalized
    sims[idx] = -1
    return np.argsort(-sims)[:k]

# sample queries for speed
np.random.seed(42)
query_ids = np.random.choice(len(out), size=min(400, len(out)), replace=False)

Ks = [5, 10, 20]
metrics = {"rmse": rmse, "mae": mae, "precision_at_k": {}, "recall_at_k": {}}

for name, topk_fn in [("tfidf", topk_tfidf), ("sbert", topk_sbert)]:
    p = {k: [] for k in Ks}
    r = {k: [] for k in Ks}
    for qi in query_ids:
        qg = genre_sets[qi]
        if len(qg) == 0:
            continue
        rel = {j for j in range(len(out)) if j != qi and len(qg & genre_sets[j]) >= 1}
        if len(rel) == 0:
            continue
        recos = topk_fn(qi, k=max(Ks))
        for k in Ks:
            pk, rk = precision_recall_at_k(recos, rel, k)
            p[k].append(pk)
            r[k].append(rk)
    metrics["precision_at_k"][name] = {str(k): float(np.mean(p[k])) for k in Ks}
    metrics["recall_at_k"][name] = {str(k): float(np.mean(r[k])) for k in Ks}

metrics

RMSE: 0.1341143687599548 MAE: 0.07022665482168432


{'rmse': 0.1341143687599548,
 'mae': 0.07022665482168432,
 'precision_at_k': {'tfidf': {'5': 0.7444444444444444,
   '10': 0.7126984126984126,
   '20': 0.6783068783068782},
  'sbert': {'5': 0.7857142857142857,
   '10': 0.7455026455026457,
   '20': 0.7048941798941799}},
 'recall_at_k': {'tfidf': {'5': 0.0011996834967894095,
   '10': 0.0022702794586626153,
   '20': 0.00420482458636377},
  'sbert': {'5': 0.001245676375329276,
   '10': 0.0023567070558440715,
   '20': 0.004261792802560783}}}

In [53]:
# Save clean data
out.to_csv("artifacts/anime_clean.csv", index=False)

# title/id mapping
title_to_idx = {t: int(i) for i, t in enumerate(out["title"].tolist())}
id_to_idx = {str(out["anime_id"].iloc[i]): int(i) for i in range(len(out))}

with open("artifacts/title_to_idx.json", "w", encoding="utf-8") as f:
    json.dump(title_to_idx, f, ensure_ascii=False)

with open("artifacts/id_to_idx.json", "w", encoding="utf-8") as f:
    json.dump(id_to_idx, f, ensure_ascii=False)

with open("artifacts/metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

print("✅ Saved artifacts/")
print(sorted(glob.glob("artifacts/*"))[:20])

✅ Saved artifacts/
['artifacts/anime_clean.csv', 'artifacts/embedding_meta.json', 'artifacts/figures', 'artifacts/id_to_idx.json', 'artifacts/metrics.json', 'artifacts/sbert_embeddings.npy', 'artifacts/tfidf_matrix.npz', 'artifacts/tfidf_vectorizer.joblib', 'artifacts/title_to_idx.json']


In [54]:
df = pd.read_csv('/kaggle/working/artifacts/anime_clean.csv')
df.columns

Index(['anime_id', 'title', 'synopsis', 'genres', 'type', 'episodes', 'score',
       'scored_by', 'members', 'popularity', 'rank', 'season', 'status',
       'studios', 'source', 'duration', 'image_url', 'url', 'score_filled',
       'weighted_score', 'pop_value', 'quality_pop_score', 'combined_text',
       'score_pct', 'pop_pct', 'quality_pop_pct'],
      dtype='object')