In [2]:
import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor

url = "https://raw.communitydragon.org/latest/plugins/rcp-be-lol-game-data/global/default/v1/profile-icons/"
save_dir = "lol_profile_icons"
os.makedirs(save_dir, exist_ok=True)

html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

links = [
    a["href"] for a in soup.find_all("a")
    if a.has_attr("href") and (a["href"].endswith(".png") or a["href"].endswith(".jpg"))
]
print(f"找到 {len(links)} 个头像文件")

def download_file(link):
    filename = link.split("/")[-1]
    img_url = url + filename
    try:
        r = requests.get(img_url, timeout=10)
        r.raise_for_status()
        with open(os.path.join(save_dir, filename), "wb") as f:
            f.write(r.content)
        return f"下载成功 {filename}"
    except Exception as e:
        return f"失败 {filename}: {e}"

# 使用线程池并行下载（比如开 16 个线程）
with ThreadPoolExecutor(max_workers=16) as executor:
    for msg in executor.map(download_file, links):
        print(msg)


找到 4720 个头像文件
下载成功 0.jpg
下载成功 1.jpg
下载成功 10.jpg
下载成功 1000.jpg
下载成功 10001.jpg
下载成功 10002.jpg
下载成功 10003.jpg
下载成功 10004.jpg
下载成功 10005.jpg
下载成功 1001.jpg
下载成功 1002.jpg
下载成功 1003.jpg
下载成功 1004.jpg
下载成功 1005.jpg
下载成功 1006.jpg
下载成功 1007.jpg
下载成功 1008.jpg
下载成功 1009.jpg
下载成功 1010.jpg
下载成功 1011.jpg
下载成功 1012.jpg
下载成功 1013.jpg
下载成功 1014.jpg
下载成功 1015.jpg
下载成功 1016.jpg
下载成功 1017.jpg
下载成功 1018.jpg
下载成功 1019.jpg
下载成功 1020.jpg
下载成功 1021.jpg
下载成功 1022.jpg
下载成功 1023.jpg
下载成功 1024.jpg
下载成功 1025.jpg
下载成功 1026.jpg
下载成功 1027.jpg
下载成功 1028.jpg
下载成功 1029.jpg
下载成功 1030.jpg
下载成功 1031.jpg
下载成功 1032.jpg
下载成功 1033.jpg
下载成功 1034.jpg
下载成功 1035.jpg
下载成功 1036.jpg
下载成功 1037.jpg
下载成功 1038.jpg
下载成功 1039.jpg
下载成功 1040.jpg
下载成功 1041.jpg
下载成功 1042.jpg
下载成功 1043.jpg
下载成功 1044.jpg
下载成功 1045.jpg
下载成功 1046.jpg
下载成功 1047.jpg
下载成功 1048.jpg
下载成功 1049.jpg
下载成功 1051.jpg
下载成功 1052.jpg
下载成功 1053.jpg
下载成功 1054.jpg
下载成功 1055.jpg
下载成功 1056.jpg
下载成功 1057.jpg
下载成功 1058.jpg
下载成功 1059.jpg
下载成功 1060.jpg
下载成功 1061.jpg
下载成功 1062.jpg
下载成功 1063

In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [13]:
import os, csv
from IPython.display import display
from ipywidgets import widgets
from PIL import Image

img_dir = "./drive/MyDrive/lol_profile_icons"
csv_path = "icon_filted.csv"

# 读取已有标注
labels = {}
if os.path.exists(csv_path):
    with open(csv_path) as f:
        for line in f:
            fname, label = line.strip().split(",")
            labels[fname] = label

# 找到还没标的文件
files = [f for f in os.listdir(img_dir) if f not in labels]

# 打开输出文件（追加模式）
out = open(csv_path, "a", newline="")
writer = csv.writer(out)

# 当前图片索引
idx = 0
img_widget = widgets.Image()
btn_like = widgets.Button(description="喜欢 👍", button_style="success")
btn_dislike = widgets.Button(description="不喜欢 👎", button_style="danger")
btn_skip = widgets.Button(description="跳过 ⏭️")

def show_image(i):
    fname = files[i]
    with open(os.path.join(img_dir, fname), "rb") as f:
        img_widget.value = f.read()
    img_widget.format = 'png'
    img_widget.filename = fname

def save_label(label):
    global idx
    fname = files[idx]
    labels[fname] = label
    writer.writerow([fname, label])
    out.flush()
    idx += 1
    if idx < len(files):
        show_image(idx)
    else:
        print("✅ 全部标注完成！")

btn_like.on_click(lambda b: save_label("1"))
btn_dislike.on_click(lambda b: save_label("0"))
btn_skip.on_click(lambda b: save_label(""))

display(img_widget, btn_like, btn_dislike, btn_skip)
show_image(idx)


Image(value=b'')

Button(button_style='success', description='喜欢 👍', style=ButtonStyle())

Button(button_style='danger', description='不喜欢 👎', style=ButtonStyle())

Button(description='跳过 ⏭️', style=ButtonStyle())

In [18]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. 载入 CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 2. 定义 Dataset
class IconDataset(Dataset):
    def __init__(self, csv_file, img_dir, processor):
        df = pd.read_csv(csv_file, names=["fname", "label"])

        # 过滤掉 NaN 或空字符串
        df = df.dropna(subset=["label"])
        df = df[df["label"].astype(str).str.strip() != ""]

        # 转成 int
        df["label"] = df["label"].astype(int)

        self.data = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.img_dir, row["fname"])
        image = Image.open(img_path).convert("RGB")
        inputs = self.processor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)
        label = torch.tensor(int(row["label"]), dtype=torch.long)
        return pixel_values, label

img_dir = "./drive/MyDrive/lol_profile_icons"
dataset = IconDataset("./drive/MyDrive/icon_labels.csv", img_dir, clip_processor)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# 冻结 CLIP 参数
for p in clip_model.parameters():
    p.requires_grad = False

# 定义分类头
class Classifier(torch.nn.Module):
    def __init__(self, feature_dim, hidden_dim=256):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(feature_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 2)  # 二分类
        )
    def forward(self, x):
        return self.net(x)

# CLIP 输出的特征维度 = 512 (vit-base-patch32)
clf = Classifier(feature_dim=512).to(device)


In [21]:
from torch.utils.data import random_split

# 划分训练/验证集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)

EPOCHS = 5
for epoch in range(EPOCHS):
    clf.train()
    for pixel_values, labels in train_loader:
        pixel_values, labels = pixel_values.to(device), labels.to(device)

        with torch.no_grad():
            feats = clip_model.get_image_features(pixel_values)
        logits = clf(feats)

        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 验证
    clf.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for pixel_values, labels in val_loader:
            pixel_values, labels = pixel_values.to(device), labels.to(device)
            feats = clip_model.get_image_features(pixel_values)
            logits = clf(feats)
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    acc = correct / total
    print(f"Epoch {epoch+1}: val acc={acc:.4f}")


Epoch 1: val acc=0.9457
Epoch 2: val acc=0.9674
Epoch 3: val acc=0.9674
Epoch 4: val acc=0.9783
Epoch 5: val acc=0.9674


In [30]:
# 做推理


import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

img_dir = "./drive/MyDrive/lol_profile_icons"   # 你的图片目录

class InferenceDataset(Dataset):
    def __init__(self, img_dir, exts=(".png", ".jpg", ".jpeg", ".webp")):
        self.img_dir = img_dir
        self.files = sorted([f for f in os.listdir(img_dir)
                             if f.lower().endswith(exts)])
    def __len__(self):
        return len(self.files)
    def __getitem__(self, idx):
        fname = self.files[idx]
        path = os.path.join(self.img_dir, fname)
        img = Image.open(path).convert("RGB")
        inputs = clip_processor(images=img, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)  # [3, H, W]
        return pixel_values, fname  # 注意：这里返回文件名，不返回label


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
infer_ds = InferenceDataset(img_dir)
infer_loader = DataLoader(infer_ds, batch_size=32, shuffle=False)  # 保持顺序

probs_all, names_all = [], []
with torch.no_grad():
    for pixel_values, fnames in infer_loader:
        feats = clip_model.get_image_features(pixel_values.to(device))
        logits = clf(feats)  # 你之前训练好的分类头
        probs = torch.softmax(logits, dim=1)[:, 1]  # “喜欢”的概率
        probs_all.extend(probs.cpu().numpy().tolist())
        names_all.extend(list(fnames))

pred_df = pd.DataFrame({"fname": names_all, "prob_like": probs_all})
pred_df.to_csv("icon_label_predictions.csv", index=False)
print("总数：", len(pred_df))


总数： 4720


In [33]:
# 如果你有 icon_labels.csv（可能只标了部分）
labels_df = pd.read_csv("./drive/MyDrive/icon_labels.csv", names=["fname", "label"])
# 有的行可能空；先别强转 int
out_df = pred_df.merge(labels_df, on="fname", how="left")
out_df.to_csv("./drive/MyDrive/predictions_with_labels.csv", index=False)


In [37]:
import os
import matplotlib.pyplot as plt
from ipywidgets import interact, IntSlider
from PIL import Image
import pandas as pd

# 读取完整预测结果（prob_like 必须在里面）
pred_df = pd.read_csv("./drive/MyDrive/icon_label_predictions.csv")  # 或 predictions_with_labels.csv

# 按概率排序（从喜欢到不喜欢）
sorted_df = pred_df.sort_values("prob_like", ascending=False).reset_index(drop=True)

img_dir = "./drive/MyDrive/lol_profile_icons"  # 改成你的图片目录

def show_page(page=0, per_page=8):
    start = page * per_page
    end = start + per_page
    df = sorted_df.iloc[start:end]

    cols = 4
    rows = (per_page + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
    fig.suptitle(f"Page {page} (prob_like sorted)", fontsize=16)

    for ax, (_, row) in zip(axes.flat, df.iterrows()):
        img = Image.open(os.path.join(img_dir, row["fname"])).convert("RGB")
        ax.imshow(img)
        ax.axis("off")
        ax.set_title(f"p={row['prob_like']:.2f}")

    # 多余 subplot 清空
    for ax in axes.flat[len(df):]:
        ax.axis("off")
    plt.show()

# 翻页交互：最大页数 = 总数 / 每页数量
max_page = len(sorted_df) // 8
interact(show_page, page=IntSlider(min=0, max=max_page, step=1, value=0));


interactive(children=(IntSlider(value=0, description='page', max=590), IntSlider(value=8, description='per_pag…