In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


# Hub identifier passed to both the tokenizer loader and the model loader below.
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

# === Load the tokenizer and the model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
)

# === Read the CSV file ===
# Adjust this path as needed.
csv_path = "/gpfs/gibbs/project/yse/shared/yl2739/combined_chunks (1).csv"
df = pd.read_csv(csv_path)

# === Fail fast when the "content" column is absent ===
has_content_column = "content" in df.columns
if not has_content_column:
    raise ValueError("CSV 文件中没有名为 'content' 的列。请确认列名是否正确。")

# === Embedding generation ===
def _last_token_pool(last_hidden_state, attention_mask):
    """Return the hidden state of the last non-padding token of every sequence.

    Args:
        last_hidden_state: tensor indexed as (batch, position, hidden).
        attention_mask: tensor indexed as (batch, position); 1 marks a real
            token and 0 marks padding.

    Returns:
        A (batch, hidden) tensor holding one pooled vector per sequence.
    """
    # When the final position is a real token in every row, the batch is
    # left-padded (or not padded at all), so the last position is the answer.
    if int(attention_mask[:, -1].sum()) == attention_mask.shape[0]:
        return last_hidden_state[:, -1, :]
    # Right padding: the last real token sits at (number of real tokens - 1).
    # clamp() keeps an all-padding row from wrapping around to index -1.
    last_positions = (attention_mask.sum(dim=1) - 1).clamp(min=0)
    last_positions = last_positions.to(last_hidden_state.device)
    rows = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
    return last_hidden_state[rows, last_positions, :]


def get_embeddings(texts, batch_size=16):
    """Embed ``texts`` in batches with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded, truncated to 512 tokens), run through the
    model without gradient tracking, and pooled by taking the hidden state of
    the last non-padding token of every sequence.

    The previous implementation pooled position 0 as a "CLS token". The
    checkpoint loaded in this notebook is a causal decoder without a CLS
    token: position 0 can only attend to itself (or is a padding token when
    the tokenizer pads on the left), so that vector does not summarise the
    text. The model card pools the last token instead.

    The returned vectors are not L2-normalised; normalise them or use cosine
    similarity when comparing embeddings downstream.

    Args:
        texts: sliceable sequence of strings to embed.
        batch_size: number of texts sent through the model per forward pass.

    Returns:
        A list with one 1-D float32 numpy array per input text, in input
        order. An empty input yields an empty list.
    """
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        # list() so the tokenizer receives a plain list whatever the type of `texts`.
        batch_texts = list(texts[i:i + batch_size])
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = _last_token_pool(outputs.last_hidden_state, inputs["attention_mask"])
            # Cast to float32 first: numpy cannot represent bfloat16 tensors.
            embeddings.extend(batch_embeddings.float().cpu().numpy())
    return embeddings

# === Embed every value of the content column ===
# Missing values are replaced by empty strings and everything is cast to str.
content_as_str = df["content"].fillna("").astype(str)
all_texts = content_as_str.tolist()
embeddings = get_embeddings(all_texts)

# === Build a DataFrame with one column per embedding dimension ===
import numpy as np

first_vector = embeddings[0]
embedding_dim = len(first_vector)
embedding_columns = [f"emb_{i}" for i in range(embedding_dim)]
emb_df = pd.DataFrame(data=embeddings, columns=embedding_columns)

# Put the original columns and the embedding columns side by side.
# The index of df is reset so both frames line up row by row.
df_positional = df.reset_index(drop=True)
result_df = pd.concat([df_positional, emb_df], axis="columns")

# === Save the result ===
output_path = "/gpfs/gibbs/project/yse/shared/yl2739/combined_chunks_with_qwen_embedding.csv"
result_df.to_csv(output_path, index=False)
print(f"✅ 嵌入向量保存成功：{output_path}")


Embedding: 100%|██████████| 5901/5901 [59:11<00:00,  1.66it/s]  


✅ 嵌入向量保存成功：/gpfs/gibbs/project/yse/shared/yl2739/combined_chunks_with_qwen_embedding.csv


In [3]:
# Show the path of the Python interpreter that is running this kernel.
import sys
print(sys.executable)

/home/yl2739/.conda/envs/myenv/bin/python


In [4]:
# Print the version of the accelerate package importable from this kernel.
import accelerate
print(accelerate.__version__)


1.9.0


In [1]:
# Check whether PyTorch reports a usable CUDA device.
import torch
print(torch.cuda.is_available())  # if this prints False, the GPU really cannot be used


False
