### 挑选1948条负样本数据


In [None]:

import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# ========================
# Config & Device
# ========================
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
select_num = 1948  # number of generated samples to keep in the end

# ========================
# Load data
# ========================
real = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
fake = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")
print("[数据加载完成]")
print("真实负样本 shape:", real.shape)
print("生成负样本 shape:", fake.shape)

# Discriminator training set: label 1 = real sample, label 0 = generated sample.
X = np.concatenate((real, fake), axis=0)
y = np.repeat([1, 0], [len(real), len(fake)])


def _as_dataset(features, labels):
    """Wrap numpy features (cast to float32) and their labels in a TensorDataset."""
    feature_tensor = torch.tensor(features, dtype=torch.float32)
    label_tensor = torch.tensor(labels)
    return TensorDataset(feature_tensor, label_tensor)


# Hold out 20% of the samples as a validation split (fixed seed).
# NOTE(review): val_loader is built here but the training loop visible in this
# notebook never iterates over it.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
train_ds = _as_dataset(X_train, y_train)
val_ds = _as_dataset(X_val, y_val)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# ========================
# 定义判别器
# ========================
class SimpleDiscriminator(nn.Module):
    """Fully connected binary classifier over flattened inputs.

    The network is ``input_dim -> 512 -> 128 -> 1`` with ReLU between the
    linear layers and a final Sigmoid, so ``forward`` returns one value in
    (0, 1) per input row.

    Args:
        input_dim: Number of features in each (already flattened) input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = (input_dim, 512, 128)
        layers = []
        # Hidden stack: one Linear + ReLU pair per consecutive width pair.
        for fan_in, fan_out in zip(hidden_widths, hidden_widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output head: a single unit squashed to (0, 1).
        layers.append(nn.Linear(hidden_widths[-1], 1))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of the network for a 2-D batch ``x``."""
        probabilities = self.model(x)
        return probabilities

# Flattened feature count per sample: the product of every non-batch dimension.
# Unlike indexing shape[1] and shape[2] directly, this also works when the
# embeddings are not exactly 3-D, matching what flatten_batch() feeds the model.
input_dim = int(np.prod(real.shape[1:]))
model = SimpleDiscriminator(input_dim=input_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# BCELoss expects probabilities; the discriminator ends in a Sigmoid.
loss_fn = nn.BCELoss()

def flatten_batch(x):
    """Collapse every dimension after the first into one.

    Uses ``reshape`` rather than ``view`` so that non-contiguous inputs
    (for example a transposed tensor) are accepted instead of raising.

    Args:
        x: Tensor whose first dimension is the batch dimension.

    Returns:
        Tensor of shape ``(x.size(0), -1)`` holding the same elements.
    """
    return x.reshape(x.size(0), -1)

# ========================
# Train the discriminator
# ========================
print("\n[开始训练判别器]")
for epoch in range(10):
    model.train()
    total_loss = 0
    for features, labels in train_loader:
        features = features.to(device)
        # BCELoss needs float targets shaped like the (batch, 1) predictions.
        targets = labels.to(device).float().unsqueeze(1)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(flatten_batch(features)), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    # Report the mean of the per-batch losses for this epoch.
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Loss = {mean_loss:.4f}")

# ========================
# Score the generated samples and keep the top-N
# ========================
print("\n[对生成数据打分并挑选最真实的样本]")
model.eval()
# Keep the generated set on the host and move one chunk at a time to `device`,
# so scoring does not require the whole array to fit in device memory.
fake_tensor = torch.as_tensor(fake, dtype=torch.float32)
score_batch_size = 256
score_chunks = []
with torch.no_grad():
    for chunk in fake_tensor.split(score_batch_size):
        chunk_scores = model(flatten_batch(chunk.to(device)))
        # reshape(-1) keeps the result 1-D even for a single sample,
        # where squeeze() would collapse it to a 0-d value.
        score_chunks.append(chunk_scores.reshape(-1).cpu())
scores = torch.cat(score_chunks).numpy()

# Higher score = judged more "real" (label 1 was used for real samples).
# Clamp the requested count so that select_num == 0 selects nothing and a
# request larger than the pool selects the whole pool.
n_selected = max(0, min(select_num, len(scores)))
top_idx = np.argsort(scores)[len(scores) - n_selected:]
selected_fake = fake[top_idx]

# ========================
# Save and check
# ========================
save_path = "/exp_data/sjx/star/gan_data/selected_fake_embeddings_posthoc.npy"
np.save(save_path, selected_fake)
# Report the number actually written rather than the number requested.
print(f"\n[✓] 已保存判别器打分选出的 {len(selected_fake)} 条生成数据到: {save_path}")
print("保存数据 shape:", selected_fake.shape)


In [2]:
# Inspect the generated embeddings: element type and overall shape.
fake = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")
print("原始数据 dtype:", fake.dtype)
print("原始 shape:", fake.shape)

# Slice off the first 1948 rows and report the slice's dtype and size in GiB.
selected_fake = fake[:1948]
print("截取后 dtype:", selected_fake.dtype)
print("内存占用（GB）:", selected_fake.nbytes / 1024 ** 3)


In [6]:
import numpy as np

# Load the generated negative samples (the original note says 2435 were generated).
fake = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")
print("原始生成样本数量：", len(fake))

# Keep the leading 1948 rows and write them to their own .npy file.
save_path = "/exp_data/sjx/star/gan_data/selected_fake_embeddings_top1948.npy"
selected_fake = fake[:1948]
np.save(save_path, selected_fake)
print(f"[✓] 已保存前 1948 条生成样本到: {save_path}")
print("保存数据 shape:", selected_fake.shape)


### 合并生成负样本和真实负样本

In [8]:
import numpy as np

# Original negative training embeddings.
original_neg = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_train_embedding.npy")
print("原始负样本 shape:", original_neg.shape)

# The 1948 generated negatives, read from the uncompressed `top1948` .npy file.
# NOTE(review): in this notebook that file is written by the cell that slices
# the first 1948 generated rows, not by the discriminator-scored selection
# (which is saved under the `posthoc` name) — confirm this is the intended input.
gan_neg = np.load("/exp_data/sjx/star/gan_data/selected_fake_embeddings_top1948.npy")
print("GAN 生成负样本 shape:", gan_neg.shape)

# Stack real negatives first, generated negatives after, along the sample axis.
combined_neg = np.concatenate((original_neg, gan_neg))
print("合并后的负样本总数:", combined_neg.shape)

# Write the merged array to a new file.
save_path = "/exp_data/sjx/star/gan_data/negative_train_all_combined.npy"
np.save(save_path, combined_neg)
print(f"[✓] 合并保存成功，路径为: {save_path}")



In [1]:
import numpy as np
import os

# Paths
pos_path = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
neg_path = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'
save_dir = '/exp_data/sjx/star/first_data/ESM-embedding/all_train_data'
os.makedirs(save_dir, exist_ok=True)

# Open both inputs as read-only memory maps.
pos = np.load(pos_path, mmap_mode='r')
neg = np.load(neg_path, mmap_mode='r')

# Merge features: positives first, then negatives. The concatenated result is
# a fresh copy of both inputs.
X = np.concatenate([pos, neg], axis=0)
# Labels aligned with X: 1 for positive samples, 0 for negative samples.
y = np.concatenate([np.ones(len(pos), dtype=np.int64), np.zeros(len(neg), dtype=np.int64)], axis=0)

# Shuffle features and labels with one shared permutation. The generator is
# seeded so that re-running this cell writes the same dataset, in line with
# the fixed random_state=42 used for the train/validation split.
shuffle_seed = 42
idx = np.random.default_rng(shuffle_seed).permutation(len(X))
X = X[idx]
y = y[idx]

# Save
np.save(os.path.join(save_dir, 'all_train_data.npy'), X)
np.save(os.path.join(save_dir, 'all_train_labels.npy'), y)
print(f'[INFO] 合并完成，特征 shape: {X.shape}，标签 shape: {y.shape}')
print(f'[INFO] 已保存到: {save_dir}/all_train_data.npy, {save_dir}/all_train_labels.npy')