### Selecting 1948 generated negative samples


In [None]:

import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# ========================
# Config & Device
# ========================
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# How many generated samples are kept after discriminator scoring.
select_num = 1948


# Load the real and the generated negative embeddings from disk.
real = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
fake = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")
print("[Data loading completed]")
print("real negative samples shape:", real.shape)
print("generated negative samples shape:", fake.shape)


# Stack both sources along the sample axis; real samples are labelled 1,
# generated samples are labelled 0.
X = np.concatenate((real, fake))
y = np.concatenate((np.ones(len(real), dtype=int), np.zeros(len(fake), dtype=int)))


def _to_dataset(features, labels):
    """Wrap NumPy features/labels in a TensorDataset (features cast to float32)."""
    return TensorDataset(torch.tensor(features, dtype=torch.float32), torch.tensor(labels))


# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
train_ds = _to_dataset(X_train, y_train)
val_ds = _to_dataset(X_val, y_val)
# Only the training loader shuffles.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)


class SimpleDiscriminator(nn.Module):
    """Small fully connected network that maps a flat vector to one probability.

    The stack is Linear(input_dim, 512) -> ReLU -> Linear(512, 128) -> ReLU ->
    Linear(128, 1) -> Sigmoid, so every output lies in (0, 1).

    Args:
        input_dim: Width of the (already flattened) input vectors.
    """

    def __init__(self, input_dim):
        super().__init__()
        wide, narrow = 512, 128
        layers = [
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Linear(wide, narrow),
            nn.ReLU(),
            nn.Linear(narrow, 1),
            nn.Sigmoid(),
        ]
        # Keep the attribute name and layer order so state_dict keys stay the same.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of shape ``(batch, 1)`` for flat inputs ``x``."""
        probabilities = self.model(x)
        return probabilities

# The discriminator consumes flattened embeddings, so its input width is the
# product of every per-sample dimension. Using all trailing dimensions (rather
# than exactly shape[1] * shape[2]) keeps this in step with flatten_batch for
# embeddings of any rank.
input_dim = int(np.prod(real.shape[1:]))
model = SimpleDiscriminator(input_dim=input_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# BCELoss expects probabilities; the model ends with a Sigmoid layer.
loss_fn = nn.BCELoss()

# flatten input
def flatten_batch(x):
    """Collapse every dimension after the batch axis into a single one.

    Args:
        x: Tensor whose first dimension is the batch dimension.

    Returns:
        Tensor of shape ``(x.size(0), -1)`` holding the same elements.
    """
    # reshape (unlike view) also accepts non-contiguous tensors, e.g. the
    # result of a transpose/permute, copying only when it has to.
    return x.reshape(x.size(0), -1)


print("\n[Start training the discriminator]")
for epoch in range(10):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        # Labels become float column vectors to match the (batch, 1) model output.
        xb, yb = xb.to(device), yb.to(device).float().unsqueeze(1)
        pred = model(flatten_batch(xb))
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

    # ---- validation pass ----
    # The held-out split would otherwise never be looked at; report its loss
    # and accuracy so over-/under-fitting of the discriminator is visible.
    model.eval()
    val_loss_sum = 0.0
    val_correct = 0
    val_seen = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device).float().unsqueeze(1)
            pred = model(flatten_batch(xb))
            # Weight by batch size so a short final batch does not skew the mean.
            val_loss_sum += loss_fn(pred, yb).item() * xb.size(0)
            val_correct += ((pred >= 0.5).float() == yb).sum().item()
            val_seen += xb.size(0)
    if val_seen:
        print(f"Epoch {epoch+1}: Val Loss = {val_loss_sum / val_seen:.4f}, Val Acc = {val_correct / val_seen:.4f}")


print("\n[Scoring the generated data and selecting the most realistic samples]")
model.eval()
# Score in mini-batches so the whole generated set never has to sit on the
# device at once.
score_batch_size = 1024
score_chunks = []
with torch.no_grad():
    for start in range(0, len(fake), score_batch_size):
        chunk = torch.tensor(fake[start:start + score_batch_size], dtype=torch.float32).to(device)
        # reshape(-1) keeps a 1-D result even for a single sample, where a
        # bare squeeze() would collapse it to a 0-d scalar.
        score_chunks.append(model(flatten_batch(chunk)).reshape(-1).cpu())
scores = torch.cat(score_chunks).numpy()


# Keep the highest-scoring samples (the discriminator was trained with real = 1).
# Slicing from an explicit start index avoids the `[-0:]` pitfall, which would
# return every row when select_num is 0, and caps the count at what is available.
n_keep = min(select_num, len(scores))
top_idx = np.argsort(scores)[len(scores) - n_keep:]
selected_fake = fake[top_idx]


save_path = "/exp_data/sjx/star/gan_data/selected_fake_embeddings_posthoc.npy"
np.save(save_path, selected_fake)
# Report the number of rows actually written rather than the requested number.
print(f"\n[✓] Selected by saved discriminator scores {len(selected_fake)} Article generation data to the: {save_path}")
print("Save data shape:", selected_fake.shape)


In [None]:
# Diagnostic cell: report dtype and shape of the generated embeddings, then the
# dtype and in-memory size of their first 1948 rows.
fake = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")
print("raw data dtype:", fake.dtype)
print("original shape:", fake.shape)

selected_fake = fake[0:1948]
print("post-interception dtype:", selected_fake.dtype)
# 1024 ** 3 bytes per GiB; dividing once gives the same value as three divisions by 1024.
print("memory footprint（GB）:", selected_fake.nbytes / (1024 ** 3))


In [None]:
import numpy as np


# Reload the generated embeddings and report how many samples there are.
fake = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")
print("Number of raw generated samples：", len(fake))


# Keep the leading 1948 rows, in file order.
# NOTE(review): despite the "top1948" file name below, no scoring is applied
# here - confirm that taking the first rows (not the best-scored ones) is intended.
selected_fake = fake[0:1948]


# Persist the subset and echo where it went.
save_path = "/exp_data/sjx/star/gan_data/selected_fake_embeddings_top1948.npy"
np.save(save_path, selected_fake)
print(f"[✓] The first 1948 generated samples have been saved to. {save_path}")
print("Save data shape:", selected_fake.shape)


### Combining generated negative samples with real negative samples

In [None]:
import numpy as np


# Real negative training embeddings.
original_neg = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_train_embedding.npy")
print("Original negative sample shape:", original_neg.shape)


# Generated negatives saved earlier under the "top1948" file name.
gan_neg = np.load("/exp_data/sjx/star/gan_data/selected_fake_embeddings_top1948.npy")
print("GAN Generate negative samples shape:", gan_neg.shape)


# Append the generated rows after the real ones (sample axis is axis 0).
combined_neg = np.concatenate((original_neg, gan_neg))
print("Total number of negative samples after merging:", combined_neg.shape)


# Write the merged negative set to disk.
save_path = "/exp_data/sjx/star/gan_data/negative_train_all_combined.npy"
np.save(save_path, combined_neg)
print(f"[✓] The merge was saved successfully with the path: {save_path}")



In [None]:
import numpy as np
import os


# Inputs: positive training embeddings and the merged (real + generated) negatives.
pos_path = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
neg_path = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'
save_dir = '/exp_data/sjx/star/first_data/ESM-embedding/all_train_data'
os.makedirs(save_dir, exist_ok=True)


# Memory-map the inputs so they are only read when concatenated below.
pos = np.load(pos_path, mmap_mode='r')
neg = np.load(neg_path, mmap_mode='r')


# Positives first, then negatives; labels follow the same order (1 = positive, 0 = negative).
X = np.concatenate([pos, neg], axis=0)

y = np.concatenate([np.ones(len(pos), dtype=np.int64), np.zeros(len(neg), dtype=np.int64)], axis=0)


# Shuffle features and labels with one shared permutation. A seeded generator
# makes the saved dataset order reproducible across runs, matching the fixed
# seed used for the train/validation split earlier in this notebook.
shuffle_seed = 42
rng = np.random.default_rng(shuffle_seed)
idx = rng.permutation(len(X))
X = X[idx]
y = y[idx]


np.save(os.path.join(save_dir, 'all_train_data.npy'), X)
np.save(os.path.join(save_dir, 'all_train_labels.npy'), y)
print(f'[INFO] Merger completed, features shape: {X.shape}，tab shape: {y.shape}')
print(f'[INFO] Saved to: {save_dir}/all_train_data.npy, {save_dir}/all_train_labels.npy')