In [1]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import os
# --- Input / output locations -------------------------------------------
real_path = '/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy'
fake_path = '/exp_data/sjx/star/experiments/gan_anlysis/gan_data/fake_negative_embeddings_epoch100.npy'
save_dir = '/exp_data/sjx/star/experiments/gan_anlysis/picture_result/'

# Upper bound on the number of rows kept from each array; lower it to
# reduce memory use.
max_samples = 1000

# Make sure the figure output directory exists (no error if it already does).
os.makedirs(save_dir, exist_ok=True)

# Open both .npy files as read-only memory maps (mmap_mode='r') so the full
# arrays are not read into RAM at load time.
real_embeddings, generated_embeddings = (
    np.load(npy_file, mmap_mode='r') for npy_file in (real_path, fake_path)
)

def _subsample_rows(embeddings, limit, rng):
    """Return at most ``limit`` rows of ``embeddings``, drawn without replacement.

    Args:
        embeddings: Array-like indexed along axis 0 (here: the memory-mapped
            arrays loaded above).
        limit: Maximum number of rows to keep.
        rng: ``numpy.random.Generator`` used to draw the row indices.

    Returns:
        ``embeddings`` itself when it has ``limit`` rows or fewer; otherwise an
        array holding ``limit`` randomly chosen rows, in ascending row order.
    """
    n_rows = embeddings.shape[0]
    if n_rows <= limit:
        return embeddings
    # Sort the drawn indices so rows are read from the memory-mapped file in
    # ascending order instead of jumping back and forth.
    picked = np.sort(rng.choice(n_rows, size=limit, replace=False))
    return embeddings[picked]


# Seeded generator so the plotted subset is the same on every run; 42 matches
# the random_state passed to UMAP further down.
_subsample_rng = np.random.default_rng(42)
real_embeddings = _subsample_rows(real_embeddings, max_samples, _subsample_rng)
generated_embeddings = _subsample_rows(generated_embeddings, max_samples, _subsample_rng)

# Report the shapes as loaded (after any subsampling).
print("real_embeddings shape:", real_embeddings.shape)
print("generated_embeddings shape:", generated_embeddings.shape)

# Average each sample over axis 1 to get one vector per sample.
# NOTE(review): assumes the inputs are 3-D, e.g. (samples, positions, features);
# the array shapes are not visible here -- confirm.
real_embeddings_2d, generated_embeddings_2d = (
    batch.mean(axis=1) for batch in (real_embeddings, generated_embeddings)
)

print(f"real_embeddings_2d shape: {real_embeddings_2d.shape}")
print(f"generated_embeddings_2d shape: {generated_embeddings_2d.shape}")

# Stack real rows on top of generated rows, and build a matching label vector
# (0 = real, 1 = generated) in the same order.
n_real, n_generated = len(real_embeddings_2d), len(generated_embeddings_2d)
all_embeddings = np.vstack((real_embeddings_2d, generated_embeddings_2d))
labels = np.repeat([0, 1], [n_real, n_generated])

# Fit UMAP on the combined set; random_state is fixed so the layout is
# repeatable for the same input.
umap_settings = dict(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
reducer = umap.UMAP(**umap_settings)
embedding_2d = reducer.fit_transform(all_embeddings)

# Scatter the 2-D UMAP coordinates, one series per class, on a single axes.
fig, ax = plt.subplots(figsize=(8, 6))
for class_id, legend_name in ((0, "Real"), (1, "Fake 100")):
    members = labels == class_id  # boolean mask selecting this class's rows
    ax.scatter(
        embedding_2d[members, 0],
        embedding_2d[members, 1],
        s=10,
        alpha=0.6,
        label=legend_name,
    )
ax.legend()
ax.set_title("UMAP Visualization of Real vs Fake (Epoch 100)")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
fig.tight_layout()

# Write the figure to a PDF in save_dir, then display it.
pdf_path = os.path.join(save_dir, 'umap_real_vs_fake_epoch100.pdf')
fig.savefig(pdf_path, bbox_inches='tight')
plt.show()
print(f"UMAP 图已保存到: {pdf_path}")


In [1]:
import numpy as np
import os

# Path configuration
# Input: real negative training embeddings.
real_path = '/exp_data/sjx/star/first_data/ESM-embedding/negative_train_embedding.npy'

# Input: directory holding the generated embeddings, and the file names
# inside it that the loop below processes.
gan_dir = '/exp_data/sjx/star/experiments/gan_anlysis/data/exist_bilstm_data/'
gan_files = ['fc_only_fake_negative_embeddings.npy']

# Output: directory for the combined arrays; created if it does not exist.
save_dir = '/exp_data/sjx/star/experiments/gan_anlysis/data/Train_data/'
os.makedirs(save_dir, exist_ok=True)

In [2]:
# Open the real negative embeddings as a read-only memory map (mmap_mode='r'),
# so the data is not read into RAM until it is actually accessed.
real_neg = np.load(real_path, mmap_mode='r')
# Print the shape as a quick check before concatenating in the next cell.
print("real negative shape:", real_neg.shape)

In [3]:
# Number of generated rows appended to the real negatives.
# NOTE(review): 1948 is carried over unchanged from the original code; the
# reason for this exact count is not visible here -- confirm.
N_GAN_ROWS = 1948

# The output name is fixed and does not depend on the input file, so every
# iteration of the loop writes to this same path.
save_path = os.path.join(save_dir, 'all_negative_train_embeddings_epoch_fc.npy')
if len(gan_files) > 1:
    print(f"WARNING: {len(gan_files)} GAN files configured but only one output path "
          f"({save_path}); each file will overwrite the previous result.")

for gan_file in gan_files:
    gan_path = os.path.join(gan_dir, gan_file)
    # Slice the memory map first so only the rows that are used get read.
    gan_arr = np.load(gan_path, mmap_mode='r')[:N_GAN_ROWS]
    print(f"{gan_file} shape (used):", gan_arr.shape)
    if gan_arr.shape[0] < N_GAN_ROWS:
        # Slicing past the end is silent in NumPy; make the shortfall visible.
        print(f"WARNING: {gan_file} has only {gan_arr.shape[0]} rows, "
              f"fewer than the {N_GAN_ROWS} requested.")

    # Real rows first, generated rows after; this materialises the full
    # combined array in memory.
    all_neg = np.concatenate([real_neg, gan_arr], axis=0)
    print("拼接后 shape:", all_neg.shape)

    np.save(save_path, all_neg)
    print(f"已保存到: {save_path}")