In [1]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Compare real embeddings against generated ones: pool each sample to one
# vector, visualize both sets with UMAP, then measure how well a linear
# classifier can tell them apart.

# Load the two embedding arrays; both are expected to be 3-D.
real_embeddings = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
generated_embeddings = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")

# Fail early with a clear message: on a 2-D input, mean(axis=1) would collapse
# the feature axis instead, and the error would only surface further down.
for _name, _arr in (("real", real_embeddings), ("generated", generated_embeddings)):
    if _arr.ndim != 3:
        raise ValueError(
            f"{_name} embeddings must be 3-D to be pooled over axis 1, got shape {_arr.shape}"
        )
if real_embeddings.shape[-1] != generated_embeddings.shape[-1]:
    raise ValueError(
        "real and generated embeddings have different feature sizes: "
        f"{real_embeddings.shape[-1]} vs {generated_embeddings.shape[-1]}"
    )

# Reduce each sample to a single vector by averaging over axis 1.
# NOTE(review): presumably axis 1 is the sequence/token axis; if the arrays are
# padded, the padding positions are included in this mean -- confirm.
real_embeddings_2d = real_embeddings.mean(axis=1)
generated_embeddings_2d = generated_embeddings.mean(axis=1)

print(f"real_embeddings_2d shape: {real_embeddings_2d.shape}")
print(f"generated_embeddings_2d shape: {generated_embeddings_2d.shape}")

# Stack both sets and build the labels: 0 = real, 1 = generated.
all_embeddings = np.vstack([real_embeddings_2d, generated_embeddings_2d])
labels = np.repeat([0, 1], [len(real_embeddings_2d), len(generated_embeddings_2d)])

# UMAP projection to 2-D for visualization (fixed random_state for repeatability).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
embedding_2d = reducer.fit_transform(all_embeddings)

fig, ax = plt.subplots(figsize=(8, 6))
for _label, _group in ((0, "Real"), (1, "Generated")):
    _mask = labels == _label
    ax.scatter(embedding_2d[_mask, 0], embedding_2d[_mask, 1], s=10, alpha=0.6, label=_group)
ax.legend()
ax.set_title("UMAP Visualization of Real vs Generated Protein Embeddings")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

# Use LogisticRegression to separate real from generated samples.
# With an integer cv and a classifier, scikit-learn uses stratified folds, so
# the block-ordered labels still put both classes in every fold.
# NOTE(review): features are not standardized before fitting -- confirm that
# this is intended.
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, all_embeddings, labels, cv=5)
print(f"Classification accuracy to distinguish real vs generated: {scores.mean():.3f} ± {scores.std():.3f}")

# Accuracy is only comparable to 0.5 when both sets have the same size, so also
# report what always predicting the larger class would score.
chance_level = np.bincount(labels).max() / len(labels)
print(f"Majority-class baseline (chance level): {chance_level:.3f}")



In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Compare real embeddings against the "260_e" generated ones: pool each sample
# to one vector, visualize both sets with UMAP, then measure how well a linear
# classifier can tell them apart.

# Load the two embedding arrays; both are expected to be 3-D.
real_embeddings = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
generated_embeddings = np.load("/exp_data/sjx/star/gan_data/260_e_fake_negative_embeddings.npy")

# Fail early with a clear message: on a 2-D input, mean(axis=1) would collapse
# the feature axis instead, and the error would only surface further down.
for _name, _arr in (("real", real_embeddings), ("generated", generated_embeddings)):
    if _arr.ndim != 3:
        raise ValueError(
            f"{_name} embeddings must be 3-D to be pooled over axis 1, got shape {_arr.shape}"
        )
if real_embeddings.shape[-1] != generated_embeddings.shape[-1]:
    raise ValueError(
        "real and generated embeddings have different feature sizes: "
        f"{real_embeddings.shape[-1]} vs {generated_embeddings.shape[-1]}"
    )

# Reduce each sample to a single vector by averaging over axis 1.
# NOTE(review): presumably axis 1 is the sequence/token axis; if the arrays are
# padded, the padding positions are included in this mean -- confirm.
real_embeddings_2d = real_embeddings.mean(axis=1)
generated_embeddings_2d = generated_embeddings.mean(axis=1)

print(f"real_embeddings_2d shape: {real_embeddings_2d.shape}")
print(f"generated_embeddings_2d shape: {generated_embeddings_2d.shape}")

# Stack both sets and build the labels: 0 = real, 1 = generated.
all_embeddings = np.vstack([real_embeddings_2d, generated_embeddings_2d])
labels = np.repeat([0, 1], [len(real_embeddings_2d), len(generated_embeddings_2d)])

# UMAP projection to 2-D for visualization (fixed random_state for repeatability).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
embedding_2d = reducer.fit_transform(all_embeddings)

fig, ax = plt.subplots(figsize=(8, 6))
for _label, _group in ((0, "Real"), (1, "Generated")):
    _mask = labels == _label
    ax.scatter(embedding_2d[_mask, 0], embedding_2d[_mask, 1], s=10, alpha=0.6, label=_group)
ax.legend()
ax.set_title("UMAP Visualization of Real vs Generated Protein Embeddings")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

# Use LogisticRegression to separate real from generated samples.
# With an integer cv and a classifier, scikit-learn uses stratified folds, so
# the block-ordered labels still put both classes in every fold.
# NOTE(review): features are not standardized before fitting -- confirm that
# this is intended.
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, all_embeddings, labels, cv=5)
print(f"Classification accuracy to distinguish real vs generated: {scores.mean():.3f} ± {scores.std():.3f}")

# Accuracy is only comparable to 0.5 when both sets have the same size, so also
# report what always predicting the larger class would score.
chance_level = np.bincount(labels).max() / len(labels)
print(f"Majority-class baseline (chance level): {chance_level:.3f}")