In [1]:
import numpy as np
import tmap as tm
import pandas as pd
from faerun import Faerun
from matplotlib import pyplot as plt
import dill
from core.models import AME
from core.utils import get_label, get_seg, Peptide
from functools import partial
from tqdm.auto import trange, tqdm

In [2]:
# Instantiate the AME model wrapper (imported from core.models) with its
# default arguments. The cell output below shows the argument dict that is
# printed when this runs. `ame.get_embedding` is used later in the notebook.
ame = AME()

Args: {'task': 'enrich_reg', 'sub_type': 'aa', 'data_name': 'del2_reg', 'label_name': 'target', 'peptide_name': 'peptide', 'methods': 'thioether', 'max_workers': 16, 'origin_data_dir': '../data/origin_data', 'graph_data_dir': '../data/graph_data', 'prediction_dir': '../prediction', 'in_feats': 40, 'max_evals': 30, 'loop': True, 'device': 'cuda', 'rgcn_hidden_feats': [64, 64, 64, 64], 'ffn_hidden_feats': 128, 'rgcn_drop_out': 0.05, 'ffn_drop_out': 0.05, 'lr': 0.001, 'mode': 'higher', 'metric_name': 'r2', 'classification': False, 'batch_size': 2048, 'patience': 40}


In [4]:
# Load the del2 regression table and derive the columns used further down.
# (The original note on this cell spoke of down-sampling del2 according to a
# normal distribution; no sampling is actually performed in this cell.)
df = pd.read_csv("../data/origin_data/del2_reg.csv")

# `enrich` is 10 raised to the stored `target` value, computed element-wise.
df["enrich"] = df["target"].apply(lambda log_value: 10**log_value)

# `seg` and `label` are both derived element-wise from `enrich` by the
# project helpers; `label` uses a fixed threshold of 140.
label_at_threshold = partial(get_label, threshold=140)
df["seg"] = df["enrich"].apply(get_seg)
df["label"] = df["enrich"].apply(label_at_threshold)

In [5]:
# One label string per row of `df`, in row order: the thioether SMILES first,
# followed by "__"-prefixed name / enrich / label fields.
labels = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    seq = row["peptide"]
    smi = Peptide(seq, methods="thioether").smiles
    label_parts = (
        str(smi),
        "__name: ",
        str(seq),
        f"__enrich: {row['enrich']}",
        # The "__label" field carries the row's `seg` value.
        f"__label: {row['seg']}",
    )
    labels.append("".join(label_parts))

  0%|          | 0/18429 [00:00<?, ?it/s]

In [6]:
# Optional sub-sampling, currently disabled so the full table is used.
# NOTE(review): `labels` has already been built from the full frame in the
# previous cell; if either line below is re-enabled, `labels` (and any
# embeddings computed from `df`) must be rebuilt to stay aligned with `df`.
# df = df.sample(frac=0.1, random_state=42)
# df = df.head(n=10000)
df.head()

Unnamed: 0,peptide,target,group,enrich,seg,label
0,HWYYVQHYGNLG,0.0,train,1.0,0,1
1,HLYYRTLYGLLG,2.860338,train,725.0,2,0
2,HVYYRHLYADLG,0.0,train,1.0,0,1
3,HKYYYMLYGHLG,1.278754,train,19.0,1,1
4,HYYYRSLYGTLG,2.498311,train,315.0,2,0


In [21]:
# Number of peptides sent to ame.get_embedding per call.
batch_size = 1024 * 5
# Width of one embedding vector. The slice assignment below requires the
# batches returned by ame.get_embedding to fit into rows of this width, so
# change this value if the model's embedding size differs.
embedding_size = 64

# Pre-allocate the (n_peptides, embedding_size) result matrix once instead of
# growing a list batch by batch.
peptides = df["peptide"]
Xs = np.zeros((len(peptides), embedding_size))

for i in range(0, len(peptides), batch_size):
    # .iloc makes the positional slicing explicit, so the batches line up with
    # the rows of Xs regardless of what index `df` carries.
    batch_seqs = peptides.iloc[i:i + batch_size]
    embeddings = ame.get_embedding(batch_seqs)
    Xs[i:i + batch_size] = embeddings

# Xs is already a 2-D ndarray (one row per peptide), so no conversion or extra
# copy is needed; X is simply the name the following cells use.
X = Xs

  0%|          | 0/5120 [00:00<?, ?it/s]



  0%|          | 0/5120 [00:00<?, ?it/s]

  0%|          | 0/5120 [00:00<?, ?it/s]

  0%|          | 0/3069 [00:00<?, ?it/s]

In [22]:
# Persist the embedding matrix X with dill so it can be read back from disk.
# Plain "wb" is sufficient: the handle is only written to, never read.
with open("../tmp/tmap-del2_reg.pkl", "wb") as f:
    dill.dump(X, f)

In [2]:
# Read the cached embedding matrix X back from disk.
# Opened read-only ("rb"): loading never writes, and "rb+" would fail on a
# file the user cannot write to.
# NOTE: dill.load, like pickle.load, can execute arbitrary code while
# deserializing; only load cache files that this notebook produced.
with open("../tmp/tmap-del2_reg.pkl", "rb") as f:
    X = dill.load(f)

In [8]:
def prepare_data(X, df):
    """Build the two vector collections derived from the embedding matrix.

    Args:
        X: NumPy array of embeddings; it is iterated row by row.
        df: Not used by this function; kept so existing calls keep working.

    Returns:
        A ``(fps, data)`` tuple. ``fps`` is a list holding one
        ``tm.VectorFloat`` per row of ``X`` after the array has been cast to
        ``int32``. ``data`` is the return value of
        ``enc.batch_from_weight_array`` applied to the uncast rows, each
        wrapped in a ``tm.VectorFloat``.

    Note:
        Reads the module-level ``enc`` encoder, so ``enc`` has to be defined
        before this function is called.
    """
    # Integer-cast copy of the embeddings, one VectorFloat per row.
    int_rows = X.astype(np.int32)
    fps = []
    for int_row in int_rows:
        fps.append(tm.VectorFloat(int_row.tolist()))

    # The original float rows are what gets handed to the encoder.
    weight_vectors = list(map(tm.VectorFloat, X))
    return fps, enc.batch_from_weight_array(weight_vectors)

def configure_layout(cfg):
    """Apply this notebook's layout settings to ``cfg`` in place.

    Args:
        cfg: Layout configuration object whose attributes are overwritten
            (a ``tm.LayoutConfiguration`` at the call site in this notebook).

    Returns:
        None. ``cfg`` is modified directly.
    """
    numeric_settings = (
        ("node_size", 1 / 20),
        ("mmm_repeats", 2),
        ("sl_extra_scaling_steps", 5),
        ("k", 36),
    )
    for attr_name, attr_value in numeric_settings:
        setattr(cfg, attr_name, attr_value)

    # Set last, in the same order as the plain assignments it replaces.
    cfg.sl_scaling_type = tm.RelativeToAvgLength

# Initialize encoders and structures.
# `enc` has to exist before prepare_data() runs, because that function reads
# it as a module-level name.
enc = tm.Minhash(64)
lf = tm.LSHForest(64, 64)

# The scatter below pairs embedding rows, data-frame rows and label strings
# purely by position, so fail early if they have drifted apart (for example
# when X comes from a cache built from a different version of `df`).
assert len(X) == len(df) == len(labels), (
    f"X has {len(X)} rows, df has {len(df)} rows, labels has {len(labels)} entries"
)

# Prepare data
fps, data = prepare_data(X, df)

# Update LSH Forest
lf.batch_add(data)
lf.index()

# Configure layout
cfg = tm.LayoutConfiguration()
configure_layout(cfg)
x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

# Adjust colormap.
# plt.get_cmap replaces plt.cm.get_cmap, which is deprecated (it triggers the
# warning echoed in this cell's output) and removed in newer Matplotlib.
tab_10 = plt.get_cmap("tab10")

# Create categories
type_labels, type_data = Faerun.create_categories(df["label"])
genera_labels, genera_data = Faerun.create_categories(df["seg"])

# Plotting: three colour series (label, seg, enrich) over the same points.
f = Faerun(view="front", coords=False, clear_color="#FFFFFF")
f.add_scatter(
    "np_atlas",
    {
        "x": x,
        "y": y,
        "c": [type_data, genera_data, df["enrich"].tolist()],
        "labels": labels,
    },
    shader="smoothCircle",
    point_scale=2.0,
    max_point_size=20,
    legend_labels=[type_labels, genera_labels],
    categorical=[True, True, False],
    colormap=[tab_10, tab_10, "rainbow"],
    # NOTE(review): the third series is df["enrich"], yet its title reads
    # "IC 50" - confirm whether this title is intended.
    series_title=["Label", "Seg", "IC 50"],
    has_legend=True,
)
# The tree edges (s -> t) are drawn on top of the "np_atlas" scatter points.
f.add_tree("peptide_atlas_tree", {"from": s, "to": t}, point_helper="np_atlas")
f.plot(template="smiles")

  tab_10 = plt.cm.get_cmap("tab10")
