In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# ===========================
# 1. Load Dataset
# ===========================
df = pd.read_excel("pos_charge_gravy.xlsx")  # Ensure column "Sequence" exists
sequences = df["Sequence"].astype(str).tolist()
existing_seqs = set(sequences)  # for fast lookup

# ===========================
# 2. Encode Amino Acids
# ===========================
amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # 20 standard aa
aa_to_int = {aa: i+1 for i, aa in enumerate(amino_acids)}  # 0 reserved for padding
int_to_aa = {i+1: aa for i, aa in enumerate(amino_acids)}

# Encode sequences as integers
encoded = [[aa_to_int[aa] for aa in seq if aa in aa_to_int] for seq in sequences]

# ===========================
# 3. Prepare Training Data (Fixed Window Length = 20)
# ===========================
max_length = 20
X, y = [], []

for seq in encoded:
    for i in range(1, len(seq)):
        X.append(seq[:i])
        y.append(seq[i])

X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=max_length, padding="post")
y = tf.keras.utils.to_categorical(y, num_classes=len(amino_acids)+1)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
print(f"Training samples: {X_train.shape}, Validation samples: {X_val.shape}")

# ===========================
# 4. Define LSTM Generative Model
# ===========================
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(amino_acids)+1, output_dim=64, input_length=max_length),
    tf.keras.layers.LSTM(128, return_sequences=False),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(len(amino_acids)+1, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
model.summary()

# ===========================
# 5. Train Model with Early Stopping
# ===========================
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# ===========================
# 6. Peptide Generation Functions
# ===========================
def sample_with_temperature(preds, temperature=1.0):
    """Sample next amino acid index with temperature scaling."""
    preds = np.asarray(preds).astype("float64")
    preds[0] = 0  # prevent padding token from being sampled
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)

def generate_peptide(max_len=20, temperature=1.0):
    first_residue = np.random.choice(list(aa_to_int.values()))
    seq = [first_residue]
    
    for _ in range(max_len - 1):
        padded = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=max_length, padding="post")
        pred = model.predict(padded, verbose=0)[0]
        next_aa = sample_with_temperature(pred, temperature)
        seq.append(next_aa)
        if next_aa == 0:  # stop if padding token sampled
            break
    
    return "".join(int_to_aa[i] for i in seq if i in int_to_aa)

# ===========================
# 7. Generate 3000 Unique Peptides
# ===========================
generated_sequences = set()  # use set to ensure uniqueness

while len(generated_sequences) < 3000:
    peptide = generate_peptide(max_len=20, temperature=1.0)
    if len(peptide) >= 10 and peptide not in existing_seqs:
        generated_sequences.add(peptide)

# Save to CSV
df_gen = pd.DataFrame({"sequence": sorted(generated_sequences)})
output_file = "generated_peptides_20.csv"
df_gen.to_csv(output_file, index=False)

print(f"Generated {len(generated_sequences)} unique peptides saved to '{output_file}'")

In [5]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import keras_tuner as kt
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import (
    roc_auc_score, roc_curve, precision_score, recall_score,
    accuracy_score, f1_score, confusion_matrix, precision_recall_curve, auc
)
import matplotlib.pyplot as plt
import joblib
import os

# ============================
# 1. Define datasets
# ============================
positive_files = [
    "pos_1 filtered_gravy.xlsx",
    "pos_9 filtered_charge.xlsx",
    "pos_filtered_charge_gravy.xlsx"
]

negative_files = [
    "neg1_charge_gravy.xlsx",
    "neg2_charge_gravy.xlsx"
]

# ============================
# 2. Encoding setup
# ============================
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = {aa: i+1 for i, aa in enumerate(amino_acids)}  # 0 = padding
num_tokens = len(amino_acids) + 1
max_length = 30

def encode_sequence(seq):
    return [aa_to_int[aa] for aa in seq if aa in aa_to_int]

def preprocess_sequences(sequences):
    encoded = [encode_sequence(seq) for seq in sequences]
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded,
                                                           maxlen=max_length,
                                                           padding="post")
    return padded

# ============================
# 3. Augmentation function
# ============================
similar_groups = [
    ['A','V','L','I'],   # aliphatic
    ['F','W','Y'],       # aromatic
    ['K','R','H'],       # positive
    ['D','E'],           # negative
    ['S','T','N','Q']    # polar
]

def mutate_seq(seq, p_mut=0.08):
    seq = list(seq)
    for i, aa in enumerate(seq):
        if random.random() < p_mut:
            for g in similar_groups:
                if aa in g:
                    choices = [x for x in g if x != aa]
                    if choices:
                        seq[i] = random.choice(choices)
                    break
    return "".join(seq)

# ============================
# 4. Attention Layer
# ============================
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                                 initializer="random_normal", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1],1),
                                 initializer="zeros", trainable=True)
        super().build(input_shape)

    def call(self, x):
        e = tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.backend.softmax(e, axis=1)
        return tf.keras.backend.sum(x * a, axis=1)

# ============================
# 5. Model builder
# ============================
def build_model(hp):
    inputs = tf.keras.layers.Input(shape=(max_length, num_tokens))
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            units=hp.Int("units", 64, 128, step=32),
            return_sequences=True,
            dropout=hp.Float("dropout", 0.2, 0.4, step=0.1),
            recurrent_dropout=0.2
        )
    )(inputs)
    x = Attention()(x)
    x = tf.keras.layers.Dropout(hp.Float("att_dropout",0.2,0.4,step=0.1))(x)
    x = tf.keras.layers.Dense(
        hp.Int("dense_units", 32, 64, step=32), activation="relu"
    )(x)
    x = tf.keras.layers.Dropout(hp.Float("dense_dropout",0.2,0.4,step=0.1))(x)
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(
            learning_rate=hp.Choice("lr",[0.001,0.0005])
        ),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model

# ============================
# 6. Training loop (all 6 classifiers)
# ============================
for pi, pos_file in enumerate(positive_files, start=1):
    for ni, neg_file in enumerate(negative_files, start=1):
        
        print(f"\n[INFO] Training classifier P{pi}+N{ni} ...")

        # --- Set unique seeds per classifier
        base_seed = 1000 + pi*10 + ni
        tf.random.set_seed(base_seed)
        np.random.seed(base_seed)
        random.seed(base_seed)

        # --- Load datasets
        df_pos = pd.read_excel(pos_file)
        df_neg = pd.read_excel(neg_file)

        df_pos["label"] = 1
        df_neg["label"] = 0

        df_all = pd.concat([df_pos[["Sequence","label"]],
                            df_neg[["Sequence","label"]]], ignore_index=True)

        # --- Train-test split with unique seed
        X = df_all["Sequence"].values
        y = df_all["label"].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=base_seed, shuffle=True
        )

        # --- Augment positives
        random.seed(base_seed)
        augmented_pos = []
        for seq in X_train[y_train==1]:
            for _ in range(2):
                augmented_pos.append(mutate_seq(seq))

        X_train_aug = np.concatenate([X_train, augmented_pos])
        y_train_aug = np.concatenate([y_train, np.ones(len(augmented_pos))])

        # --- Encode sequences
        X_train_enc = preprocess_sequences(X_train_aug)
        X_test_enc = preprocess_sequences(X_test)
        X_train_oh = tf.keras.utils.to_categorical(X_train_enc, num_classes=num_tokens)
        X_test_oh = tf.keras.utils.to_categorical(X_test_enc, num_classes=num_tokens)

        # --- Class weights
        cw = class_weight.compute_class_weight(
            "balanced", classes=np.unique(y_train_aug), y=y_train_aug
        )
        class_weights = dict(zip(np.unique(y_train_aug), cw))

        # --- Hyperparameter tuning
        tuner = kt.BayesianOptimization(
            build_model,
            objective="val_accuracy",
            max_trials=10,
            seed=base_seed,
            directory="tuner_results",
            project_name=f"classifier_P{pi}_N{ni}"
        )

        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=5, restore_best_weights=True
        )

        tuner.search(
            X_train_oh, y_train_aug,
            validation_split=0.2,
            epochs=30,
            batch_size=32,
            class_weight=class_weights,
            callbacks=[early_stop],
            verbose=1
        )

        # --- Retrieve best model
        best_model = tuner.get_best_models(num_models=1)[0]

        # --- Evaluate
        y_pred_probs = best_model.predict(X_test_oh).ravel()
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
        pr, rc, _ = precision_recall_curve(y_test, y_pred_probs)
        auc_score = roc_auc_score(y_test, y_pred_probs)
        pr_auc = auc(rc, pr)
        optimal_threshold = thresholds[np.argmax(tpr - fpr)]
        y_pred = (y_pred_probs >= optimal_threshold).astype(int)
        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()

        final_results = {
            "AUC": auc_score,
            "PR-AUC": pr_auc,
            "Optimal Threshold": float(optimal_threshold),
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred),
            "Sensitivity": tp / (tp + fn),
            "Specificity": tn / (tn + fp),
            "Confusion Matrix": cm.tolist()
        }

        # --- Save results & model
        base = f"new_model_P{pi}_N{ni}"
        best_model.save(base + ".h5")
        joblib.dump(optimal_threshold, base + "_threshold.pkl")
        pd.DataFrame([final_results]).to_csv(base + "_results.csv", index=False)

        # --- Save ROC curve
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC={auc_score:.3f}")
        plt.plot([0,1],[0,1],"k--")
        plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC Curve P{pi}+N{ni}")
        plt.legend()
        plt.savefig(base + "_roc.png", dpi=300)
        plt.close()

        # --- Save PR curve
        plt.figure()
        plt.plot(rc, pr, label=f"PR-AUC={pr_auc:.3f}")
        plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title(f"PR Curve P{pi}+N{ni}")
        plt.legend()
        plt.savefig(base + "_pr.png", dpi=300)
        plt.close()

        print(f"[DONE] Saved model and results for classifier P{pi}+N{ni}")


[INFO] Training classifier P1+N1 ...
Reloading Tuner from tuner_results\classifier_P1_N1\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m6/6[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 128ms/step




[DONE] Saved model and results for classifier P1+N1

[INFO] Training classifier P1+N2 ...
Reloading Tuner from tuner_results\classifier_P1_N2\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m4/4[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 214ms/step




[DONE] Saved model and results for classifier P1+N2

[INFO] Training classifier P2+N1 ...
Reloading Tuner from tuner_results\classifier_P2_N1\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m6/6[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 118ms/step




[DONE] Saved model and results for classifier P2+N1

[INFO] Training classifier P2+N2 ...
Reloading Tuner from tuner_results\classifier_P2_N2\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m5/5[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 177ms/step




[DONE] Saved model and results for classifier P2+N2

[INFO] Training classifier P3+N1 ...
Reloading Tuner from tuner_results\classifier_P3_N1\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m5/5[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 155ms/step




[DONE] Saved model and results for classifier P3+N1

[INFO] Training classifier P3+N2 ...
Reloading Tuner from tuner_results\classifier_P3_N2\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


[1m4/4[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 192ms/step




[DONE] Saved model and results for classifier P3+N2


In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

# ============================
# 1. Define Custom Attention Layer (must match training code!)
# ============================
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(
            name="att_weight",
            shape=(input_shape[-1], 1),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(input_shape[1], 1),
            initializer="zeros",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        e = tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.backend.softmax(e, axis=1)
        return tf.keras.backend.sum(x * a, axis=1)


# ============================
# 2. Optimal thresholds per model (STRICT)
# ============================
optimal_thresholds = {
    1: 0.8764217495918274,  # Model 1
    2: 0.7175973653793335,  # Model 2
    3: 0.8674263954162598,  # Model 3
    4: 0.8103358745574951,  # Model 4
    5: 0.36742377281188965,  # Model 5
    6: 0.7637473940849304,  # Model 6
}


# ============================
# 3. Load Generated Peptides
# ============================
gen_file = "generated_peptides_20.csv"
if not os.path.exists(gen_file):
    raise FileNotFoundError(f"Error: Input file '{gen_file}' not found.")

df_gen = pd.read_csv(gen_file)

# Normalize column names
df_gen.columns = df_gen.columns.str.strip()

if "Sequence" in df_gen.columns:
    seq_col = "Sequence"
elif "sequence" in df_gen.columns:
    seq_col = "sequence"
else:
    raise KeyError(
        "Error: The CSV must contain a column named 'Sequence' or 'sequence'."
    )

sequences = df_gen[seq_col].astype(str).str.strip().tolist()
print(f"Loaded {len(sequences)} generated peptides from '{gen_file}'.")



# ============================
# 4. Encoding and Preprocessing
# ============================
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = {aa: i + 1 for i, aa in enumerate(amino_acids)}  # 0 for padding
num_tokens = len(amino_acids) + 1
max_length = 30

def encode_sequence(seq):
    return [aa_to_int[aa] for aa in seq if aa in aa_to_int]

def preprocess_sequences(seq_list):
    encoded = [encode_sequence(seq) for seq in seq_list]
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        encoded,
        maxlen=max_length,
        padding="post",
        truncating="post",
    )
    one_hot = tf.keras.utils.to_categorical(
        padded, num_classes=num_tokens
    )
    return one_hot

print(f"Preprocessing sequences with max_length={max_length}...")
X_gen = preprocess_sequences(sequences)
print(f"Preprocessing complete. Data shape: {X_gen.shape}")


# ============================
# 5. Load Models and Classify
# ============================
model_files = [
    "new_model_P1_N1.h5",
    "new_model_P1_N2.h5",
    "new_model_P2_N1.h5",
    "new_model_P2_N2.h5",
    "new_model_P3_N1.h5",
    "new_model_P3_N2.h5",
]

results_summary = {}

print("\n--- Starting Classification ---")

for i, model_file in enumerate(model_files, start=1):
    if not os.path.exists(model_file):
        print(f"WARNING: '{model_file}' not found. Skipping.")
        continue

    if i not in optimal_thresholds:
        raise ValueError(f"No optimal threshold defined for Model {i}")

    threshold = optimal_thresholds[i]

    print(f"\nLoading Model {i} from '{model_file}'...")
    model = tf.keras.models.load_model(
        model_file, custom_objects={"Attention": Attention}
    )

    print(f"Predicting with Model {i} (threshold = {threshold})...")
    y_pred_probs = model.predict(X_gen, verbose=0).ravel()

    y_pred = (y_pred_probs >= threshold).astype(int)

    amp_count = int(y_pred.sum())
    total_count = len(y_pred)
    perc_amp = 100 * amp_count / total_count

    results_summary[f"Model {i}"] = {
        "Model File": model_file,
        "AMP %": perc_amp,
        "AMP Count": amp_count,
        "Total Peptides": total_count,
        "Threshold Used": threshold,
    }

    # Save per-model predictions
    df_out = df_gen.copy()
    df_out["probability"] = y_pred_probs
    df_out["prediction"] = y_pred

    per_model_file = f"thres new_20_results_model{i}.csv"
    df_out.to_csv(per_model_file, index=False)

    print(
        f"  -> Model {i}: {amp_count}/{total_count} "
        f"({perc_amp:.2f}%) classified as AMP"
    )
    print(f"  -> Results saved to '{per_model_file}'")


# ============================
# 6. Save Summary
# ============================
if results_summary:
    df_summary = (
        pd.DataFrame(results_summary)
        .T.sort_values("AMP %", ascending=False)
    )

    summary_file = "thres new_20_classification_summary.csv"
    df_summary.to_csv(summary_file, index=True)

    print(f"\nSummary saved to '{summary_file}'")
    print(df_summary)
else:
    print("\nNo models were processed. No summary created.")

Loaded 3000 generated peptides from 'generated_peptides_20.csv'.
Preprocessing sequences with max_length=30...
Preprocessing complete. Data shape: (3000, 30, 21)

--- Starting Classification ---

Loading Model 1 from 'new_model_P1_N1.h5'...




Predicting with Model 1 (threshold = 0.8764217495918274)...




  -> Model 1: 1717/3000 (57.23%) classified as AMP
  -> Results saved to 'thres new_20_results_model1.csv'

Loading Model 2 from 'new_model_P1_N2.h5'...
Predicting with Model 2 (threshold = 0.7175973653793335)...




  -> Model 2: 1848/3000 (61.60%) classified as AMP
  -> Results saved to 'thres new_20_results_model2.csv'

Loading Model 3 from 'new_model_P2_N1.h5'...
Predicting with Model 3 (threshold = 0.8674263954162598)...




  -> Model 3: 1984/3000 (66.13%) classified as AMP
  -> Results saved to 'thres new_20_results_model3.csv'

Loading Model 4 from 'new_model_P2_N2.h5'...
Predicting with Model 4 (threshold = 0.8103358745574951)...




  -> Model 4: 2019/3000 (67.30%) classified as AMP
  -> Results saved to 'thres new_20_results_model4.csv'

Loading Model 5 from 'new_model_P3_N1.h5'...
Predicting with Model 5 (threshold = 0.36742377281188965)...




  -> Model 5: 2541/3000 (84.70%) classified as AMP
  -> Results saved to 'thres new_20_results_model5.csv'

Loading Model 6 from 'new_model_P3_N2.h5'...
Predicting with Model 6 (threshold = 0.7637473940849304)...
  -> Model 6: 2016/3000 (67.20%) classified as AMP
  -> Results saved to 'thres new_20_results_model6.csv'

Summary saved to 'thres new_20_classification_summary.csv'
                 Model File      AMP % AMP Count Total Peptides Threshold Used
Model 5  new_model_P3_N1.h5       84.7      2541           3000       0.367424
Model 4  new_model_P2_N2.h5       67.3      2019           3000       0.810336
Model 6  new_model_P3_N2.h5       67.2      2016           3000       0.763747
Model 3  new_model_P2_N1.h5  66.133333      1984           3000       0.867426
Model 2  new_model_P1_N2.h5       61.6      1848           3000       0.717597
Model 1  new_model_P1_N1.h5  57.233333      1717           3000       0.876422


In [7]:
import pandas as pd

# Step 1: Read the CSV file
df = pd.read_csv('thres new_20_results_model5.csv')

print(f"Total peptides loaded: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

# Step 2: Filter for prediction = 1 only
predicted_active = df[df['prediction'] == 1].copy()

print(f"\nPeptides with prediction = 1: {len(predicted_active)}")
print(f"Peptides with prediction = 0: {len(df[df['prediction'] == 0])}")

# Step 3: Display the filtered data
print("\nFiltered peptides (prediction = 1):")
print(predicted_active.head(10))

# Step 4: Save to Excel file
output_file = 'Model5_Predicted_Active_Peptides.xlsx'
predicted_active.to_excel(output_file, index=False, sheet_name='Active Peptides')

print(f"\n‚úÖ Excel file saved: {output_file}")
print(f"Total active peptides in Excel: {len(predicted_active)}")

Total peptides loaded: 3000
Columns: ['Sequence', 'probability', 'prediction']

Peptides with prediction = 1: 2541
Peptides with prediction = 0: 459

Filtered peptides (prediction = 1):
                Sequence  probability  prediction
2   ADYWRIAKELRRYIRKVGRI     0.994740           1
3   AECSGCICYWRRCRCCQVIK     0.948737           1
5   AGAEEKIRQKLKNEIKKGRK     0.995973           1
6   AGAKRIFNARRLKKIQEGKI     0.996008           1
7   AGANRLTKELLEYLRKFGKI     0.984361           1
8   AGANRLTKELLEYLRKFKKI     0.991034           1
9   AGANRLTWELLKEYLRKRKK     0.992804           1
10  AGANRLWLYLKEYLRKRGKK     0.993059           1
11  AGAQRIWKELRRYIRKVGRI     0.995904           1
12  AGAQRLKKKELYLRKRKGKI     0.997817           1

‚úÖ Excel file saved: Model5_Predicted_Active_Peptides.xlsx
Total active peptides in Excel: 2541


In [8]:
import pandas as pd
import numpy as np

print("=== STEP 2: Precision Filter with Models 4 & 6 ===")

# Step 1: Load Model 5 active peptides (EXCEL file)
df_model5 = pd.read_excel('Model5_Predicted_Active_Peptides.xlsx')
print(f"Model 5 actives loaded: {len(df_model5)} peptides")
print(f"Model 5 columns: {df_model5.columns.tolist()}")

# Step 2: Load Model 4 and Model 6 prediction files (CSV)
df_model4 = pd.read_csv('thres new_20_results_model4.csv')
df_model6 = pd.read_csv('thres new_20_results_model6.csv')

print(f"Model 4 total: {len(df_model4)} peptides")
print(f"Model 6 total: {len(df_model6)} peptides")

# Step 3: Merge all three datasets on 'Sequence' column
df_merged = df_model5.merge(
    df_model4[['Sequence', 'probability']], 
    left_on='Sequence', 
    right_on='Sequence', 
    suffixes=('', '_model4')
)

df_merged = df_merged.merge(
    df_model6[['Sequence', 'probability']], 
    left_on='Sequence', 
    right_on='Sequence', 
    suffixes=('', '_model6')
)

# Rename probability columns for clarity
df_merged = df_merged.rename(columns={
    'probability_model4': 'model4_prob',
    'probability_model6': 'model6_prob'
})

print(f"Successfully merged: {len(df_merged)} peptides (in all 3 models)")

# Step 4: Calculate Models 4+6 mean probability
df_merged['models46_mean'] = (df_merged['model4_prob'] + df_merged['model6_prob']) / 2

# Step 5: Filter where Models 4+6 mean > 0.70 (high confidence)
step2_final = df_merged[df_merged['models46_mean'] > 0.70].copy()
step2_final = step2_final.sort_values('models46_mean', ascending=False)

print(f"\n‚úÖ STEP 2 COMPLETE!")
print(f"High-confidence candidates: {len(step2_final)} peptides")
print(f"Avg Model 4 prob: {step2_final['model4_prob'].mean():.3f}")
print(f"Avg Model 6 prob: {step2_final['model6_prob'].mean():.3f}")
print(f"Avg Models 4+6 mean: {step2_final['models46_mean'].mean():.3f}")

# Step 6: Save results to Excel
output_columns = ['Sequence', 'prediction', 'probability', 'model4_prob', 'model6_prob', 'models46_mean']
step2_final[output_columns].to_excel('Step2_Models4_6_HighConf.xlsx', index=False)

print(f"\nüìä Excel saved: Step2_Models4_6_HighConf.xlsx")
print("\nTop 5 high-confidence candidates:")
print(step2_final[['Sequence', 'model4_prob', 'model6_prob', 'models46_mean']].head())

print("\nüéØ NEXT: Step 3 - Weighted Ensemble Ranking!")
print("Use: Step2_Models4_6_HighConf.xlsx")


=== STEP 2: Precision Filter with Models 4 & 6 ===
Model 5 actives loaded: 2541 peptides
Model 5 columns: ['Sequence', 'probability', 'prediction']
Model 4 total: 3000 peptides
Model 6 total: 3000 peptides
Successfully merged: 2541 peptides (in all 3 models)

‚úÖ STEP 2 COMPLETE!
High-confidence candidates: 2049 peptides
Avg Model 4 prob: 0.950
Avg Model 6 prob: 0.937
Avg Models 4+6 mean: 0.944

üìä Excel saved: Step2_Models4_6_HighConf.xlsx

Top 5 high-confidence candidates:
                  Sequence  model4_prob  model6_prob  models46_mean
1787  RKGKYYLFKKYIKKWKWKRK     0.999676     0.999433       0.999555
1500  NRRQRWWKKLKKYIKKKWRK     0.999604     0.999498       0.999551
1923  RWKKYIQLKKWYIKKKKGRK     0.999637     0.999414       0.999525
2537  YWKKYQYLKKWWWEKKKGRK     0.999556     0.999482       0.999519
1012  KKYRYYMKLWKIKKKKKGRK     0.999638     0.999361       0.999499

üéØ NEXT: Step 3 - Weighted Ensemble Ranking!
Use: Step2_Models4_6_HighConf.xlsx


In [9]:
import pandas as pd
import numpy as np

print("=== STEP 3: Weighted Ensemble Ranking ===")

# Load Step 2 results
df_step2 = pd.read_excel('Step2_Models4_6_HighConf.xlsx')
print(f"Step 2 high-confidence: {len(df_step2)} peptides")

# Calculate weighted ensemble score
# Weights: 0.4√óModel4 + 0.4√óModel5 + 0.2√óModel6 (performance-based)
df_step2['ensemble_score'] = (
    0.4 * df_step2['model4_prob'] + 
    0.4 * df_step2['probability'] +  # Model 5 probability
    0.2 * df_step2['model6_prob']
)

# Filter ensemble_score > 0.65 and take top 200
step3_final = df_step2[df_step2['ensemble_score'] > 0.65].copy()
step3_final = step3_final.sort_values('ensemble_score', ascending=False).head(200)

print(f"\n‚úÖ STEP 3 COMPLETE!")
print(f"Top 200 ensemble candidates: {len(step3_final)} peptides")
print(f"Ensemble score range: {step3_final['ensemble_score'].min():.3f} - {step3_final['ensemble_score'].max():.3f}")
print(f"Avg ensemble score: {step3_final['ensemble_score'].mean():.3f}")

# Save Step 3 results
step3_final[['Sequence', 'probability', 'model4_prob', 'model6_prob', 
             'models46_mean', 'ensemble_score']].to_excel(
    'Step3_WeightedEnsemble_Top200.xlsx', index=False
)

print(f"\nüìä Excel saved: Step3_WeightedEnsemble_Top200.xlsx")
print("\nüèÜ TOP 5 ENSEMBLE CANDIDATES:")
print(step3_final[['Sequence', 'model4_prob', 'model6_prob', 'ensemble_score']].head())

print("\nüéØ NEXT: Step 4 - Synthesis Filters!")
print("Use: Step3_WeightedEnsemble_Top200.xlsx")

=== STEP 3: Weighted Ensemble Ranking ===
Step 2 high-confidence: 2049 peptides

‚úÖ STEP 3 COMPLETE!
Top 200 ensemble candidates: 200 peptides
Ensemble score range: 0.998 - 0.999
Avg ensemble score: 0.999

üìä Excel saved: Step3_WeightedEnsemble_Top200.xlsx

üèÜ TOP 5 ENSEMBLE CANDIDATES:
               Sequence  model4_prob  model6_prob  ensemble_score
2  RWKKYIQLKKWYIKKKKGRK     0.999637     0.999414        0.999483
4  KKYRYYMKLWKIKKKKKGRK     0.999638     0.999361        0.999453
0  RKGKYYLFKKYIKKWKWKRK     0.999676     0.999433        0.999446
1  NRRQRWWKKLKKYIKKKWRK     0.999604     0.999498        0.999431
9  HRKRWKIWRFLNKKKAKKIK     0.999626     0.999060        0.999375

üéØ NEXT: Step 4 - Synthesis Filters!
Use: Step3_WeightedEnsemble_Top200.xlsx


In [10]:
import pandas as pd
import numpy as np

print("=== CALCULATE PEPTIDE PROPERTIES ===")

# Load Step 3 results
df = pd.read_excel('Step3_WeightedEnsemble_Top200.xlsx')
print(f"Loaded: {len(df)} peptides")

# Amino acid hydrophobicity values for GRAVY (Kyte-Doolittle scale)
hydrophobicity = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
}

# Function to calculate NET CHARGE (simple count at neutral pH)
def calculate_net_charge(seq):
    """Net charge: +1 for R/K/H, -1 for D/E"""
    seq = seq.upper()
    positive = seq.count('R') + seq.count('K') + seq.count('H')
    negative = seq.count('D') + seq.count('E')
    return positive - negative

# Function to calculate GRAVY (Grand Average of Hydropathicity)
def calculate_gravy(seq):
    """GRAVY = average hydrophobicity per residue"""
    seq = seq.upper()
    n = len(seq)
    if n == 0:
        return 0
    gravy = sum(hydrophobicity.get(aa, 0) for aa in seq) / n
    return gravy

# Function to calculate HYDROPHOBIC MOMENT (MuH) - simplified Eisenberg
def calculate_hydrophobic_moment(seq):
    """Simplified hydrophobic moment for amphipathicity"""
    seq = seq.upper()
    n = len(seq)
    if n == 0:
        return 0
    
    # Angles (radians) for hydrophobic moment calculation
    angle = np.pi * 100 / 180  # 100¬∞ angle between hydrophobic/hydrophilic
    
    sum_x = sum_y = 0
    for i, aa in enumerate(seq):
        h = hydrophobicity.get(aa, 0)
        theta = i * angle
        sum_x += h * np.cos(theta)
        sum_y += h * np.sin(theta)
    
    muh = np.sqrt(sum_x**2 + sum_y**2) / n
    return muh

# Calculate ALL properties
print("Calculating properties...")
df['length'] = df['Sequence'].str.len()
df['net_charge'] = df['Sequence'].apply(calculate_net_charge)
df['gravy'] = df['Sequence'].apply(calculate_gravy)
df['hydrophobic_moment'] = df['Sequence'].apply(calculate_hydrophobic_moment)

# Display summary statistics
print("\nüìä PROPERTY SUMMARY:")
print(df[['net_charge', 'gravy', 'hydrophobic_moment', 'length']].describe())

print("\nüî¨ TOP 5 PEPTIDES WITH PROPERTIES:")
display_cols = ['Sequence', 'net_charge', 'gravy', 'hydrophobic_moment', 'length', 'ensemble_score']
print(df[display_cols].head())

# Save enhanced dataset with properties
output_file = 'Step3_Top200_WithProperties.xlsx'
df.to_excel(output_file, index=False)

print(f"\n‚úÖ SAVED: Step3_Top200_WithProperties.xlsx")
print(f"All 200 peptides now have: net_charge, gravy, hydrophobic_moment, length")

print("\nüéØ READY FOR STEP 4 SYNTHESIS FILTERS!")
print("Use: Step3_Top200_WithProperties.xlsx")

=== CALCULATE PEPTIDE PROPERTIES ===
Loaded: 200 peptides
Calculating properties...

üìä PROPERTY SUMMARY:
       net_charge       gravy  hydrophobic_moment  length
count  200.000000  200.000000          200.000000   200.0
mean     8.565000   -1.732700            0.807598    20.0
std      2.089884    0.608548            0.383664     0.0
min      5.000000   -3.385000            0.034811    20.0
25%      7.000000   -2.040000            0.512857    20.0
50%      8.000000   -1.800000            0.782574    20.0
75%     10.000000   -1.561250            1.068653    20.0
max     15.000000    0.475000            1.958485    20.0

üî¨ TOP 5 PEPTIDES WITH PROPERTIES:
               Sequence  net_charge  gravy  hydrophobic_moment  length  \
0  RWKKYIQLKKWYIKKKKGRK          11 -1.980            0.325065      20   
1  KKYRYYMKLWKIKKKKKGRK          12 -2.150            0.200132      20   
2  RKGKYYLFKKYIKKWKWKRK          11 -1.955            0.576620      20   
3  NRRQRWWKKLKKYIKKKWRK          12 

In [11]:
import pandas as pd
import numpy as np

print("=== CALCULATE PEPTIDE PROPERTIES (EISENBERG UPDATE) ===")

# Load Step 3 results
df = pd.read_excel('Step3_WeightedEnsemble_Top200.xlsx')
print(f"Loaded: {len(df)} peptides")

# 1. Kyte-Doolittle Scale (Best for GRAVY)
kd_scale = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
}

# 2. Eisenberg Consensus Scale (Standard for Hydrophobic Moment)
eisenberg_scale = {
    'A': 0.62, 'R': -1.80, 'N': -0.78, 'D': -0.90, 'C': 0.29,
    'Q': -0.85, 'E': -0.74, 'G': 0.48, 'H': -0.40, 'I': 1.38,
    'L': 1.06, 'K': -1.50, 'M': 0.64, 'F': 1.19, 'P': 0.12,
    'S': -0.18, 'T': -0.05, 'W': 0.81, 'Y': 0.26, 'V': 1.08
}

def calculate_net_charge(seq):
    """Net charge calculation at neutral pH"""
    seq = str(seq).upper()
    pos = seq.count('R') + seq.count('K') + seq.count('H')
    neg = seq.count('D') + seq.count('E')
    return pos - neg

def calculate_gravy(seq):
    """GRAVY using Kyte-Doolittle scale"""
    seq = str(seq).upper()
    n = len(seq)
    if n == 0: return 0
    return sum(kd_scale.get(aa, 0) for aa in seq) / n

def calculate_hydrophobic_moment(seq, angle_deg=100):
    """Hydrophobic moment using Eisenberg scale for alpha-helices"""
    seq = str(seq).upper()
    n = len(seq)
    if n == 0: return 0
    
    angle_rad = np.deg2rad(angle_deg)
    sum_x = 0
    sum_y = 0
    
    for i, aa in enumerate(seq):
        h = eisenberg_scale.get(aa, 0)
        # Standard vector sum for residue positioning
        theta = (i + 1) * angle_rad
        sum_x += h * np.cos(theta)
        sum_y += h * np.sin(theta)
    
    return np.sqrt(sum_x**2 + sum_y**2) / n

# Calculate properties
print("Calculating properties...")
df['length'] = df['Sequence'].str.len()
df['net_charge'] = df['Sequence'].apply(calculate_net_charge)
df['gravy'] = df['Sequence'].apply(calculate_gravy)
df['hydrophobic_moment'] = df['Sequence'].apply(calculate_hydrophobic_moment)

# Display summary statistics
print("\nüìä PROPERTY SUMMARY:")
print(df[['net_charge', 'gravy', 'hydrophobic_moment', 'length']].describe())

# Save enhanced dataset as zzzz
output_file = 'zzzz.xlsx'
df.to_excel(output_file, index=False)

print(f"\n‚úÖ SAVED: {output_file}")
print("Your Hydrophobic Moment is now correctly calculated using the Eisenberg Scale.")

=== CALCULATE PEPTIDE PROPERTIES (EISENBERG UPDATE) ===
Loaded: 200 peptides
Calculating properties...

üìä PROPERTY SUMMARY:
       net_charge       gravy  hydrophobic_moment  length
count  200.000000  200.000000          200.000000   200.0
mean     8.565000   -1.732700            0.270899    20.0
std      2.089884    0.608548            0.134275     0.0
min      5.000000   -3.385000            0.016801    20.0
25%      7.000000   -2.040000            0.164660    20.0
50%      8.000000   -1.800000            0.251374    20.0
75%     10.000000   -1.561250            0.338049    20.0
max     15.000000    0.475000            0.685381    20.0

‚úÖ SAVED: zzzz.xlsx
Your Hydrophobic Moment is now correctly calculated using the Eisenberg Scale.


In [12]:
import pandas as pd

# 1. Load the data
input_file = 'zzzz.xlsx'
df = pd.read_excel(input_file)
print(f"Total peptides loaded: {len(df)}")

# 2. Apply the Synthesis Filters
# Net Charge: +4 to +9
# GRAVY: 0 to -1.8 (Note: 0 is the upper bound, -1.8 is the lower bound)
# Hydrophobic Moment (muH): > 0.35
filtered_df = df[
    (df['net_charge'] >= 4) & (df['net_charge'] <= 9) &
    (df['gravy'] <= 0) & (df['gravy'] >= -1.8) &
    (df['hydrophobic_moment'] > 0.35)
].copy()

print(f"Peptides passing filters: {len(filtered_df)}")

# 3. Save to Excel
excel_output = 'memzzzz.xlsx'
filtered_df.to_excel(excel_output, index=False)
print(f"‚úÖ Excel saved: {excel_output}")

# 4. Save to FASTA
fasta_output = 'memzzzz.fasta'
with open(fasta_output, 'w') as f:
    for i, row in filtered_df.iterrows():
        # Using index or a 'Sequence_ID' if available
        seq_id = f"Peptide_{i}_Charge{row['net_charge']}_uH{row['hydrophobic_moment']:.2f}"
        sequence = row['Sequence']
        f.write(f">{seq_id}\n{sequence}\n")

print(f"‚úÖ FASTA saved: {fasta_output}")

# Display the filtered candidates
if not filtered_df.empty:
    print("\nüî¨ PREVIEW OF FILTERED PEPTIDES:")
    print(filtered_df[['Sequence', 'net_charge', 'gravy', 'hydrophobic_moment']].head())
else:
    print("\n‚ö†Ô∏è No peptides matched those exact criteria. Consider loosening the GRAVY or uH constraints.")

Total peptides loaded: 200
Peptides passing filters: 11
‚úÖ Excel saved: memzzzz.xlsx
‚úÖ FASTA saved: memzzzz.fasta

üî¨ PREVIEW OF FILTERED PEPTIDES:
                 Sequence  net_charge  gravy  hydrophobic_moment
63   WGRESIKKLKKTEIKKWKKI           7 -1.455            0.434979
73   NGWRKKLEKLKELYKWKKKI           7 -1.690            0.420112
76   HARQTRIWKYLKKEIKKGKR           9 -1.790            0.487475
113  KKYRDYYYTLPKKYIKWWIK           6 -1.460            0.406331
126  QKKALYIRQYLKNEIKKGRK           7 -1.550            0.428125


In [14]:
"""
SIMPLE Ensemble Mean Probability - Top 50 Peptides ONLY
3-column format: Sequence,probability,prediction
NO model-specific filters - Pure mean probability ranking
"""

import pandas as pd
import numpy as np

# ============================
# 1. Load ALL 6 Model Files
# ============================
model_files = [
    'thres new_20_results_model1.csv',
    'thres new_20_results_model2.csv',
    'thres new_20_results_model3.csv',
    'thres new_20_results_model4.csv',
    'thres new_20_results_model5.csv',
    'thres new_20_results_model6.csv'
]

print("üî¨ Loading 6 model prediction files...")
dfs = []
for i, file in enumerate(model_files, 1):
    df = pd.read_csv(file)
    df.columns = ['Sequence', 'probability', 'prediction']  # Force 3-column format
    df['model_num'] = i
    dfs.append(df[['Sequence', 'probability']])
    print(f"  Model {i}: {len(df)} peptides")

# ============================
# 2. Merge on Sequence
# ============================
print("\nüîó Merging all models...")
df_all = dfs[0].copy()
df_all.columns = ['Sequence', 'model1_prob']

for i, df in enumerate(dfs[1:], 2):
    df_temp = df.copy()
    df_temp.columns = ['Sequence', f'model{i}_prob']
    df_all = df_all.merge(df_temp, on='Sequence', how='inner')

print(f"‚úÖ Merged: {len(df_all)} peptides in ALL models")

# ============================
# 3. Calculate MEAN Probability (ALL 6 MODELS)
# ============================
model_prob_cols = ['model1_prob', 'model2_prob', 'model3_prob', 
                  'model4_prob', 'model5_prob', 'model6_prob']

df_all['ensemble_mean_prob'] = df_all[model_prob_cols].mean(axis=1)

print(f"\nüìä Ensemble Mean Stats:")
print(f"Overall mean:  {df_all['ensemble_mean_prob'].mean():.3f}")
print(f"Max mean:      {df_all['ensemble_mean_prob'].max():.3f}")

# ============================
# 4. TOP 50 by Mean Probability
# ============================
top_50 = df_all.nlargest(50, 'ensemble_mean_prob')[['Sequence', 'ensemble_mean_prob'] + model_prob_cols]
top_50 = top_50.sort_values('ensemble_mean_prob', ascending=False).reset_index(drop=True)

print(f"\nü•á TOP 50 PEPTIDES SELECTED!")
print(f"#1 mean prob:  {top_50['ensemble_mean_prob'].iloc[0]:.3f}")
print(f"#50 mean prob: {top_50['ensemble_mean_prob'].iloc[49]:.3f}")

# ============================
# 5. Save Results
# ============================
top_50.to_excel('TOP_50_ENSEMBLE_MEAN_PEPTIDES.xlsx', index=False)
top_50.to_csv('TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv', index=False)



print(f"\nüíæ SAVED:")
print(f"‚úÖ TOP_50_ENSEMBLE_MEAN_PEPTIDES.xlsx")
print(f"‚úÖ TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv")



üî¨ Loading 6 model prediction files...
  Model 1: 3000 peptides
  Model 2: 3000 peptides
  Model 3: 3000 peptides
  Model 4: 3000 peptides
  Model 5: 3000 peptides
  Model 6: 3000 peptides

üîó Merging all models...
‚úÖ Merged: 3000 peptides in ALL models

üìä Ensemble Mean Stats:
Overall mean:  0.745
Max mean:      0.998

ü•á TOP 50 PEPTIDES SELECTED!
#1 mean prob:  0.998
#50 mean prob: 0.997

üíæ SAVED:
‚úÖ TOP_50_ENSEMBLE_MEAN_PEPTIDES.xlsx
‚úÖ TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv


In [15]:
"""
Convert TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv ‚Üí FASTA format
Simple: Sequence column ‚Üí >Peptide_1, >Peptide_2, etc.
"""

import pandas as pd

# ============================
# Load Top 50 Results
# ============================
print("üî¨ Loading TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv...")
df = pd.read_csv('TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv')

print(f"Loaded: {len(df)} peptides")
print(f"Columns: {df.columns.tolist()}")

# ============================
# Convert to FASTA
# ============================
fasta_content = []
for i, row in df.iterrows():
    seq_name = f"Peptide_{i+1}_meanProb_{row['ensemble_mean_prob']:.3f}"
    sequence = row['Sequence']
    fasta_content.append(f">{seq_name}")
    fasta_content.append(sequence)

# ============================
# Save FASTA
# ============================
with open('TOP_50_ENSEMBLE_MEAN_PEPTIDES.fasta', 'w') as f:
    f.write('\n'.join(fasta_content))

print(f"\n‚úÖ SAVED: TOP_50_ENSEMBLE_MEAN_PEPTIDES.fasta")
print(f"  Contains {len(df)} peptides")
print(f"\nPreview:")
print(">" + fasta_content[0])
print(fasta_content[1][:60] + "...")

# ============================
# Also save numbered version (synthesis-friendly)
# ============================
with open('TOP_50_PEPTIDES_NUMBERED.fasta', 'w') as f:
    for i, row in df.iterrows():
        f.write(f">{i+1}\n")
        f.write(f"{row['Sequence']}\n")

print(f"‚úÖ SAVED: TOP_50_PEPTIDES_NUMBERED.fasta")
print("\nüéØ READY FOR SYNTHESIS ORDERS!")


üî¨ Loading TOP_50_ENSEMBLE_MEAN_PEPTIDES.csv...
Loaded: 50 peptides
Columns: ['Sequence', 'ensemble_mean_prob', 'model1_prob', 'model2_prob', 'model3_prob', 'model4_prob', 'model5_prob', 'model6_prob']

‚úÖ SAVED: TOP_50_ENSEMBLE_MEAN_PEPTIDES.fasta
  Contains 50 peptides

Preview:
>>Peptide_1_meanProb_0.998
NRRQRWWKKLKKYIKKKWRK...
‚úÖ SAVED: TOP_50_PEPTIDES_NUMBERED.fasta

üéØ READY FOR SYNTHESIS ORDERS!
