In [4]:
import pandas as pd

# Step 1: Read the CSV file
df = pd.read_csv('thres new_20_results_model5.csv')

print(f"Total peptides loaded: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

# Step 2: Filter for prediction = 1 only
predicted_active = df[df['prediction'] == 1].copy()

print(f"\nPeptides with prediction = 1: {len(predicted_active)}")
print(f"Peptides with prediction = 0: {len(df[df['prediction'] == 0])}")

# Step 3: Display the filtered data
print("\nFiltered peptides (prediction = 1):")
print(predicted_active.head(10))

# Step 4: Save to Excel file
output_file = 'Model5_Predicted_Active_Peptides.xlsx'
predicted_active.to_excel(output_file, index=False, sheet_name='Active Peptides')

print(f"\n‚úÖ Excel file saved: {output_file}")
print(f"Total active peptides in Excel: {len(predicted_active)}")


Total peptides loaded: 3000
Columns: ['Sequence', 'probability', 'prediction']

Peptides with prediction = 1: 2541
Peptides with prediction = 0: 459

Filtered peptides (prediction = 1):
                Sequence  probability  prediction
2   ADYWRIAKELRRYIRKVGRI     0.994740           1
3   AECSGCICYWRRCRCCQVIK     0.948737           1
5   AGAEEKIRQKLKNEIKKGRK     0.995973           1
6   AGAKRIFNARRLKKIQEGKI     0.996008           1
7   AGANRLTKELLEYLRKFGKI     0.984361           1
8   AGANRLTKELLEYLRKFKKI     0.991034           1
9   AGANRLTWELLKEYLRKRKK     0.992804           1
10  AGANRLWLYLKEYLRKRGKK     0.993059           1
11  AGAQRIWKELRRYIRKVGRI     0.995904           1
12  AGAQRLKKKELYLRKRKGKI     0.997817           1

‚úÖ Excel file saved: Model5_Predicted_Active_Peptides.xlsx
Total active peptides in Excel: 2541


In [7]:
import pandas as pd
import numpy as np

print("=== STEP 2: Precision Filter with Models 4 & 6 ===")

# Step 1: Load Model 5 active peptides (EXCEL file)
df_model5 = pd.read_excel('Model5_Predicted_Active_Peptides.xlsx')
print(f"Model 5 actives loaded: {len(df_model5)} peptides")
print(f"Model 5 columns: {df_model5.columns.tolist()}")

# Step 2: Load Model 4 and Model 6 prediction files (CSV)
df_model4 = pd.read_csv('thres new_20_results_model4.csv')
df_model6 = pd.read_csv('thres new_20_results_model6.csv')

print(f"Model 4 total: {len(df_model4)} peptides")
print(f"Model 6 total: {len(df_model6)} peptides")

# Step 3: Merge all three datasets on 'Sequence' column
df_merged = df_model5.merge(
    df_model4[['Sequence', 'probability']], 
    left_on='Sequence', 
    right_on='Sequence', 
    suffixes=('', '_model4')
)

df_merged = df_merged.merge(
    df_model6[['Sequence', 'probability']], 
    left_on='Sequence', 
    right_on='Sequence', 
    suffixes=('', '_model6')
)

# Rename probability columns for clarity
df_merged = df_merged.rename(columns={
    'probability_model4': 'model4_prob',
    'probability_model6': 'model6_prob'
})

print(f"Successfully merged: {len(df_merged)} peptides (in all 3 models)")

# Step 4: Calculate Models 4+6 mean probability
df_merged['models46_mean'] = (df_merged['model4_prob'] + df_merged['model6_prob']) / 2

# Step 5: Filter where Models 4+6 mean > 0.70 (high confidence)
step2_final = df_merged[df_merged['models46_mean'] > 0.70].copy()
step2_final = step2_final.sort_values('models46_mean', ascending=False)

print(f"\n‚úÖ STEP 2 COMPLETE!")
print(f"High-confidence candidates: {len(step2_final)} peptides")
print(f"Avg Model 4 prob: {step2_final['model4_prob'].mean():.3f}")
print(f"Avg Model 6 prob: {step2_final['model6_prob'].mean():.3f}")
print(f"Avg Models 4+6 mean: {step2_final['models46_mean'].mean():.3f}")

# Step 6: Save results to Excel
output_columns = ['Sequence', 'prediction', 'probability', 'model4_prob', 'model6_prob', 'models46_mean']
step2_final[output_columns].to_excel('Step2_Models4_6_HighConf.xlsx', index=False)

print(f"\nüìä Excel saved: Step2_Models4_6_HighConf.xlsx")
print("\nTop 5 high-confidence candidates:")
print(step2_final[['Sequence', 'model4_prob', 'model6_prob', 'models46_mean']].head())

print("\nüéØ NEXT: Step 3 - Weighted Ensemble Ranking!")
print("Use: Step2_Models4_6_HighConf.xlsx")


=== STEP 2: Precision Filter with Models 4 & 6 ===
Model 5 actives loaded: 2541 peptides
Model 5 columns: ['Sequence', 'probability', 'prediction']
Model 4 total: 3000 peptides
Model 6 total: 3000 peptides
Successfully merged: 2541 peptides (in all 3 models)

‚úÖ STEP 2 COMPLETE!
High-confidence candidates: 1824 peptides
Avg Model 4 prob: 0.940
Avg Model 6 prob: 0.946
Avg Models 4+6 mean: 0.943

üìä Excel saved: Step2_Models4_6_HighConf.xlsx

Top 5 high-confidence candidates:
                  Sequence  model4_prob  model6_prob  models46_mean
1500  NRRQRWWKKLKKYIKKKWRK     0.998923     0.999912       0.999418
1787  RKGKYYLFKKYIKKWKWKRK     0.998816     0.999899       0.999358
1923  RWKKYIQLKKWYIKKKKGRK     0.998744     0.999873       0.999309
761   HRKRWKIWRFLNKKKAKKIK     0.998689     0.999845       0.999267
1012  KKYRYYMKLWKIKKKKKGRK     0.998622     0.999851       0.999236

üéØ NEXT: Step 3 - Weighted Ensemble Ranking!
Use: Step2_Models4_6_HighConf.xlsx


In [8]:
import pandas as pd
import numpy as np

print("=== STEP 3: Weighted Ensemble Ranking ===")

# Load Step 2 results
df_step2 = pd.read_excel('Step2_Models4_6_HighConf.xlsx')
print(f"Step 2 high-confidence: {len(df_step2)} peptides")

# Calculate weighted ensemble score
# Weights: 0.4√óModel4 + 0.4√óModel5 + 0.2√óModel6 (performance-based)
df_step2['ensemble_score'] = (
    0.4 * df_step2['model4_prob'] + 
    0.4 * df_step2['probability'] +  # Model 5 probability
    0.2 * df_step2['model6_prob']
)

# Filter ensemble_score > 0.65 and take top 200
step3_final = df_step2[df_step2['ensemble_score'] > 0.65].copy()
step3_final = step3_final.sort_values('ensemble_score', ascending=False).head(200)

print(f"\n‚úÖ STEP 3 COMPLETE!")
print(f"Top 200 ensemble candidates: {len(step3_final)} peptides")
print(f"Ensemble score range: {step3_final['ensemble_score'].min():.3f} - {step3_final['ensemble_score'].max():.3f}")
print(f"Avg ensemble score: {step3_final['ensemble_score'].mean():.3f}")

# Save Step 3 results
step3_final[['Sequence', 'probability', 'model4_prob', 'model6_prob', 
             'models46_mean', 'ensemble_score']].to_excel(
    'Step3_WeightedEnsemble_Top200.xlsx', index=False
)

print(f"\nüìä Excel saved: Step3_WeightedEnsemble_Top200.xlsx")
print("\nüèÜ TOP 5 ENSEMBLE CANDIDATES:")
print(step3_final[['Sequence', 'model4_prob', 'model6_prob', 'ensemble_score']].head())

print("\nüéØ NEXT: Step 4 - Synthesis Filters!")
print("Use: Step3_WeightedEnsemble_Top200.xlsx")


=== STEP 3: Weighted Ensemble Ranking ===
Step 2 high-confidence: 1824 peptides

‚úÖ STEP 3 COMPLETE!
Top 200 ensemble candidates: 200 peptides
Ensemble score range: 0.997 - 0.999
Avg ensemble score: 0.998

üìä Excel saved: Step3_WeightedEnsemble_Top200.xlsx

üèÜ TOP 5 ENSEMBLE CANDIDATES:
               Sequence  model4_prob  model6_prob  ensemble_score
0  NRRQRWWKKLKKYIKKKWRK     0.998923     0.999912        0.999241
2  RWKKYIQLKKWYIKKKKGRK     0.998744     0.999873        0.999217
1  RKGKYYLFKKYIKKWKWKRK     0.998816     0.999899        0.999195
3  HRKRWKIWRFLNKKKAKKIK     0.998689     0.999845        0.999157
4  KKYRYYMKLWKIKKKKKGRK     0.998622     0.999851        0.999145

üéØ NEXT: Step 4 - Synthesis Filters!
Use: Step3_WeightedEnsemble_Top200.xlsx


In [9]:
import pandas as pd
import numpy as np

print("=== CALCULATE PEPTIDE PROPERTIES ===")

# Load Step 3 results
df = pd.read_excel('Step3_WeightedEnsemble_Top200.xlsx')
print(f"Loaded: {len(df)} peptides")

# Amino acid hydrophobicity values for GRAVY (Kyte-Doolittle scale)
hydrophobicity = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
}

# Function to calculate NET CHARGE (simple count at neutral pH)
def calculate_net_charge(seq):
    """Net charge: +1 for R/K/H, -1 for D/E"""
    seq = seq.upper()
    positive = seq.count('R') + seq.count('K') + seq.count('H')
    negative = seq.count('D') + seq.count('E')
    return positive - negative

# Function to calculate GRAVY (Grand Average of Hydropathicity)
def calculate_gravy(seq):
    """GRAVY = average hydrophobicity per residue"""
    seq = seq.upper()
    n = len(seq)
    if n == 0:
        return 0
    gravy = sum(hydrophobicity.get(aa, 0) for aa in seq) / n
    return gravy

# Function to calculate HYDROPHOBIC MOMENT (MuH) - simplified Eisenberg
def calculate_hydrophobic_moment(seq):
    """Simplified hydrophobic moment for amphipathicity"""
    seq = seq.upper()
    n = len(seq)
    if n == 0:
        return 0
    
    # Angles (radians) for hydrophobic moment calculation
    angle = np.pi * 100 / 180  # 100¬∞ angle between hydrophobic/hydrophilic
    
    sum_x = sum_y = 0
    for i, aa in enumerate(seq):
        h = hydrophobicity.get(aa, 0)
        theta = i * angle
        sum_x += h * np.cos(theta)
        sum_y += h * np.sin(theta)
    
    muh = np.sqrt(sum_x**2 + sum_y**2) / n
    return muh

# Calculate ALL properties
print("Calculating properties...")
df['length'] = df['Sequence'].str.len()
df['net_charge'] = df['Sequence'].apply(calculate_net_charge)
df['gravy'] = df['Sequence'].apply(calculate_gravy)
df['hydrophobic_moment'] = df['Sequence'].apply(calculate_hydrophobic_moment)

# Display summary statistics
print("\nüìä PROPERTY SUMMARY:")
print(df[['net_charge', 'gravy', 'hydrophobic_moment', 'length']].describe())

print("\nüî¨ TOP 5 PEPTIDES WITH PROPERTIES:")
display_cols = ['Sequence', 'net_charge', 'gravy', 'hydrophobic_moment', 'length', 'ensemble_score']
print(df[display_cols].head())

# Save enhanced dataset with properties
output_file = 'Step3_Top200_WithProperties.xlsx'
df.to_excel(output_file, index=False)

print(f"\n‚úÖ SAVED: Step3_Top200_WithProperties.xlsx")
print(f"All 200 peptides now have: net_charge, gravy, hydrophobic_moment, length")

print("\nüéØ READY FOR STEP 4 SYNTHESIS FILTERS!")
print("Use: Step3_Top200_WithProperties.xlsx")


=== CALCULATE PEPTIDE PROPERTIES ===
Loaded: 200 peptides
Calculating properties...

üìä PROPERTY SUMMARY:
       net_charge       gravy  hydrophobic_moment  length
count  200.000000  200.000000          200.000000   200.0
mean     8.920000   -1.782850            0.780378    20.0
std      2.197212    0.625932            0.388285     0.0
min      5.000000   -3.385000            0.034811    20.0
25%      7.000000   -2.090000            0.474674    20.0
50%      9.000000   -1.860000            0.742403    20.0
75%     10.000000   -1.570000            1.026598    20.0
max     15.000000    0.475000            2.008551    20.0

üî¨ TOP 5 PEPTIDES WITH PROPERTIES:
               Sequence  net_charge  gravy  hydrophobic_moment  length  \
0  NRRQRWWKKLKKYIKKKWRK          12 -2.595            1.280620      20   
1  RWKKYIQLKKWYIKKKKGRK          11 -1.980            0.325065      20   
2  RKGKYYLFKKYIKKWKWKRK          11 -1.955            0.576620      20   
3  HRKRWKIWRFLNKKKAKKIK          12 

In [10]:
import pandas as pd
import numpy as np

print("=== CALCULATE PEPTIDE PROPERTIES (EISENBERG UPDATE) ===")

# Load Step 3 results
df = pd.read_excel('Step3_WeightedEnsemble_Top200.xlsx')
print(f"Loaded: {len(df)} peptides")

# 1. Kyte-Doolittle Scale (Best for GRAVY)
kd_scale = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
}

# 2. Eisenberg Consensus Scale (Standard for Hydrophobic Moment)
eisenberg_scale = {
    'A': 0.62, 'R': -1.80, 'N': -0.78, 'D': -0.90, 'C': 0.29,
    'Q': -0.85, 'E': -0.74, 'G': 0.48, 'H': -0.40, 'I': 1.38,
    'L': 1.06, 'K': -1.50, 'M': 0.64, 'F': 1.19, 'P': 0.12,
    'S': -0.18, 'T': -0.05, 'W': 0.81, 'Y': 0.26, 'V': 1.08
}

def calculate_net_charge(seq):
    """Net charge calculation at neutral pH"""
    seq = str(seq).upper()
    pos = seq.count('R') + seq.count('K') + seq.count('H')
    neg = seq.count('D') + seq.count('E')
    return pos - neg

def calculate_gravy(seq):
    """GRAVY using Kyte-Doolittle scale"""
    seq = str(seq).upper()
    n = len(seq)
    if n == 0: return 0
    return sum(kd_scale.get(aa, 0) for aa in seq) / n

def calculate_hydrophobic_moment(seq, angle_deg=100):
    """Hydrophobic moment using Eisenberg scale for alpha-helices"""
    seq = str(seq).upper()
    n = len(seq)
    if n == 0: return 0
    
    angle_rad = np.deg2rad(angle_deg)
    sum_x = 0
    sum_y = 0
    
    for i, aa in enumerate(seq):
        h = eisenberg_scale.get(aa, 0)
        # Standard vector sum for residue positioning
        theta = (i + 1) * angle_rad
        sum_x += h * np.cos(theta)
        sum_y += h * np.sin(theta)
    
    return np.sqrt(sum_x**2 + sum_y**2) / n

# Calculate properties
print("Calculating properties...")
df['length'] = df['Sequence'].str.len()
df['net_charge'] = df['Sequence'].apply(calculate_net_charge)
df['gravy'] = df['Sequence'].apply(calculate_gravy)
df['hydrophobic_moment'] = df['Sequence'].apply(calculate_hydrophobic_moment)

# Display summary statistics
print("\nüìä PROPERTY SUMMARY:")
print(df[['net_charge', 'gravy', 'hydrophobic_moment', 'length']].describe())

# Save enhanced dataset as zzzz
output_file = 'zzzz.xlsx'
df.to_excel(output_file, index=False)

print(f"\n‚úÖ SAVED: {output_file}")
print("Your Hydrophobic Moment is now correctly calculated using the Eisenberg Scale.")

=== CALCULATE PEPTIDE PROPERTIES (EISENBERG UPDATE) ===
Loaded: 200 peptides
Calculating properties...

üìä PROPERTY SUMMARY:
       net_charge       gravy  hydrophobic_moment  length
count  200.000000  200.000000          200.000000   200.0
mean     8.920000   -1.782850            0.264947    20.0
std      2.197212    0.625932            0.135420     0.0
min      5.000000   -3.385000            0.014614    20.0
25%      7.000000   -2.090000            0.155477    20.0
50%      9.000000   -1.860000            0.248870    20.0
75%     10.000000   -1.570000            0.337792    20.0
max     15.000000    0.475000            0.685381    20.0

‚úÖ SAVED: zzzz.xlsx
Your Hydrophobic Moment is now correctly calculated using the Eisenberg Scale.


In [11]:
import pandas as pd

# 1. Load the data
input_file = 'zzzz.xlsx'
df = pd.read_excel(input_file)
print(f"Total peptides loaded: {len(df)}")

# 2. Apply the Synthesis Filters
# Net Charge: +4 to +9
# GRAVY: 0 to -1.8 (Note: 0 is the upper bound, -1.8 is the lower bound)
# Hydrophobic Moment (muH): > 0.35
filtered_df = df[
    (df['net_charge'] >= 4) & (df['net_charge'] <= 9) &
    (df['gravy'] <= 0) & (df['gravy'] >= -1.8) &
    (df['hydrophobic_moment'] > 0.35)
].copy()

print(f"Peptides passing filters: {len(filtered_df)}")

# 3. Save to Excel
excel_output = 'memzzzz.xlsx'
filtered_df.to_excel(excel_output, index=False)
print(f"‚úÖ Excel saved: {excel_output}")

# 4. Save to FASTA
fasta_output = 'memzzzz.fasta'
with open(fasta_output, 'w') as f:
    for i, row in filtered_df.iterrows():
        # Using index or a 'Sequence_ID' if available
        seq_id = f"Peptide_{i}_Charge{row['net_charge']}_uH{row['hydrophobic_moment']:.2f}"
        sequence = row['Sequence']
        f.write(f">{seq_id}\n{sequence}\n")

print(f"‚úÖ FASTA saved: {fasta_output}")

# Display the filtered candidates
if not filtered_df.empty:
    print("\nüî¨ PREVIEW OF FILTERED PEPTIDES:")
    print(filtered_df[['Sequence', 'net_charge', 'gravy', 'hydrophobic_moment']].head())
else:
    print("\n‚ö†Ô∏è No peptides matched those exact criteria. Consider loosening the GRAVY or uH constraints.")

Total peptides loaded: 200
Peptides passing filters: 11
‚úÖ Excel saved: memzzzz.xlsx
‚úÖ FASTA saved: memzzzz.fasta

üî¨ PREVIEW OF FILTERED PEPTIDES:
                 Sequence  net_charge  gravy  hydrophobic_moment
51   NGWRKKLEKLKELYKWKKKI           7 -1.690            0.420112
68   WGRESIKKLKKTEIKKWKKI           7 -1.455            0.434979
70   HARQTRIWKYLKKEIKKGKR           9 -1.790            0.487475
133  KKYRDYYYTLPKKYIKWWIK           6 -1.460            0.406331
149  WLYKKYHYYFDKYIKKKGRW           7 -1.610            0.388529
