#File Preparation Script

In [None]:
# The first chunk of code takes three CSV files generated via Biopython and Pfeature and retains only the features used to train the XGB-BPred model.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load all CSV files
file_A = pd.read_csv("Bcell_Biopython.csv")  # Features from Biopython
file_B = pd.read_csv("Bcell_comp.csv")  # Composition features from Pfeature
file_C = pd.read_csv("Bcell_bin.csv")  # Binary features from Pfeature
file_D = pd.read_csv("test_B_data.csv")  # Test data given in the GitHub repository

# Step 1: Merge A, B, and C into E by stacking their rows.
# NOTE(review): a row-wise concat leaves NaN wherever one file lacks a column
# that another has. If the three files hold different feature sets for the
# same epitopes, a column-wise merge (axis=1) may be what is intended -- confirm.
E = pd.concat([file_A, file_B, file_C], ignore_index=True)

# Steps 2-4: Align E with the reference layout of D.
# reindex() performs all three alignment steps at once: it drops columns that
# D does not have, adds the columns E is missing (filled with zeros), and puts
# everything in D's column order. It returns a new frame, which avoids the
# SettingWithCopyWarning raised by assigning new columns to a column-sliced E.
columns_in_D = file_D.columns
E = E.reindex(columns=columns_in_D, fill_value=0)
E_final = E

# Step 5: Save the aligned table
E_final.to_csv("Bcell_merged.csv", index=False)

print("Bcell_merged.csv has been successfully created with the exact columns from test_B_data.csv")


In [None]:
# The second chunk of code normalizes the extracted features. The finalized file can then be used directly for prediction.

# Load the merged feature table
df = pd.read_csv('Bcell_merged.csv')

# Separate the first two columns (presumably the epitope and its label, per
# the variable name -- confirm) from the feature columns; only the latter
# are scaled.
epitope_label = df.iloc[:, :2]
features = df.iloc[:, 2:]

# Initialize the scaler
minmax_scaler = MinMaxScaler()

# Apply MinMaxScaler.
# NOTE(review): the scaler is fitted on this file itself. If the model was
# trained with a scaler fitted on the training data, the resulting ranges may
# differ -- confirm this is intended.
minmax_scaled = minmax_scaler.fit_transform(features)
# Reuse the original index so the column-wise concat below aligns row by row.
minmax_scaled_df = pd.DataFrame(
    minmax_scaled, columns=features.columns, index=features.index
)

# Combine the untouched leading columns with the scaled features.
# (A leftover line that referenced an undefined `standard_scaled_df` was
# removed; it raised NameError before the output could be saved.)
minmax_scaled_final = pd.concat([epitope_label, minmax_scaled_df], axis=1)

# Save the normalized dataset
minmax_scaled_final.to_csv('Epitopes_scaled.csv', index=False)

# Display the first few rows of the normalized dataset
print("First few rows of MinMaxScaler normalized data:")
print(minmax_scaled_final.head())