In [None]:
import pandas as pd

# Load both datasets
df_clean = pd.read_excel('length_filtered_epitopes.xlsx')
df_minmax = pd.read_csv('minmax_scaled_4lac_data.csv')

# Uppercase the epitope names once so the comparison is case-insensitive
# and the same normalised key is reused for the filter below.
clean_names_upper = df_clean['Epitope Name'].str.upper()
minmax_names_upper = df_minmax['epitope'].str.upper()

# Unique epitope names per dataset. Missing names are dropped first: a NaN
# can otherwise end up in both sets, be treated as a "common epitope", and
# cause rows without any epitope name to pass the isin() filter.
clean_epitopes = set(clean_names_upper.dropna())
minmax_epitopes = set(minmax_names_upper.dropna())

# Find common epitopes
common_epitopes = clean_epitopes.intersection(minmax_epitopes)

# Filter df_minmax to keep only rows whose (uppercased) epitope is common
df_common_minmax = df_minmax[minmax_names_upper.isin(common_epitopes)]

# Save the filtered data to a new CSV file
df_common_minmax.to_csv('common_B_epitopes_data.csv', index=False)

print("Data for common epitopes saved to common_B_epitopes_data.csv")

In [None]:
import pandas as pd

# Re-read the filtered epitope table from disk
filtered_file_path = 'common_B_epitopes_data.csv'
df = pd.read_csv(filtered_file_path)

# Tally the rows of each class in 'Label' (1 = positive, 0 = negative)
positive_count, negative_count = (
    df['Label'].eq(class_value).sum() for class_value in (1, 0)
)

print(f"Number of Positive values (1): {positive_count}")
print(f"Number of Negative values (0): {negative_count}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the filtered epitope data (a CSV file, not an Excel workbook)
df = pd.read_csv('common_B_epitopes_data.csv')

# Count the occurrences of each class in the 'Label' column
class_counts = df['Label'].value_counts()

# Create a pie chart of the class distribution; slices are labelled with the
# class values taken from the index of class_counts
plt.figure(figsize=(10, 8))
plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Epitope Classes')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
plt.show()

# Print the class counts. The leading "\n" separates the table from the
# figure output; the previous backslash + line break inside the literal was a
# line continuation, which silently dropped that blank line.
print("\nClass Counts:")
print(class_counts)

In [None]:
# Before applying SMOTE, the data must be split so that the test and external-evaluation sets contain only real samples. SMOTE is applied to the training data only.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: read the table of common epitopes
data = pd.read_csv("common_B_epitopes_data.csv")

# Step 2: slice by column position -> IDs (col 0), label (col 1), features (col 2+)
epitopes = data.iloc[:, 0]
y = data.iloc[:, 1]
X = data.iloc[:, 2:]

# Step 3: hold out 20% of all rows as the external evaluation set,
# stratified on the label so class proportions are preserved
X_temp, X_eval, y_temp, y_eval, epitopes_temp, epitopes_eval = train_test_split(
    X,
    y,
    epitopes,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 4: split the remaining 80% into train and test.
# test_size=0.25 of the remainder equals 20% of the full data (0.25 * 80%),
# leaving 60% for training.
X_train, X_test, y_train, y_test, epitopes_train, epitopes_test = train_test_split(
    X_temp,
    y_temp,
    epitopes_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42,
)


def _join_columns(ids, labels, features):
    """Put IDs, labels and features side by side, each with a fresh 0..n-1 index."""
    aligned_parts = [part.reset_index(drop=True) for part in (ids, labels, features)]
    return pd.concat(aligned_parts, axis=1)


# Step 5: rebuild one table per split, epitope column first
train_data = _join_columns(epitopes_train, y_train, X_train)
test_data = _join_columns(epitopes_test, y_test, X_test)
eval_data = _join_columns(epitopes_eval, y_eval, X_eval)

# Step 6: write each split to its own CSV file
train_data.to_csv("train_B_data.csv", index=False)
test_data.to_csv("test_B_data.csv", index=False)
eval_data.to_csv("external_eval_B_data.csv", index=False)

# Report the shape of every saved split
print(f"Training set saved with size: {train_data.shape}")
print(f"Test set saved with size: {test_data.shape}")
print(f"External evaluation set saved with size: {eval_data.shape}")

In [None]:
##APPLICATION OF SMOTE ON TRAINING DATA
!pip install --upgrade imbalanced-learn
import imblearn
print(imblearn.__version__)
# Check the installed version of scikit-learn
import sklearn
print("Installed scikit-learn version:", sklearn.__version__)

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
import pandas as pd

# Step 1: Load the training data
train_data = pd.read_csv("train_B_data.csv")

# Step 2: Separate epitope sequences, features, and labels (by column position)
epitopes_train = train_data.iloc[:, 0]  # Epitope sequences/IDs
y_train = train_data.iloc[:, 1]         # Label column
X_train = train_data.iloc[:, 2:]        # Feature columns

# Step 3: Apply Borderline-SMOTE to features and labels only; the ID column
# is left out of the resampling and re-attached afterwards.
smote = BorderlineSMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Step 4: Handle epitope IDs for synthetic samples
# NOTE(review): the ID assignment below assumes fit_resample returns the
# original rows first (in their original order) with the synthetic rows
# appended after them -- confirm against the imbalanced-learn documentation.
epitopes_smote = epitopes_train.reset_index(drop=True)

# Calculate number of synthetic samples generated
num_synthetic = len(X_train_smote) - len(X_train)

# Create placeholder epitope IDs for synthetic samples. The Series is given
# the same name as the original ID column: concatenating a named Series with
# an unnamed one drops the name, and the saved CSV would then get "0" as the
# header of its first column instead of the epitope column name.
synthetic_ids = [f"synthetic_epitope_{i+1}" for i in range(num_synthetic)]
synthetic_epitopes = pd.Series(synthetic_ids, name=epitopes_train.name)

# Concatenate original and synthetic IDs
all_epitopes = pd.concat([epitopes_smote, synthetic_epitopes], ignore_index=True)

# Step 5: Combine all data into a single DataFrame (IDs, labels, features)
train_smote_data = pd.concat([
    all_epitopes,                        # Epitope sequences/IDs
    pd.Series(y_train_smote).reset_index(drop=True),  # Labels
    pd.DataFrame(X_train_smote).reset_index(drop=True)  # Features
], axis=1)

# Step 6: Save the SMOTE-augmented training data
train_smote_data.to_csv("train_Boarderline_smote_B_data.csv", index=False, header=True)

# Verify size of the SMOTE-augmented data
print(f"SMOTE-augmented training data saved with size: {train_smote_data.shape}")