In [None]:
# Load the original file and get initial counts
import pandas as pd

# Read the raw B-cell export using the calamine Excel engine
df_original = pd.read_excel('B cell 1.xlsx', engine='calamine')

# Row count before any filtering is applied
initial_count = df_original.shape[0]

# Report every epitope name found in the raw data
print("Initial Epitopes:")
print(df_original['Epitope Name'].to_list())
print(f"\nTotal initial epitopes: {initial_count}")

# Flag rows whose species is human (case-insensitive match), then keep the rest
species_lower = df_original['Epitope Species'].str.lower()
is_human = species_lower.isin(['human', 'homo sapiens'])
df_filtered = df_original.loc[~is_human]

# Persist the non-human rows for the next step
df_filtered.to_excel('B_cell_1_filtered.xlsx', index=False)

# Work out how many rows were kept and how many were dropped
remaining_count = df_filtered.shape[0]
removed_count = initial_count - remaining_count

print(f"\nNumber of epitopes removed: {removed_count}")
print(f"Number of epitopes remaining: {remaining_count}")

print("\nRemaining Epitopes:")
print(df_filtered['Epitope Name'].to_list())


Based on the analysis:

- Total initial epitopes: 707,608
- Number of epitopes removed: 286
- Number of epitopes remaining: 707,322

The filtered data has been saved to 'B_cell_1_filtered.xlsx'. 


In [None]:
# Load the filtered file
filtered_file = 'B_cell_1_filtered.xlsx'
df_filtered = pd.read_excel(filtered_file, engine='openpyxl')

# Graded positive labels are collapsed onto 'Positive' for the comparison only
# (df_filtered itself is left untouched). Without this, an epitope measured as
# e.g. 'Positive-High' in one assay and 'Negative' in another would not be
# detected as inconsistent.
positive_variants = {
    'Positive-Low': 'Positive',
    'Positive-Intermediate': 'Positive',
    'Positive-High': 'Positive',
}
normalized_measure = df_filtered['Assay Qualitative Measure'].replace(positive_variants)

# Check for inconsistent assay qualitative measures:
# an epitope is inconsistent when its rows contain both a positive and a
# negative result. Grouping boolean flags with any() avoids building a Python
# set per epitope.
epitope_names = df_filtered['Epitope Name']
has_positive = normalized_measure.eq('Positive').groupby(epitope_names).any()
has_negative = normalized_measure.eq('Negative').groupby(epitope_names).any()
inconsistent_epitopes = has_positive & has_negative

# Count inconsistent and consistent epitopes
inconsistent_count = int(inconsistent_epitopes.sum())
consistent_count = len(inconsistent_epitopes) - inconsistent_count

# Get the list of inconsistent epitopes
inconsistent_epitopes_list = inconsistent_epitopes[inconsistent_epitopes].index.tolist()

print("Number of inconsistent epitopes:", inconsistent_count)
print("Number of consistent epitopes:", consistent_count)
print("\nList of inconsistent epitopes:")
print(inconsistent_epitopes_list)

The analysis identified the epitopes with inconsistent assay qualitative measures: 42,551 epitopes are inconsistent and 383,027 are consistent. The results are summarized below.
- **Number of inconsistent epitopes**: 42,551  
- **Number of consistent epitopes**: 383,027  
An epitope is counted as inconsistent when its records include both a "Positive" and a "Negative" assay qualitative measure across different assays; every other epitope is counted as consistent.

In [None]:
# Load the filtered file
df_filtered = pd.read_excel('B_cell_1_filtered.xlsx')

# Graded positive labels that are treated as plain 'Positive'
positive_variants = {
    'Positive-Low': 'Positive',
    'Positive-Intermediate': 'Positive',
    'Positive-High': 'Positive',
}

# Get consistent epitopes.
# The comparison is done on normalized labels so that an epitope with, say,
# 'Positive-High' and 'Negative' rows is recognised as conflicting; otherwise
# it would survive this filter and end up with both 'Positive' and 'Negative'
# rows after the replacement below.
normalized_measure = df_filtered['Assay Qualitative Measure'].replace(positive_variants)
epitope_names = df_filtered['Epitope Name']
has_positive = normalized_measure.eq('Positive').groupby(epitope_names).any()
has_negative = normalized_measure.eq('Negative').groupby(epitope_names).any()
is_consistent = ~(has_positive & has_negative)
consistent_epitopes = is_consistent[is_consistent].index.tolist()

# Filter dataframe to keep only consistent epitopes.
# .copy() makes the result an independent frame so the column assignment
# below does not write into a slice of df_filtered.
df_consistent = df_filtered[df_filtered['Epitope Name'].isin(consistent_epitopes)].copy()

# Show unique values in Assay Qualitative Measure before replacement
print("Unique values in Assay Qualitative Measure before replacement:")
print(df_consistent['Assay Qualitative Measure'].unique())

# Replace Positive variants with just 'Positive'
df_consistent['Assay Qualitative Measure'] = (
    df_consistent['Assay Qualitative Measure'].replace(positive_variants)
)

print("\nUnique values in Assay Qualitative Measure after replacement:")
print(df_consistent['Assay Qualitative Measure'].unique())

# Save to new excel file
output_file = 'consistent_epitopes.xlsx'
df_consistent.to_excel(output_file, index=False)
print(f"\nSaved consistent epitopes to {output_file}")

# Count frequency of each value
value_counts = df_consistent['Assay Qualitative Measure'].value_counts()
print("\nFrequency of each value:")
print(value_counts)


The process successfully identified unique values in the "Assay Qualitative Measure" column, replaced all "Positive-Low," "Positive-Intermediate," and "Positive-High" with "Positive," and saved the consistent epitopes to a new file. Below are the results:
- **Unique values before replacement**: {outputs_dict["db237a58"]}
- **Unique values after replacement**: {outputs_dict["ff90325d"]}
- **Frequency of each value**: {outputs_dict["6cdec8f5"]}
The consistent epitopes have been saved to `consistent_epitopes.xlsx`.

In [None]:
# Read back the epitopes that passed the consistency step
df = pd.read_excel(io='consistent_epitopes.xlsx')

# Define the 20 standard amino acids (one-letter codes)
standard_aa = set('ACDEFGHIKLMNPQRSTVWY')


# Function to check if sequence contains only natural amino acids
def contains_only_natural_aa(sequence):
    """Return True if *sequence* is a non-empty string of standard residues.

    The comparison is case-insensitive: the sequence is upper-cased before
    each character is checked against ``standard_aa``.

    Anything that is not a string (for example the NaN that pandas uses for a
    missing cell) is rejected. Converting such values with ``str()`` would
    turn NaN into 'nan', whose letters N and A are both valid residue codes,
    so a missing name would wrongly be accepted.

    Args:
        sequence: Candidate epitope sequence, normally a str.

    Returns:
        bool: True only when every character is one of the 20 standard
        amino-acid letters and the sequence is not empty.
    """
    if not isinstance(sequence, str) or not sequence:
        return False
    return set(sequence.upper()).issubset(standard_aa)

# Get initial count
initial_count = len(df)

# Evaluate the amino-acid check once per row and reuse the result for both the
# kept and the removed subsets, instead of applying the function to the whole
# column twice.
natural_mask = df['Epitope Name'].apply(contains_only_natural_aa).astype(bool)

# Filter to keep only sequences with natural amino acids
df_natural = df[natural_mask]

# Get final count
final_count = len(df_natural)

# Save to new file
output_file = 'consistent_natural_epitopes.xlsx'
df_natural.to_excel(output_file, index=False)

print(f"Initial number of epitopes: {initial_count}")
print(f"Number of epitopes removed: {initial_count - final_count}")
print(f"Final number of epitopes: {final_count}")

# Show some examples of removed epitopes
removed_df = df[~natural_mask]
print("\nExamples of removed epitopes (first 10):")
print(removed_df['Epitope Name'].head(10).tolist())

# Count frequency of Assay Qualitative Measure in final dataset
print("\nFrequency of Assay Qualitative Measure in final dataset:")
print(df_natural['Assay Qualitative Measure'].value_counts())

The process filtered out entries whose epitope name contains characters other than the 20 standard amino-acid letters, reducing the dataset by 1,058 entries. The final dataset contains 560,592 entries, and the updated file has been saved. Below are the results:

- **Initial number of epitopes**: {outputs_dict["ac1202f0"]}
- **Number of epitopes removed**: {outputs_dict["6396ff8d"]}
- **Final number of epitopes**: {outputs_dict["23e1455e"]}
- **Examples of removed epitopes**: {outputs_dict["063e60b1"]}
- **Frequency of Assay Qualitative Measure in final dataset**: {outputs_dict["a2793d25"]}

# Redundant Entry Count

In [None]:
# Read the dataset produced by the amino-acid filtering step
df = pd.read_excel('consistent_natural_epitopes.xlsx')

# Flag every row whose epitope name occurs more than once (all occurrences),
# then collect the distinct names involved
name_repeated = df['Epitope Name'].duplicated(keep=False)
duplicate_epitopes = df.loc[name_repeated]
unique_duplicate_epitopes = duplicate_epitopes['Epitope Name'].unique()

# Summary figures: rows, distinct names, and names that appear repeatedly
total_rows = df.shape[0]
unique_epitopes = len(pd.unique(df['Epitope Name']))
duplicate_count = len(unique_duplicate_epitopes)

print(f"Total rows in dataset: {total_rows}")
print(f"Number of unique epitopes: {unique_epitopes}")
print(f"Number of epitopes with duplicates: {duplicate_count}")

# For the first few repeated names, break down their rows by assay method
# and by qualitative measure
if len(unique_duplicate_epitopes) > 0:
    print("\nExample of first 5 duplicate epitopes and their details:")
    for name in unique_duplicate_epitopes[:5]:
        rows = df.loc[df['Epitope Name'] == name]
        print(f"\nEpitope: {name}")
        print(f"Number of occurrences: {len(rows)}")
        print("Assay Methods used:")
        print(rows['Assay Method'].value_counts())
        print("Qualitative Measures:")
        print(rows['Assay Qualitative Measure'].value_counts())

In [None]:
# Load the consistent natural epitopes file
df = pd.read_excel('consistent_natural_epitopes.xlsx')

# Get initial count
initial_count = len(df)

# Check for completely identical rows (every occurrence is flagged)
duplicate_mask = df.duplicated(keep=False)
duplicates = df[duplicate_mask]

# Remove duplicates keeping first occurrence
df_unique = df.drop_duplicates()

# Get final count
final_count = len(df_unique)

print(f"Initial number of rows: {initial_count}")
print(f"Number of duplicate rows removed: {initial_count - final_count}")
print(f"Final number of unique rows: {final_count}")

if not duplicates.empty:
    print("\nExample of first 5 completely identical duplicate entries:")
    print(duplicates.head())
else:
    print("\nNo completely identical duplicate entries found in the dataset.")

# Save unique entries to new file.
# This is written whether or not duplicates were found: the following step
# reads unique_epitopes.xlsx, so skipping the save for a duplicate-free
# dataset would leave it with a missing or stale input file.
df_unique.to_excel('unique_epitopes.xlsx', index=False)
print("\nSaved unique entries to unique_epitopes.xlsx")

The dataset contained 175,642 completely identical duplicate rows, which were removed, leaving 384,950 unique rows. The unique entries have been saved to `unique_epitopes.xlsx`. Below are the details:

- {outputs_dict["3635880c"]}
- {outputs_dict["1617e739"]}
- {outputs_dict["9e0bc79a"]}
- {outputs_dict["d36158a1"]}

# Length Filter (4-20 amino acids)

In [None]:
# Read the de-duplicated dataset
df = pd.read_excel('unique_epitopes.xlsx')
initial_count = df.shape[0]

# Keep rows whose epitope name is 4 to 20 characters long (both ends inclusive);
# rows with a missing name have no length and are dropped by the comparison
name_length = df['Epitope Name'].str.len()
within_range = (name_length >= 4) & (name_length <= 20)
df_filtered = df.loc[within_range]
final_count = df_filtered.shape[0]

# Write the length-filtered rows to a new workbook
output_file = 'filtered_epitopes.xlsx'
df_filtered.to_excel(output_file, index=False)

# Report how many rows were kept and removed
print(f"Initial number of rows: {initial_count}")
print(f"Number of rows removed: {initial_count - final_count}")
print(f"Final number of rows: {final_count}")
print("\nSaved filtered data to filtered_epitopes.xlsx")

The filtering process successfully removed epitopes with lengths less than 4 or greater than 20 amino acids. Below are the results:

- {outputs_dict["0b9eda90"]}
- {outputs_dict["e5acbb9e"]}
- {outputs_dict["4a1381c1"]}
- {outputs_dict["5dc18c59"]}

The filtered data has been saved to `filtered_epitopes.xlsx`.

In [None]:
# Rename the file
import os

old_name = 'filtered_epitopes.xlsx'
new_name = 'length_filtered_epitopes.xlsx'

# os.replace overwrites an existing target on every platform, whereas
# os.rename raises on Windows when new_name is left over from an earlier run.
try:
    os.replace(old_name, new_name)
except FileNotFoundError:
    # Report the missing source instead of silently doing nothing
    print(f"{old_name} not found; nothing was renamed")
else:
    print(f"File renamed from {old_name} to {new_name}")

    # Verify the number of entries in renamed file
    df = pd.read_excel(new_name)
    print(f"\nVerified: File contains {len(df)} entries")

The file was successfully renamed to `length_filtered_epitopes.xlsx`, and its contents were verified to include 380,842 entries.