# Negative dataset build

In [34]:
# Import necessary packages
import pandas as pd
import glob
import os

In [35]:
# Define a function to parse a FASTA file
def parse_fasta(fasta_file):
    """
    Parses a FASTA file and returns a list of tuples containing accession IDs and sequences.

    The accession ID is the first whitespace-delimited token after '>' on a
    header line. All following lines, up to the next header, are stripped of
    surrounding whitespace and concatenated to form the sequence. Lines that
    appear before the first header are ignored.

    Parameters:
    fasta_file (str): Path to the FASTA file.

    Returns:
    list: A list of tuples, where each tuple contains an accession ID and a sequence.

    Raises:
    ValueError: If a header line has no identifier after '>'.
    """
    sequences = []
    accession_id = None
    chunks = []  # pieces of the current sequence; joined once per record
    with open(fasta_file, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if line.startswith('>'):
                # A new header closes the record that was being collected
                if accession_id is not None:
                    sequences.append((accession_id, ''.join(chunks)))
                header_fields = line[1:].split()
                if not header_fields:
                    # A bare '>' would otherwise fail with an opaque IndexError
                    raise ValueError(
                        f"{fasta_file}: line {line_number}: FASTA header has no identifier"
                    )
                accession_id = header_fields[0]  # Get the first word after '>'
                chunks = []
            elif accession_id is not None:
                chunks.append(line)
    # Flush the last record, which has no following header to close it
    if accession_id is not None:
        sequences.append((accession_id, ''.join(chunks)))
    return sequences

# Define a function to create a DataFrame from multiple FASTA files
def create_dataframe_from_fastas(fasta_files, target='not_endolysin'):
    """
    Creates a DataFrame from multiple FASTA files.

    Records are appended in the order the files are given, and within each
    file in the order they are returned by `parse_fasta`.

    Parameters:
    fasta_files (list): List of paths to FASTA files.
    target (str): Label written to the 'target' column of every row.
        Defaults to 'not_endolysin'.

    Returns:
    DataFrame: A pandas DataFrame with columns 'accession_id', 'sequence', and 'target'.
    """
    all_sequences = []
    for fasta_file in fasta_files:
        all_sequences.extend(parse_fasta(fasta_file))

    # Passing `columns` explicitly keeps the schema stable even when no records were read
    df = pd.DataFrame(all_sequences, columns=['accession_id', 'sequence'])
    df['target'] = target
    return df

# Specify the directory containing the FASTA files
path = 'curatedDB'

# List all FASTA files in the specified directory, excluding "other.fasta".
# The base name is compared exactly: str.endswith('other.fasta') would also
# drop unrelated files such as "another.fasta".
# The list is sorted because glob returns matches in arbitrary,
# filesystem-dependent order, and row order decides which accession ID is
# kept when duplicate sequences are dropped with keep-first semantics.
fasta_files = sorted(
    f for f in glob.glob(os.path.join(path, '*.fasta'))
    if os.path.basename(f) != 'other.fasta'
)

# Create the DataFrame from the FASTA files
df = create_dataframe_from_fastas(fasta_files)

# Display the dimensions of the DataFrame
print("DataFrame dimensions:", df.shape)

# Display the head of the DataFrame
df.head()


DataFrame dimensions: (547911, 3)


Unnamed: 0,accession_id,sequence,target
0,QBR30802.1,MIGMDRHTGQPISGIEHLRQSVADILGTPLLSRRERPEYGSKLRRM...,not_endolysin
1,QBR30574.1,MIGIDRDSGATVDDWLQFVQRATRALTTPLGTRQKRPLYGSLIPTL...,not_endolysin
2,QBR21635.1,MIYKNTAVHFDVNAQVKRSVSANIQFSTQDIGTAKLSFNLTKDGVP...,not_endolysin
3,QBR19749.1,MIYKDTDIHFSINSQIKRSIAANIQFSTQDIDTAKLTFSLTKDGIP...,not_endolysin
4,WP_068457916.1,MLDKPEREEIRYGVTPYGFRRKLYAEALAERMSRAKEVFGVNIDLS...,not_endolysin


In [36]:
# Define a function to check for duplicates in the DataFrame
def check_duplicates(df):
    """
    Reports how many rows repeat an earlier accession ID or an earlier sequence.

    A row counts as a duplicate when its value in the inspected column has
    already appeared in a previous row; the first occurrence is not counted.

    Parameters:
    df (DataFrame): Table with 'accession_id' and 'sequence' columns.

    Prints:
    One line with the duplicate accession ID count, then one line with the
    duplicate sequence count.
    """
    # Accession IDs that were already seen in an earlier row
    repeated_ids = df['accession_id'].duplicated().sum()
    print(f"Number of duplicate accession IDs: {repeated_ids}")

    # Sequences that were already seen in an earlier row
    repeated_seqs = df['sequence'].duplicated().sum()
    print(f"Number of duplicate sequences: {repeated_seqs}")

# Print how many accession IDs and how many sequences in the combined
# DataFrame repeat an earlier row
check_duplicates(df)

Number of duplicate accession IDs: 0
Number of duplicate sequences: 374120


In [37]:
# Define a function to remove duplicates from the DataFrame
def remove_duplicates(df):
    """
    Keeps only the first row for each distinct sequence.

    Parameters:
    df (DataFrame): Table with a 'sequence' column.

    Returns:
    DataFrame: A new DataFrame holding the rows of `df` whose 'sequence'
    value did not appear in an earlier row. Index labels are not reset.
    """
    # Mark every row whose sequence repeats one seen earlier, then keep the rest
    repeated_mask = df.duplicated(subset='sequence')
    return df[~repeated_mask]

# Remove rows whose sequence repeats an earlier row. The result is stored
# under a new name, so the unfiltered `df` remains available to later cells.
df_cleaned = remove_duplicates(df)

# Display the dimensions of the DataFrame after removing duplicates
print("Cleaned DataFrame dimensions:", df_cleaned.shape)

# Display the head of the cleaned DataFrame
df_cleaned.head()

Cleaned DataFrame dimensions: (173791, 3)


Unnamed: 0,accession_id,sequence,target
0,QBR30802.1,MIGMDRHTGQPISGIEHLRQSVADILGTPLLSRRERPEYGSKLRRM...,not_endolysin
1,QBR30574.1,MIGIDRDSGATVDDWLQFVQRATRALTTPLGTRQKRPLYGSLIPTL...,not_endolysin
2,QBR21635.1,MIYKNTAVHFDVNAQVKRSVSANIQFSTQDIGTAKLSFNLTKDGVP...,not_endolysin
3,QBR19749.1,MIYKDTDIHFSINSQIKRSIAANIQFSTQDIDTAKLTFSLTKDGIP...,not_endolysin
4,WP_068457916.1,MLDKPEREEIRYGVTPYGFRRKLYAEALAERMSRAKEVFGVNIDLS...,not_endolysin


In [38]:
# Sanity check: confirm that 'other.fasta' was left out of the DataFrame.
# An accession ID taken from 'other.fasta' is looked up in the
# 'accession_id' column; it is expected to be absent.
entry_value = 'fig|10239.164.peg.1'
if not df['accession_id'].eq(entry_value).any():
    print(f"{entry_value} doesn't exist in the column.")
else:
    print(f"{entry_value} exists in the column.")

In [40]:
# Save the DataFrame to a fasta file

# Define a function to format DataFrame rows into FASTA format
def format_fasta(row):
    """
    Renders one record as a two-line FASTA entry.

    Parameters:
    row (mapping): Any object supporting `row['accession_id']` and
        `row['sequence']`, e.g. a DataFrame row.

    Returns:
    str: '>' followed by the accession ID, a newline, the sequence, and a
    trailing newline. The sequence is written on a single line.
    """
    header = row['accession_id']
    residues = row['sequence']
    return f">{header}\n{residues}\n"

# Format every record and join the results.
# The two columns are iterated directly instead of using
# DataFrame.apply(axis=1): on an empty DataFrame, apply returns an empty
# DataFrame, so ''.join would concatenate the column labels and write them
# to the file. Iterating the columns yields an empty string in that case and
# avoids building a Series object per row.
fasta_data = ''.join(
    format_fasta({'accession_id': accession_id, 'sequence': sequence})
    for accession_id, sequence in zip(df_cleaned['accession_id'], df_cleaned['sequence'])
)

# Write the formatted data to a file
with open('negative_dataset.fasta', 'w') as f:
    f.write(fasta_data)

In [39]:
# Save the DataFrame to a CSV file if needed
# df.to_csv('negative_dataset.csv', index=False)