# Positive dataset build

In [1]:
# Import necessary packages
import pandas as pd

In [2]:
# Set a function to parse a fasta file
def parse_fasta(fasta_file):
    """
    Parses a FASTA file and returns a list of tuples containing accession IDs and sequences.

    The accession ID is the first whitespace-delimited token after '>' on a
    header line; the rest of the header line is discarded. Sequence lines are
    stripped of surrounding whitespace and concatenated, so blank lines
    contribute nothing. Any lines before the first header are dropped.

    Parameters:
    fasta_file (str): Path to the FASTA file.

    Returns:
    list: A list of tuples, where each tuple contains an accession ID and a sequence.

    Raises:
    ValueError: If a header line has no identifier after '>'.
    """
    sequences = []
    accession_id = None
    chunks = []  # sequence fragments of the record currently being read
    with open(fasta_file, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the previous record, if there was one
                if accession_id is not None:
                    sequences.append((accession_id, ''.join(chunks)))
                header_fields = line[1:].split()
                if not header_fields:
                    # Fail with a locatable message instead of an IndexError
                    raise ValueError(
                        f"{fasta_file}: header on line {line_number} has no accession ID"
                    )
                accession_id = header_fields[0]  # Get the first word after '>'
                chunks = []
            elif accession_id is not None:
                chunks.append(line)
    # The last record has no following header to close it
    if accession_id is not None:
        sequences.append((accession_id, ''.join(chunks)))
    return sequences

# Set a function to create a DataFrame 
def create_dataframe(sequences, target='endolysin'):
    """
    Creates a labelled DataFrame from a list of sequences.

    Parameters:
    sequences (list): A list of tuples, where each tuple contains an accession ID and a sequence.
    target (str): Class label written to every row. Defaults to 'endolysin'.

    Returns:
    DataFrame: A pandas DataFrame with columns 'accession_id', 'sequence', and 'target'.
    """
    df = pd.DataFrame(sequences, columns=['accession_id', 'sequence'])
    # Every row gets the same label
    df['target'] = target
    return df

# Location of the raw FASTA file to ingest
fasta_file_name = 'endolysin_ncbi_raw.fasta'

# Read the (accession ID, sequence) records and tabulate them with their label
sequences = parse_fasta(fasta_file_name)
df = create_dataframe(sequences)

# Report how many rows and columns were loaded
print(f"DataFrame dimensions: {df.shape}")

# Preview the first rows
df.head()


DataFrame dimensions: (9495, 3)


Unnamed: 0,accession_id,sequence,target
0,XBA09022.1,MATSKNMKAFLDMLAYSEGTDNGRQKTNNHGYDVIVGGSLFTDYSD...,endolysin
1,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin
2,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin
3,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin
4,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin


In [3]:
# Set a function to check for duplicates in the DataFrame
def check_duplicates(df):
    """
    Reports how many rows repeat an earlier accession ID or an earlier sequence.

    Parameters:
    df (DataFrame): The pandas DataFrame containing the data; it must have
        'accession_id' and 'sequence' columns.

    Prints:
    Two lines: the number of duplicate accession IDs, then the number of
    duplicate sequences.
    """
    # The first occurrence of a value is not flagged; only later repeats count
    repeated_ids = df['accession_id'].duplicated()
    print(f"Number of duplicate accession IDs: {repeated_ids.sum()}")

    repeated_seqs = df['sequence'].duplicated()
    print(f"Number of duplicate sequences: {repeated_seqs.sum()}")

# Print the duplicate accession ID and duplicate sequence counts for the loaded DataFrame
check_duplicates(df)

Number of duplicate accession IDs: 0
Number of duplicate sequences: 2992


In [4]:
# Set a function to remove duplicates from the DataFrame
def remove_duplicates(df):
    """
    Drops every row whose sequence already appeared in an earlier row.

    Parameters:
    df (DataFrame): The pandas DataFrame containing the data.

    Returns:
    DataFrame: A DataFrame holding the first row for each distinct sequence.
        Surviving rows keep their original index labels.
    """
    # True for each row whose sequence was already seen further up
    is_repeat = df.duplicated(subset='sequence')
    return df[~is_repeat]

# Drop rows whose sequence duplicates an earlier row
df_cleaned = remove_duplicates(df)

# Report the size of the de-duplicated DataFrame
print(f"Cleaned DataFrame dimensions: {df_cleaned.shape}")

# Preview the first rows of the cleaned DataFrame
df_cleaned.head()

Cleaned DataFrame dimensions: (6503, 3)


Unnamed: 0,accession_id,sequence,target
0,XBA09022.1,MATSKNMKAFLDMLAYSEGTDNGRQKTNNHGYDVIVGGSLFTDYSD...,endolysin
1,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin
2,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin
3,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin
4,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin


In [5]:
# Save the DataFrame to a fasta file

# Define a function to format DataFrame rows into FASTA format
def format_fasta(row):
    """
    Renders one record as a FASTA entry.

    Parameters:
    row: An object indexable by 'accession_id' and 'sequence', such as a DataFrame row.

    Returns:
    str: '>' followed by the accession ID, a newline, the sequence written
        as-is (no line wrapping), and a trailing newline.
    """
    accession_id = row['accession_id']
    sequence = row['sequence']
    return f">{accession_id}\n{sequence}\n"

# Render each cleaned row as FASTA text, then concatenate everything into one string
fasta_entries = df_cleaned.apply(format_fasta, axis=1)
fasta_data = ''.join(fasta_entries)

# Write the concatenated entries to the output file
with open('positive_dataset.fasta', 'w') as f:
    f.write(fasta_data)

In [40]:
# Save the DataFrame to a CSV file if needed
# df_cleaned.to_csv('positive_dataset.csv', index=False)