# Remove Sequences with Stop Codon

In [21]:
def remove_stop_codon_sequences(input_path, output_path):
    """Copy the FASTA records whose sequence contains no stop codon ('*').

    A record is a header line starting with '>' followed by one or more
    sequence lines. All sequence lines of a record are joined before the
    check, so a '*' on any line of a wrapped sequence rejects the whole
    record. Kept records are written as the header line followed by the
    complete sequence on a single line.

    Blank lines and any lines before the first header are ignored. Records
    that have no sequence lines at all are skipped and not counted.

    Args:
        input_path: Path of the FASTA-formatted text file to read.
        output_path: Path of the file that receives the kept records; it is
            overwritten if it already exists.

    Returns:
        None. The number of kept records is printed to stdout.
    """
    # Collect (header, [sequence parts]) pairs so wrapped sequences are
    # evaluated as a whole instead of only by their first line.
    records = []
    with open(input_path, 'r') as infile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                records.append((line, []))
            elif records:
                # Sequence text only counts once a header has been seen.
                records[-1][1].append(line)

    output_lines = []
    valid_count = 0
    for header, parts in records:
        sequence = "".join(parts)
        # An empty sequence cannot be meaningfully classified; drop it.
        if sequence and "*" not in sequence:
            output_lines.append(header + "\n")
            output_lines.append(sequence + "\n")
            valid_count += 1

    # Write the cleaned output
    with open(output_path, 'w') as outfile:
        outfile.writelines(output_lines)

    print(f"Number of sequences without stop codons: {valid_count}")


# Example usage (the paths are specific to the author's machine; edit before running):
input_file = "/Users/vijaymaranholkar/Desktop/Seq file.txt"
output_file = "/Users/vijaymaranholkar/Downloads/Removed Stop codon seq.txt"
remove_stop_codon_sequences(input_path=input_file, output_path=output_file)


Number of sequences without stop codons: 1267


# Remove duplicate sequences from TBLASTN Search

In [35]:
def extract_unique_sequences_multiline(input_file, output_file):
    """Write one FASTA record per distinct sequence found in ``input_file``.

    Sequence lines belonging to a header are concatenated, so wrapped
    (multi-line) sequences are compared as a whole. When the same sequence
    occurs under several headers, only the first header is kept. The output
    lists each kept header followed by its sequence wrapped at 60 characters
    per line, and a summary line is printed.

    Note: a record is stored when the next header is reached, even if it had
    no sequence lines; the final record of the file is stored only if it has
    at least one sequence line.

    Args:
        input_file: Path of the FASTA-formatted text file to read.
        output_file: Path of the file to write (overwritten if present).
    """
    wrap_width = 60
    sequences = {}
    current_header = None
    chunks = []

    with open(input_file, 'r') as handle:
        for raw in handle:
            text = raw.rstrip('\n')
            if not text.startswith('>'):
                chunks.append(text)
                continue
            # A new header closes the record collected so far.
            if current_header is not None:
                sequences.setdefault(''.join(chunks), current_header)
            current_header = text
            chunks = []

    # The last record has no following header to close it.
    if current_header is not None and chunks:
        sequences.setdefault(''.join(chunks), current_header)

    with open(output_file, 'w') as sink:
        for residues, id_line in sequences.items():
            sink.write(f"{id_line}\n")
            start = 0
            while start < len(residues):
                sink.write(residues[start:start + wrap_width] + "\n")
                start += wrap_width

    print(f"Extraction complete. {len(sequences)} unique sequences written to {output_file}.")

# Example usage (the paths are specific to the author's machine; edit before running):
input_file = "/Users/vijaymaranholkar/Desktop/Willson Lab March 15 2025/ IgG binding proteins/Bioinformatics/Very very important only required files June 06 2025/Combine pfam and TBLASTN and Selected seq/Important Final selected set 1 and 2/Combined 104 protein seq and Numbered Selected binders five SPA domains Z domain Filtered duplicate TBLASTN and pfam.txt"
output_file = "/Users/vijaymaranholkar/Desktop/Willson Lab March 15 2025/ IgG binding proteins/Bioinformatics/Very very important only required files June 06 2025/Combine pfam and TBLASTN and Selected seq/Important Final selected set 1 and 2/Deduplicated Combined 104 protein seq and Numbered Selected binders five SPA domains Z domain Filtered duplicate TBLASTN and pfam.txt"
extract_unique_sequences_multiline(input_file=input_file, output_file=output_file)


Extraction complete. 980 unique sequences written to /Users/vijaymaranholkar/Desktop/Willson Lab March 15 2025/ IgG binding proteins/Bioinformatics/Very very important only required files June 06 2025/Combine pfam and TBLASTN and Selected seq/Important Final selected set 1 and 2/Deduplicated Combined 104 protein seq and Numbered Selected binders five SPA domains Z domain Filtered duplicate TBLASTN and pfam.txt.


In [None]:
# Remove duplicate sequences from Pfam Search

In [5]:
def remove_duplicate_sequences(input_path, output_path):
    """Write the FASTA records of ``input_path`` with duplicate sequences removed.

    A record is a header line starting with '>' followed by one or more
    sequence lines. The sequence lines of a record are joined before
    comparison, so wrapped (multi-line) sequences are deduplicated and written
    in full rather than by their first line only. When a sequence occurs
    more than once, the first header it appeared under is kept.

    Blank lines and any lines before the first header are ignored. Records
    with no sequence lines (including a header at the very end of the file)
    are skipped and are counted neither as unique nor as duplicates.

    Args:
        input_path: Path of the FASTA-formatted text file to read.
        output_path: Path of the file that receives the unique records, each
            as a header line followed by the sequence on one line; it is
            overwritten if it already exists.

    Returns:
        None. The duplicate and unique counts are printed to stdout.
    """
    # Collect (header, [sequence parts]) pairs; this avoids indexing past
    # the end of the file and keeps wrapped sequences together.
    records = []
    with open(input_path, 'r') as infile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                records.append((line, []))
            elif records:
                # Sequence text only counts once a header has been seen.
                records[-1][1].append(line)

    seq_dict = {}
    duplicates_count = 0
    for header, parts in records:
        sequence = "".join(parts)
        if not sequence:
            continue
        if sequence in seq_dict:
            duplicates_count += 1
        else:
            # dicts preserve insertion order, so output follows input order.
            seq_dict[sequence] = header

    # Write unique sequences to output
    with open(output_path, 'w') as outfile:
        for seq, header in seq_dict.items():
            outfile.write(f"{header}\n{seq}\n")

    print(f"Number of duplicate sequences removed: {duplicates_count}")
    print(f"Number of unique sequences retained: {len(seq_dict)}")

# Example usage (the paths are specific to the author's machine; edit before running).
# Alternative input kept for reference (TBLASTN sequence dump):
# input_file = "/Users/vijaymaranholkar/Desktop/TBLASTN search investigation June 03 to 05 2025/June 05 TBLASTN SPAZ and 5 domains including s aureus/seqdump TBLASTN Search SPAZ SPA five domains include s aureus.txt"
# NOTE(review): the chained assignments below also bind `filename` (it ends up
# equal to the output path); this looks like a copy-paste leftover. Confirm no
# other cell reads `filename` before removing it.
input_file = filename = "/Users/vijaymaranholkar/Desktop/Willson Lab March 15 2025/ IgG binding proteins/Bioinformatics/Very important Data July_4 2023/July06/Pfam seq/B Domain pfam/pFam B domain Trial 2/Unaligned pfam B domain.txt"
output_file = filename = "/Users/vijaymaranholkar/Desktop/Willson Lab March 15 2025/ IgG binding proteins/Bioinformatics/Very important Data July_4 2023/July06/Pfam seq/B Domain pfam/pFam B domain Trial 2/No duplicate Unaligned pfam B domain.txt"
remove_duplicate_sequences(input_path=input_file, output_path=output_file)


Number of duplicate sequences removed: 1227
Number of unique sequences retained: 186


# Remove gap characters ('-') from each sequence line: cleaning the aligned Pfam file

In [24]:
# Strip alignment gap characters ('-') from every sequence line of an aligned
# FASTA file. Header lines (starting with '>') are copied through unchanged.
input_file = "/Users/vijaymaranholkar/Downloads/All aligned B domain seq.txt"  # change to your input file
output_file = "/Users/vijaymaranholkar/Downloads/cleaned_sequences.txt"  # change to your desired output file

with open(input_file, 'r') as infile:
    with open(output_file, 'w') as outfile:
        for line in infile:
            if not line.startswith(">"):
                # Drop the gaps, then trailing whitespace (including the
                # newline), and terminate the line with a single newline.
                cleaned_line = line.replace("-", "").rstrip()
                outfile.write(cleaned_line + "\n")
                continue
            outfile.write(line)
