In [1]:
#pip install Bio #Biopython package

Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.83-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting pooch (from Bio)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
   ---------------------------------------- 0.0/281.0 kB ? eta -:--:--
   ---------------------------------------  276.5/281.0 kB 8.6 MB/s eta 0:00:01
   ---------------------------------------- 281.0/281.0 kB 5.8 MB/s eta 0:00:00
Downloading biopython-1.83-cp311-cp311-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2

In [6]:
# v0_5
# Columns moved around

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"

# To Add Fc at C-terminus or another sequence define append_str with the sequence in quotes.
append_str = "EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC SVMHEALHNHYTQKSLSLSPGK"

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient_oxidized(sequence):
    # Calculate molar extinction coefficient for oxidized cysteines
    analysed_seq = ProteinAnalysis(sequence)
    epsilon_prot = analysed_seq.molar_extinction_coefficient()  # [reduced, oxidized]
    return epsilon_prot[1]  # Return the oxidized form coefficient

def calculate_abs_0_1_percent_oxidized(sequence):
    # Calculate Abs 0.1% (oxidized)
    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
    abs_0_1_percent = extinction_coefficient / molecular_weight
    return abs_0_1_percent

def parse_fasta_and_export_with_protparam(input_file, output_csv, append_str=""):
    sequences = []
    
    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    sequence_with_append = sequence + append_str
                    
                    # Calculate Protein Analysis Parameters using the appended sequence
                    charge_at_pH7 = calculate_charge_at_pH7(sequence_with_append)
                    percent_charged = calculate_percent_charged(sequence_with_append)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_append)
                    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_append)
                    abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_append)
                    isoelectric_point = ProteinAnalysis(sequence_with_append).isoelectric_point()
                    molecular_weight = ProteinAnalysis(sequence_with_append).molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, isoelectric_point, extinction_coefficient, molecular_weight, abs_0_1_percent, sequence, sequence_with_append))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            sequence_with_append = sequence + append_str
            
            # Calculate Protein Analysis Parameters using the appended sequence
            charge_at_pH7 = calculate_charge_at_pH7(sequence_with_append)
            percent_charged = calculate_percent_charged(sequence_with_append)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_append)
            extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_append)
            abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_append)
            isoelectric_point = ProteinAnalysis(sequence_with_append).isoelectric_point()
            molecular_weight = ProteinAnalysis(sequence_with_append).molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, isoelectric_point, extinction_coefficient, molecular_weight, abs_0_1_percent, sequence, sequence_with_append))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, pI, extinction_coefficient, mw, abs_0_1_percent, seq, seq_with_append in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, pI, extinction_coefficient, mw, abs_0_1_percent, seq, seq_with_append])

    print(f"Sequences exported to '{output_csv}'")

# Usage
parse_fasta_and_export_with_protparam(input_file, output_csv, append_str)


Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv'


In [4]:
# v0_4
# Output both binder sequence and sequence with Fc, but only analyze the sequence + appended.

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"


# To Add Fc at C-terminus or another sequence define append_str with the sequence in quotes.
append_str = "EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC SVMHEALHNHYTQKSLSLSPGK"


def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient_oxidized(sequence):
    # Calculate molar extinction coefficient for oxidized cysteines
    analysed_seq = ProteinAnalysis(sequence)
    epsilon_prot = analysed_seq.molar_extinction_coefficient()  # [reduced, oxidized]
    return epsilon_prot[1]  # Return the oxidized form coefficient

def calculate_abs_0_1_percent_oxidized(sequence):
    # Calculate Abs 0.1% (oxidized)
    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
    abs_0_1_percent = extinction_coefficient / molecular_weight
    return abs_0_1_percent

def parse_fasta_and_export_with_protparam(input_file, output_csv, append_str=""):
    sequences = []
    
    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    sequence_with_append = sequence + append_str
                    
                    # Calculate Protein Analysis Parameters using the appended sequence
                    charge_at_pH7 = calculate_charge_at_pH7(sequence_with_append)
                    percent_charged = calculate_percent_charged(sequence_with_append)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_append)
                    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_append)
                    abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_append)
                    isoelectric_point = ProteinAnalysis(sequence_with_append).isoelectric_point()
                    molecular_weight = ProteinAnalysis(sequence_with_append).molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence, sequence_with_append))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            sequence_with_append = sequence + append_str
            
            # Calculate Protein Analysis Parameters using the appended sequence
            charge_at_pH7 = calculate_charge_at_pH7(sequence_with_append)
            percent_charged = calculate_percent_charged(sequence_with_append)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_append)
            extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_append)
            abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_append)
            isoelectric_point = ProteinAnalysis(sequence_with_append).isoelectric_point()
            molecular_weight = ProteinAnalysis(sequence_with_append).molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence, sequence_with_append))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Original Sequence', 'Appended Sequence'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq, seq_with_append in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq, seq_with_append])

    print(f"Sequences exported to '{output_csv}'")

# Usage
parse_fasta_and_export_with_protparam(input_file, output_csv, append_str)


Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv'


In [2]:
# v0_3
# Added ability to appned str (Fc sequence)

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"

# To Add Fc at C-terminus or another sequence define append_str with the sequence in quotes.
append_str = "EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC SVMHEALHNHYTQKSLSLSPGK"


def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient_oxidized(sequence):
    # Calculate molar extinction coefficient for oxidized cysteines
    analysed_seq = ProteinAnalysis(sequence)
    epsilon_prot = analysed_seq.molar_extinction_coefficient()  # [reduced, oxidized]
    return epsilon_prot[1]  # Return the oxidized form coefficient

def calculate_abs_0_1_percent_oxidized(sequence):
    # Calculate Abs 0.1% (oxidized)
    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
    abs_0_1_percent = extinction_coefficient / molecular_weight
    return abs_0_1_percent

def parse_fasta_and_export_with_protparam(input_file, output_csv, append_str):
    sequences = []
    
    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines) + append_str
                    
                    # Calculate Protein Analysis Parameters
                    charge_at_pH7 = calculate_charge_at_pH7(sequence)
                    percent_charged = calculate_percent_charged(sequence)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence)
                    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
                    abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
                    isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
                    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines) + append_str
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
            abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            molecular_weight = ProteinAnalysis(sequence).molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq])

    print(f"Sequences exported to '{output_csv}'")

# Usage
parse_fasta_and_export_with_protparam(input_file, output_csv, append_str)


Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv'


In [3]:
#v0_2

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"


def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient_oxidized(sequence):
    # Calculate molar extinction coefficient for oxidized cysteines
    analysed_seq = ProteinAnalysis(sequence)
    epsilon_prot = analysed_seq.molar_extinction_coefficient()  # [reduced, oxidized]
    return epsilon_prot[1]  # Return the oxidized form coefficient

def calculate_abs_0_1_percent_oxidized(sequence):
    # Calculate Abs 0.1% (oxidized)
    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
    abs_0_1_percent = extinction_coefficient / molecular_weight
    return abs_0_1_percent

def parse_fasta_and_export_with_protparam(input_file, output_csv):
    sequences = []
    
    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    
                    # Calculate Protein Analysis Parameters
                    charge_at_pH7 = calculate_charge_at_pH7(sequence)
                    percent_charged = calculate_percent_charged(sequence)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence)
                    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
                    abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
                    isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
                    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
            abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            molecular_weight = ProteinAnalysis(sequence).molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq])

    print(f"Sequences exported to '{output_csv}'")

# Usage
parse_fasta_and_export_with_protparam(input_file, output_csv)


Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv'


In [2]:
# Added Abs 0.1% oxidized by dividing molar extinction coeff by molecular weight (oxidized)

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient_oxidized(sequence):
    # Calculate molar extinction coefficient for oxidized cysteines
    analysed_seq = ProteinAnalysis(sequence)
    epsilon_prot = analysed_seq.molar_extinction_coefficient()  # [reduced, oxidized]
    return epsilon_prot[1]  # Return the oxidized form coefficient

def calculate_abs_0_1_percent_oxidized(sequence):
    # Calculate Abs 0.1% (oxidized)
    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
    abs_0_1_percent = extinction_coefficient / molecular_weight
    return abs_0_1_percent

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    
                    # Calculate Protein Analysis Parameters
                    charge_at_pH7 = calculate_charge_at_pH7(sequence)
                    percent_charged = calculate_percent_charged(sequence)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence)
                    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
                    abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
                    isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
                    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
            abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            molecular_weight = ProteinAnalysis(sequence).molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, abs_0_1_percent, pI, mw, seq])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


Sequences exported to 'output_sequences_with_protparam.csv'


In [55]:
#ChatGPT
# Changed molar extinction coeffient to use BioPython's function and return oxidized

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient_oxidized(sequence):
    # Calculate molar extinction coefficient for oxidized cysteines
    analysed_seq = ProteinAnalysis(sequence)
    epsilon_prot = analysed_seq.molar_extinction_coefficient()  # [reduced, oxidized]
    return epsilon_prot[1]  # Return the oxidized form coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    
                    # Calculate Protein Analysis Parameters
                    charge_at_pH7 = calculate_charge_at_pH7(sequence)
                    percent_charged = calculate_percent_charged(sequence)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence)
                    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
                    isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
                    molecular_weight = ProteinAnalysis(sequence).molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point, molecular_weight, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            molecular_weight = ProteinAnalysis(sequence).molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point, molecular_weight, sequence))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI, mw, seq in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI, mw, seq])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)




Sequences exported to 'output_sequences_with_protparam.csv'


In [53]:
#ChatGPT
#fixed column order to match Lam's
#working!!!

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient(sequence):
    # Calculate molar extinction coefficient using known values for amino acids
    extinction_coefficient = 0
    aa_extinction = {
        'W': 5500,
        'Y': 1490,
        'C': 125
    }
    for aa, extinction in aa_extinction.items():
        extinction_coefficient += sequence.count(aa) * extinction
    return extinction_coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    
                    # Calculate Protein Analysis Parameters
                    charge_at_pH7 = calculate_charge_at_pH7(sequence)
                    percent_charged = calculate_percent_charged(sequence)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence)
                    extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
                    isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
                    molecular_weight = analysed_seq.molecular_weight()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point, molecular_weight, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            molecular_weight = analysed_seq.molecular_weight()
            
            # Append data to sequences list
            sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point, molecular_weight, sequence))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Extinction Coefficient M-1cm-1', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        for seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI, mw, seq in sequences:
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI, mw, seq])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


NameError: name 'analysed_seq' is not defined

In [52]:
#ChatGPT
#Working, but results are slightly different than ExPasy, same as I found previously


from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient(sequence):
    # Calculate molar extinction coefficient using known values for amino acids
    extinction_coefficient = 0
    aa_extinction = {
        'W': 5500,
        'Y': 1490,
        'C': 125
    }
    for aa, extinction in aa_extinction.items():
        extinction_coefficient += sequence.count(aa) * extinction
    return extinction_coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    
                    # Calculate Protein Analysis Parameters
                    charge_at_pH7 = calculate_charge_at_pH7(sequence)
                    percent_charged = calculate_percent_charged(sequence)
                    percent_hydrophobic = calculate_percent_hydrophobic(sequence)
                    extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
                    isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
                    
                    # Append data to sequences list
                    sequences.append((sequence_id, sequence, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            
            # Append data to sequences list
            sequences.append((sequence_id, sequence, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Extinction Coefficient M-1cm-1', 'Isoelectric Point'])
        for seq_id, seq, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI in sequences:
            writer.writerow([seq_id, seq, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)



Sequences exported to 'output_sequences_with_protparam.csv'


In [51]:
#ChatGPT
#it added the DNA sequences back in.....

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient(sequence):
    # Calculate molar extinction coefficient using known values for amino acids
    extinction_coefficient = 0
    aa_extinction = {
        'W': 5500,
        'Y': 1490,
        'C': 125
    }
    for aa, extinction in aa_extinction.items():
        extinction_coefficient += sequence.count(aa) * extinction
    return extinction_coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    # Parse FASTA file and store sequences
    with open(file_path, 'r', encoding='utf-8') as file:
        for seq_record in SeqIO.parse(file, "fasta"):
            sequence_id = seq_record.id
            sequence = str(seq_record.seq)
            
            # Validate sequence characters (optional)
            valid_sequence = True
            for char in sequence:
                if char not in 'ACDEFGHIKLMNPQRSTVWY':
                    valid_sequence = False
                    break
            
            if not valid_sequence:
                print(f"Skipping sequence '{sequence_id}' due to invalid characters.")
                continue
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
            isoelectric_point = ProteinAnalysis(sequence).isoelectric_point()
            
            # Append data to sequences list
            sequences.append((sequence_id, sequence, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point))
    
    # Export to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Extinction Coefficient M-1cm-1', 'Isoelectric Point'])
        for seq_id, seq, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI in sequences:
            writer.writerow([seq_id, seq, percent_charged, percent_hydrophobic, charge_pH7, extinction_coefficient, pI])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


Skipping sequence 'SEQ66_CL752' due to invalid characters.
Sequences exported to 'output_sequences_with_protparam.csv'


In [48]:
#ChatGPT
#Added pI

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    # Calculate charge at pH 7
    analysed_seq = ProteinAnalysis(sequence)
    charge_at_pH7 = analysed_seq.charge_at_pH(7.0)
    return charge_at_pH7

def calculate_percent_charged(sequence):
    # Calculate percentage of charged amino acids (DEHKR)
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    percent_charged = (charged_aa / total_aa) * 100
    return percent_charged

def calculate_percent_hydrophobic(sequence):
    # Calculate percentage of hydrophobic amino acids (AILMFWYV)
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    percent_hydrophobic = (hydrophobic_aa / total_aa) * 100
    return percent_hydrophobic

def calculate_molar_extinction_coefficient(sequence):
    # Calculate molar extinction coefficient using known values for amino acids
    extinction_coefficient = 0
    aa_extinction = {
        'W': 5500,
        'Y': 1490,
        'C': 125
    }
    for aa, extinction in aa_extinction.items():
        extinction_coefficient += sequence.count(aa) * extinction
    return extinction_coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    # Parse FASTA file and store sequences
    with open(file_path, 'r', encoding='utf-8') as file:
        for seq_record in SeqIO.parse(file, "fasta"):
            sequence_id = seq_record.id
            sequence = str(seq_record.seq)
            
            # Validate sequence characters (optional)
            valid_sequence = True
            for char in sequence:
                if char not in 'ACDEFGHIKLMNPQRSTVWY':
                    valid_sequence = False
                    break
            
            if not valid_sequence:
                print(f"Skipping sequence '{sequence_id}' due to invalid characters.")
                continue
            
            # Calculate Protein Analysis Parameters
            charge_at_pH7 = calculate_charge_at_pH7(sequence)
            percent_charged = calculate_percent_charged(sequence)
            percent_hydrophobic = calculate_percent_hydrophobic(sequence)
            extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
            
            # Append data to sequences list
            sequences.append((sequence_id, sequence, charge_at_pH7, percent_charged, percent_hydrophobic, extinction_coefficient))
    
    # Export to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence', 'Charge at pH 7', '% Charged AAs', '% Hydrophobic AAs', 'Molar Extinction Coefficient'])
        for seq_id, seq, charge_pH7, percent_charged, percent_hydrophobic, extinction_coefficient in sequences:
            writer.writerow([seq_id, seq, charge_pH7, percent_charged, percent_hydrophobic, extinction_coefficient])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


Skipping sequence 'SEQ66_CL752' due to invalid characters.
Sequences exported to 'output_sequences_with_protparam.csv'


In [44]:
#ChatGPT
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    # Parse FASTA file and store sequences
    with open(file_path, 'r', encoding='utf-8') as file:
        for seq_record in SeqIO.parse(file, "fasta"):
            sequence_id = seq_record.id
            sequence = str(seq_record.seq)
            
            # Validate sequence characters
            valid_sequence = True
            for char in sequence:
                if char not in 'ACDEFGHIKLMNPQRSTVWY':
                    valid_sequence = False
                    break
            
            if not valid_sequence:
                print(f"Skipping sequence '{sequence_id}' due to invalid characters.")
                continue
            
            # Calculate Protein Analysis Parameters using BioPython's ProtParam
            analysed_seq = ProteinAnalysis(sequence)
            pI = analysed_seq.isoelectric_point()
            molecular_weight = analysed_seq.molecular_weight()
            cysteine_count = sequence.count('C')  # Count of cysteine residues
            
            # Append data to sequences list
            sequences.append((sequence_id, sequence, pI, molecular_weight, cysteine_count))
    
    # Export to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence', 'Isoelectric Point (pI)', 'Molecular Weight (Da)', 'Cysteine Count'])
        for seq_id, seq, pI, mw, cysteines in sequences:
            writer.writerow([seq_id, seq, pI, mw, cysteines])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


Skipping sequence 'SEQ66_CL752' due to invalid characters.
Sequences exported to 'output_sequences_with_protparam.csv'


In [45]:
#ChatGPT
#broken

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    # Parse FASTA file and store sequences
    with open(file_path, 'r', encoding='utf-8') as file:
        for seq_record in SeqIO.parse(file, "fasta"):
            sequence_id = seq_record.id
            sequence = str(seq_record.seq)
            
            # Calculate Protein Analysis Parameters using BioPython's ProtParam
            analysed_seq = ProteinAnalysis(sequence)
            pI = analysed_seq.isoelectric_point()
            molecular_weight = analysed_seq.molecular_weight()
            cysteine_count = sequence.count('C')  # Count of cysteine residues
            
            # Append data to sequences list
            sequences.append((sequence_id, sequence, pI, molecular_weight, cysteine_count))
    
    # Export to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence', 'Isoelectric Point (pI)', 'Molecular Weight (Da)', 'Cysteine Count'])
        for seq_id, seq, pI, mw, cysteines in sequences:
            writer.writerow([seq_id, seq, pI, mw, cysteines])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


ValueError: ''-'' is not a valid unambiguous letter for protein

In [39]:
#ChatGPT script to parse fasta first 2 lines, then get protparam pI, MW, and abs 0.1% oxidized

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    sequences = []
    
    # Parse FASTA file and store sequences
    for seq_record in SeqIO.parse(file_path, "fasta"):
        sequence_id = seq_record.id
        sequence = str(seq_record.seq)
        
        # Calculate Protein Analysis Parameters using BioPython's ProtParam
        analysed_seq = ProteinAnalysis(sequence)
        pI = analysed_seq.isoelectric_point()
        molecular_weight = analysed_seq.molecular_weight()
        cysteine_count = sequence.count('C')  # Count of cysteine residues
        
        # Append data to sequences list
        sequences.append((sequence_id, sequence, pI, molecular_weight, cysteine_count))
    
    # Export to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence', 'Isoelectric Point (pI)', 'Molecular Weight (Da)', 'Cysteine Count'])
        for seq_id, seq, pI, mw, cysteines in sequences:
            writer.writerow([seq_id, seq, pI, mw, cysteines])

    print(f"Sequences exported to '{output_csv}'")

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


UnicodeEncodeError: 'ascii' codec can't encode characters in position 563-565: ordinal not in range(128)

In [38]:
#ChatGPT Working script to parse and take only first 2 lines of the fasta sequence

from Bio import SeqIO
import csv

def parse_fasta_and_export(file_path, output_csv):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', 'Sequence'])
        for seq_id, sequence in sequences:
            writer.writerow([seq_id, sequence])

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences.csv"
parse_fasta_and_export(file_path, output_csv)
print(f"Sequences exported to '{output_csv}'")


Sequences exported to 'output_sequences.csv'


In [37]:
#ChatGPT Working script, just takes the first 2 lines of the fasta sequence

from Bio import SeqIO
def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_count = 0  # Initialize line counter
        
        for line in file:
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_count = 0  # Reset line counter
            else:
                # Collect up to two lines of the sequence
                if line_count < 2:
                    sequence_lines.append(line)
                    line_count += 1
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


>SEQ1_CBLib1-R4-receptor-02
QVQLVESGGGLVQAGGSLRLSCTPPGRSFNGYALGWFRQAPGRERELVAAINWSEGHTYYSDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSS
122
>SEQ2_CBLib1-R4-receptor-04
QVQLVESGGGLVQAGGSLRLSCAPSGRTFNNYALGWIRQAPGKEREFVAAINWSEGHTYYADSAKGRFTISRDNAKNTMYLQMNSLKPEDTGVYYCAARAMFSWDPERYGYWGQGTQVTVSS
122
>SEQ3_CBLib1-R4-receptor-30
QVQLVESGGGLVQAGGSLRLSCVTSGRTFGRYAMGWFRQAPGKEREFVAAINWSEGHTYYSDSAKGRFTISRDSTKNTMHLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSS
122
>SEQ4_CBLib1-R4-receptor-33
QVQLQESGGGLVQAGGSLRLSCTPPGRTFNGYALGWFRQAPGRERELVAAINWSEGHTYYEDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAMFSWDPKRYDYWGQGTQVTVSS
122
>SEQ5_CBLib1-R4-receptor-38
QVQLVESGGGLVQAGGSLRLSCTPPGRTFNGYALGWFRQAPGRERELVAAINWSEGHTYYEDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAIFSWDPERYNYWGQGTLVTVSS
122
>SEQ6_CBLib1-R4-receptor-40
QVQLVESGGGLVRAGGSLRLSCAPSGRTFNSYALGWFRQAPGKEREFVAAINWSEGHTYYSDSAKGRFTISRDSTKNTMHLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSS
122
>SEQ7_CBLib3-4-R2-Ab-42
QVQLQESGGGLVQAGGSLRLSCAASGRSFNGYALGWFRQAPGRERE

In [34]:
def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_break_count = 0  # Initialize line break counter
        
        for line in file:
            if line == '\n':
                line_break_count += 1
                if line_break_count >= 1:
                    # Two consecutive line breaks detected, skip the current sequence
                    sequence_id = None
                    sequence_lines = []
                    line_break_count = 0  # Reset the counter
                    continue
            else:
                line_break_count = 0  # Reset the counter if it's not a line break
            
            line = line.strip()
            
            if line.startswith('>'):
                # Save the current sequence if there is one
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


In [36]:
def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        previous_line_blank = False  # Flag to track if the previous line was blank
        
        for line in file:
            line = line.strip()
            print(line)
            
            if not line:
                if previous_line_blank:
                    # Two consecutive blank lines detected, skip the current sequence
                    sequence_id = None
                    sequence_lines = []
                    previous_line_blank = False  # Reset the flag
                    continue
                else:
                    previous_line_blank = True  # Set the flag for the first blank line
            else:
                previous_line_blank = False  # Reset the flag if the current line is not blank
            
            if line.startswith('>'):
                # Save the current sequence if there is one and it is valid
                if sequence_id is not None and sequence_lines:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(line)
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


TL1A Nanobody Selected for Jonathan
2024-05-08

>SEQ1_CBLib1-R4-receptor-02
QVQLVESGGGLVQAGGSLRLSCTPPGRSFNGYALGWFRQAPGRERELVAAINWSEGH
TYYSDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSS

GGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAA GTA CAG TTG GTA GAA TCA GGG GGC GGG TTG GTG CAG GCT GGG GGG TCC CTC CGA TTG AGT TGC ACA CCA CCC GGT AGA TCC TTC AAC GGA TAC GCC CTG GGG TGG TTC CGC CAA GCA CCA GGC AGA GAA CGG GAA CTG GTA GCA GCG ATT AAT TGG TCC GAG GGA CAT ACT TAC TAC AGT GAC AGC GCG AAG GGA CGA TTC ACT ATC TCC AAA GAT AAC GCC ATG AAC ACT GTC TAT TTG CAG ATG AAT AGT TTG AAG CCG GAA GAC ACC GCT GTC TAT TAT TGC GCG GCT AGG GCG ATG TTC TCC TGG GAC CCA GAG CGG TAC GGA TAC TGG GGA CAA GGA ACT CAA GTC ACG GTA AGC AGTGAACCCAAGAGCAGCGACAAGACCCACACCTGTCCTCC

>SEQ2_CBLib1-R4-receptor-04
QVQLVESGGGLVQAGGSLRLSCAPSGRTFNNYALGWIRQAPGKEREFVAAINWSEGH
TYYADSAKGRFTISRDNAKNTMYLQMNSLKPEDTGVYYCAARAMFSWDPERYGYWGQGTQVTVSS

GGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAG GTG CAG TTG GTT GAA TCC GGG GGT GGA TTG G

In [31]:
# view raw text file
Text_File_Import = open(r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt")
Text_lines = Text_File_Import.readlines()

for line in Text_lines:
    User_Inputs = line.split(' ')


print(Text_lines)
print(User_Inputs)

['TL1A Nanobody Selected for Jonathan\n', '2024-05-08\n', '\n', '>SEQ1_CBLib1-R4-receptor-02\n', 'QVQLVESGGGLVQAGGSLRLSCTPPGRSFNGYALGWFRQAPGRERELVAAINWSEGH\n', 'TYYSDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSS\n', '\n', 'GGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAA GTA CAG TTG GTA GAA TCA GGG GGC GGG TTG GTG CAG GCT GGG GGG TCC CTC CGA TTG AGT TGC ACA CCA CCC GGT AGA TCC TTC AAC GGA TAC GCC CTG GGG TGG TTC CGC CAA GCA CCA GGC AGA GAA CGG GAA CTG GTA GCA GCG ATT AAT TGG TCC GAG GGA CAT ACT TAC TAC AGT GAC AGC GCG AAG GGA CGA TTC ACT ATC TCC AAA GAT AAC GCC ATG AAC ACT GTC TAT TTG CAG ATG AAT AGT TTG AAG CCG GAA GAC ACC GCT GTC TAT TAT TGC GCG GCT AGG GCG ATG TTC TCC TGG GAC CCA GAG CGG TAC GGA TAC TGG GGA CAA GGA ACT CAA GTC ACG GTA AGC AGTGAACCCAAGAGCAGCGACAAGACCCACACCTGTCCTCC\n', '\n', '>SEQ2_CBLib1-R4-receptor-04\n', 'QVQLVESGGGLVQAGGSLRLSCAPSGRTFNNYALGWIRQAPGKEREFVAAINWSEGH\n', 'TYYADSAKGRFTISRDNAKNTMYLQMNSLKPEDTGVYYCAARAMFSWDPERYGYWGQGTQVTVSS\n', '\n', 'GGGACTGCTGCTGCT

In [28]:
from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        invalid_sequence = False  # Initialize invalid sequence flag
        
        for line in file:
            line = line.strip()
            
            if not line:
                # Skip blank lines
                continue
            
            if line.startswith('>'):
                # Save the current sequence if there is one and it is valid
                if sequence_id is not None and sequence_lines and not invalid_sequence:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                invalid_sequence = False  # Reset invalid sequence flag
            else:
                # Check for spaces in the sequence line
                if ' ' in line:
                    invalid_sequence = True
                    sequence_id = None
                    sequence_lines = []
                    continue
                
                sequence_lines.append(line)
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines and not invalid_sequence:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


In [27]:
#chatgpt 3, skip if 2 spaces, doesn't work
from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        space_count = 0  # Initialize space counter
        
        for line in file:
            line = line.strip()
            
            if not line:
                # Skip blank lines
                continue
            
            if line.startswith('>'):
                # Save the current sequence if there is one and it is valid
                if sequence_id is not None and sequence_lines and space_count < 2:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                space_count = 0  # Reset space counter
            else:
                # Increment space counter if a space is detected
                space_count += line.count(' ')
                if space_count >= 2:
                    sequence_id = None
                    sequence_lines = []
                    continue
                
                sequence_lines.append(line)
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines and space_count < 2:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))



In [26]:
#chatgpt 2, skip if 2 linebreaks detected
from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        line_break_count = 0  # Initialize line break counter
        
        for line in file:
            line = line.strip()
            
            if not line:
                # Skip blank lines
                continue
            
            if line.startswith('>'):
                # Save the current sequence if there is one and it is valid
                if sequence_id is not None and sequence_lines and line_break_count < 1:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                line_break_count = 0  # Reset line break counter
            else:
                # Increment line break counter if a line break is detected
                if '\n' in line:
                    line_break_count += 1
                    if line_break_count >= 1:
                        sequence_id = None
                        sequence_lines = []
                        continue
                
                sequence_lines.append(line)
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines and line_break_count < 2:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


>SEQ1_CBLib1-R4-receptor-02
QVQLVESGGGLVQAGGSLRLSCTPPGRSFNGYALGWFRQAPGRERELVAAINWSEGHTYYSDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSSGGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAA GTA CAG TTG GTA GAA TCA GGG GGC GGG TTG GTG CAG GCT GGG GGG TCC CTC CGA TTG AGT TGC ACA CCA CCC GGT AGA TCC TTC AAC GGA TAC GCC CTG GGG TGG TTC CGC CAA GCA CCA GGC AGA GAA CGG GAA CTG GTA GCA GCG ATT AAT TGG TCC GAG GGA CAT ACT TAC TAC AGT GAC AGC GCG AAG GGA CGA TTC ACT ATC TCC AAA GAT AAC GCC ATG AAC ACT GTC TAT TTG CAG ATG AAT AGT TTG AAG CCG GAA GAC ACC GCT GTC TAT TAT TGC GCG GCT AGG GCG ATG TTC TCC TGG GAC CCA GAG CGG TAC GGA TAC TGG GGA CAA GGA ACT CAA GTC ACG GTA AGC AGTGAACCCAAGAGCAGCGACAAGACCCACACCTGTCCTCC
684
>SEQ2_CBLib1-R4-receptor-04
QVQLVESGGGLVQAGGSLRLSCAPSGRTFNNYALGWIRQAPGKEREFVAAINWSEGHTYYADSAKGRFTISRDNAKNTMYLQMNSLKPEDTGVYYCAARAMFSWDPERYGYWGQGTQVTVSSGGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAG GTG CAG TTG GTT GAA TCC GGG GGT GGA TTG GTT CAG GCG GGG GGT AGT CTT CGA CTT TCT TGT GCG CCA 

In [16]:
from Bio import SeqIO

input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"

output_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output.txt"

fasta_sequences = SeqIO.parse(open(input_file),'fasta')

with open(output_file) as out_file:
    for fasta in fasta_sequences:
        name, sequence = fasta.id, str(fasta.seq)
        #new_sequence = some_function(sequence)
        #write_fasta(out_file)

UnicodeEncodeError: 'ascii' codec can't encode characters in position 563-565: ordinal not in range(128)

In [21]:
#ChatGPT

from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    sequences = []
    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []
        valid_sequence = True  # Initialize valid_sequence at the beginning
        
        for line in file:
            line = line.strip()
            
            if not line:
                # Skip blank lines
                continue
            
            if line.startswith('>'):
                # Save the current sequence if there is one and it is valid
                if sequence_id is not None and sequence_lines and valid_sequence:
                    sequence = ''.join(sequence_lines)
                    sequences.append((sequence_id, sequence))
                
                # Start a new sequence
                sequence_id = line[1:]
                sequence_lines = []
                valid_sequence = True  # Reset valid sequence flag
            else:
                # If a line break is detected, skip this sequence
                if '\n' in line:
                    sequence_id = None
                    sequence_lines = []
                    valid_sequence = False
                    continue
                
                if valid_sequence:
                    sequence_lines.append(line)
        
        # Save the last sequence if it is valid
        if sequence_id is not None and sequence_lines and valid_sequence:
            sequence = ''.join(sequence_lines)
            sequences.append((sequence_id, sequence))
    
    return sequences

# Example usage
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


>SEQ1_CBLib1-R4-receptor-02
QVQLVESGGGLVQAGGSLRLSCTPPGRSFNGYALGWFRQAPGRERELVAAINWSEGHTYYSDSAKGRFTISKDNAMNTVYLQMNSLKPEDTAVYYCAARAMFSWDPERYGYWGQGTQVTVSSGGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAA GTA CAG TTG GTA GAA TCA GGG GGC GGG TTG GTG CAG GCT GGG GGG TCC CTC CGA TTG AGT TGC ACA CCA CCC GGT AGA TCC TTC AAC GGA TAC GCC CTG GGG TGG TTC CGC CAA GCA CCA GGC AGA GAA CGG GAA CTG GTA GCA GCG ATT AAT TGG TCC GAG GGA CAT ACT TAC TAC AGT GAC AGC GCG AAG GGA CGA TTC ACT ATC TCC AAA GAT AAC GCC ATG AAC ACT GTC TAT TTG CAG ATG AAT AGT TTG AAG CCG GAA GAC ACC GCT GTC TAT TAT TGC GCG GCT AGG GCG ATG TTC TCC TGG GAC CCA GAG CGG TAC GGA TAC TGG GGA CAA GGA ACT CAA GTC ACG GTA AGC AGTGAACCCAAGAGCAGCGACAAGACCCACACCTGTCCTCC
684
>SEQ2_CBLib1-R4-receptor-04
QVQLVESGGGLVQAGGSLRLSCAPSGRTFNNYALGWIRQAPGKEREFVAAINWSEGHTYYADSAKGRFTISRDNAKNTMYLQMNSLKPEDTGVYYCAARAMFSWDPERYGYWGQGTQVTVSSGGGACTGCTGCTGCTGTGGCTGACAGATGCCAGATGCCAG GTG CAG TTG GTT GAA TCC GGG GGT GGA TTG GTT CAG GCG GGG GGT AGT CTT CGA CTT TCT TGT GCG CCA 

In [17]:
>>> from Bio import SeqIO
>>> for seq_record in SeqIO.parse(r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt", "fasta"):
...     print(seq_record.id)
...     print(repr(seq_record.seq))
...     print(len(seq_record))

SEQ1_CBLib1-R4-receptor-02
Seq('QVQLVESGGGLVQAGGSLRLSCTPPGRSFNGYALGWFRQAPGRERELVAAINWS...TCC')
563
SEQ2_CBLib1-R4-receptor-04
Seq('QVQLVESGGGLVQAGGSLRLSCAPSGRTFNNYALGWIRQAPGKEREFVAAINWS...TCC')
563
SEQ3_CBLib1-R4-receptor-30
Seq('QVQLVESGGGLVQAGGSLRLSCVTSGRTFGRYAMGWFRQAPGKEREFVAAINWS...TCC')
563
SEQ4_CBLib1-R4-receptor-33
Seq('QVQLQESGGGLVQAGGSLRLSCTPPGRTFNGYALGWFRQAPGRERELVAAINWS...TCC')
563
SEQ5_CBLib1-R4-receptor-38
Seq('QVQLVESGGGLVQAGGSLRLSCTPPGRTFNGYALGWFRQAPGRERELVAAINWS...TCC')
563
SEQ6_CBLib1-R4-receptor-40
Seq('QVQLVESGGGLVRAGGSLRLSCAPSGRTFNSYALGWFRQAPGKEREFVAAINWS...TCC')
563
SEQ7_CBLib3-4-R2-Ab-42
Seq('QVQLQESGGGLVQAGGSLRLSCAASGRSFNGYALGWFRQAPGRERELVAAINWS...TCC')
563
SEQ8_CBLib1-R4-Ab-05
Seq('QVQLVDSGGGLVQAGGSLRLSCTPPARTFNGYALGWFRQAPGRERELVAAINWS...TCC')
563
SEQ9_CBLib1-R4-Ab-24
Seq('QVQLVESGGGLVQAGGSLRLSCAASGRTFSSYAMGWFRQAPGRERELVAAINWS...TCC')
563
SEQ10_CBLib1-R4-Ab-25
Seq('QVQLVESGGGLVQAGGSLRLSCIPPGRTFNGYALGWFRQAPGREREFVAAINWS...TCC')
563
SEQ11_CBLib1-R4-Ab-31
Seq('QVQL

UnicodeEncodeError: 'ascii' codec can't encode characters in position 563-565: ordinal not in range(128)