In [None]:
#pip install biopython  # Biopython is published on PyPI as "biopython"; it provides the "Bio" module

In [5]:
# v 1.3
# Updated to name the CSV file after the sequence name if only one sequence is used, or after the parent directory if multiple sequences are in a single FASTA file or multiple FASTA files are found. Added date and time to the CSV name.
# Added a "Press the <ENTER> key" pause when no FASTA file is found.


import os
import time
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residues appended as the C-terminus by parse_fasta_and_export_with_protparam
# when its c_terminus argument is exactly "Fc".
# NOTE(review): presumably an antibody Fc-region sequence - confirm its source/variant.
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return the charge ProteinAnalysis reports for ``sequence`` at pH 7.0."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in ``sequence`` that are D, E, H, K or R.

    Residue letters are matched case-insensitively, so lowercase FASTA input is
    counted like uppercase input. An empty sequence yields 0.0 instead of
    raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in ``sequence`` that are A, I, L, M, F, W, Y or V.

    Residue letters are matched case-insensitively, so lowercase FASTA input is
    counted like uppercase input. An empty sequence yields 0.0 instead of
    raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return element 1 of ProteinAnalysis.molar_extinction_coefficient() for ``sequence``.

    This module treats that element as the "oxidized" extinction coefficient.
    """
    coefficients = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return coefficients[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient of ``sequence`` divided by its molecular weight."""
    analysis = ProteinAnalysis(sequence)
    # Index 1 is the value this module reports as the "oxidized" coefficient.
    epsilon_oxidized = analysis.molar_extinction_coefficient()[1]
    return epsilon_oxidized / analysis.molecular_weight()

def parse_fasta_and_export_with_protparam(input_files, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every record of the given FASTA files and export the results to a CSV.

    For each record the analysed sequence is
    ``n_terminus + sequence * multimer_count + C-terminus``; all reported values
    are computed on that string. The CSV is written next to the first input
    file, its path is printed, and the function then waits for <ENTER>.

    Args:
        input_files: Non-empty list of FASTA file paths.
        c_terminus: Residues appended to every sequence. The exact string "Fc"
            selects the module-level ``Fc_sequence`` and adds an "Fc added" comment.
        n_terminus: Residues prepended to every sequence.
        multimer_count: How many times each sequence is repeated.

    The CSV is named after the sequence header when a single file holding a
    single header was given, otherwise after the directory containing the first
    file. A timestamp is part of the name in both cases.
    """
    date_time = time.strftime("%Y%m%d_%H%M%S")

    if c_terminus == "Fc":
        c_terminus_seq = Fc_sequence
        comment = "Fc added"
    else:
        c_terminus_seq = c_terminus
        comment = ""
    c_term_display = c_terminus_seq if c_terminus_seq else "No C-terminus added"
    n_term_display = n_terminus if n_terminus else "No N-terminus added"

    rows = []

    def add_record(sequence_id, sequence_lines):
        # Single place that turns a parsed record into a CSV row; used both when
        # the next header is reached and at end of file.
        if sequence_id is None or not sequence_lines:
            return
        sequence = ''.join(sequence_lines)
        sequence_with_modifications = n_terminus + (sequence * multimer_count) + c_terminus_seq
        analysis = ProteinAnalysis(sequence_with_modifications)
        rows.append([
            sequence_id,
            calculate_percent_charged(sequence_with_modifications),
            calculate_percent_hydrophobic(sequence_with_modifications),
            calculate_charge_at_pH7(sequence_with_modifications),
            analysis.isoelectric_point(),
            calculate_molar_extinction_coefficient_oxidized(sequence_with_modifications),
            analysis.molecular_weight(),
            calculate_abs_0_1_percent_oxidized(sequence_with_modifications),
            sequence,
            c_term_display,
            n_term_display,
            multimer_count,
            sequence_with_modifications,
            comment,
        ])

    single_sequence_name = None
    for input_file in input_files:
        sequence_id = None
        sequence_lines = []
        sequence_count = 0
        with open(input_file, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Blank lines carry no residues; skipping them keeps a header
                    # followed only by blank lines from becoming an empty record.
                    continue
                if line.startswith('>'):
                    add_record(sequence_id, sequence_lines)
                    sequence_count += 1
                    sequence_id = line[1:]
                    sequence_lines = []
                else:
                    sequence_lines.append(line)
        add_record(sequence_id, sequence_lines)

        if sequence_count == 1:
            single_sequence_name = sequence_id

    # abspath so that a bare relative file name still yields a real parent directory name.
    output_dir = os.path.dirname(os.path.abspath(input_files[0]))

    # FASTA headers often contain characters such as "|" or ":" that are not
    # allowed in file names (notably on Windows); replace them before use.
    safe_name = ''
    if single_sequence_name:
        safe_name = ''.join(
            '_' if ch in '<>:"/\\|?*' or ord(ch) < 32 else ch
            for ch in single_sequence_name
        ).strip(' .')

    if len(input_files) == 1 and safe_name:
        # If only one sequence is processed, use the sequence name
        output_csv = os.path.join(output_dir, f'{safe_name}_{date_time}_ProtParam_Results.csv')
    else:
        # If multiple sequences or files, use the parent directory name
        parent_dir_name = os.path.basename(output_dir)
        output_csv = os.path.join(output_dir, f'{parent_dir_name}_{date_time}_ProtParam_Results.csv')

    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus', 'Multimer Count', 'Analyzed Sequence', 'Comments'])
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_files(root_folder):
    """Recursively collect the paths of all FASTA files below ``root_folder``.

    A file counts as FASTA when its name ends in ".fasta", compared
    case-insensitively so that e.g. "X.FASTA" is found too. Directories and
    files are visited in sorted order, which makes the returned list
    reproducible between runs.

    Returns:
        List of paths (each joined onto ``root_folder``); empty when nothing
        matches or the folder cannot be walked.
    """
    fasta_files = []
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # in-place sort steers the order in which os.walk descends
        fasta_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith('.fasta')
        )
    return fasta_files

# Interactive driver: ask for a folder, collect its FASTA files, ask for the
# sequence modifications, then run the analysis and CSV export.
print("The directory must contain .fasta files to be used.")
# Drop surrounding whitespace and the double quotes that a pasted Windows path may carry.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"')
input_files = find_fasta_files(root_folder)

if input_files:
    print(f"Found FASTA files: {input_files}")

    # Re-prompt until the answer is blank (meaning 1) or a positive whole number,
    # instead of crashing with ValueError on a typo or analysing zero copies.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of 1 or more.")

    # Stray spaces are not residues, so strip them from the typed termini.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_files, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA files found in the specified root folder.")
    input("Press the <ENTER> key to continue...")
    



The directory must contain .fasta files to be used.
Please enter the path to the root folder: C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\WuXi AttoFc Sequence FASTA Analysis - Copy
Found FASTA files: ['C:\\Users\\JoelTencer\\OneDrive - Attovia\\ATTO-005\\WuXi AttoFc Sequence FASTA Analysis - Copy\\WBP71457 1-17 Sequence Confirmation.fasta', 'C:\\Users\\JoelTencer\\OneDrive - Attovia\\ATTO-005\\WuXi AttoFc Sequence FASTA Analysis - Copy\\WBP71634 1-60 Sequence Confirmation.fasta']
Enter the multimer count (Example: dimer = 2): 2
Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): 
Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): Fc
Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\WuXi AttoFc Sequence FASTA Analysis - Copy\WuXi AttoFc Sequence FASTA Analysis - Copy_20240820_142823_ProtParam_Results.csv'
Press the <ENTER> key to continue...


In [3]:
# v 1.1
# Updated csv naming
# If one fasta file, name csv after that sequence and add date
# If multiple sequences add parent directory name and date
# Updated to name the CSV file after the sequence name if only one sequence is used, or after the parent directory if multiple sequences. Added date and time to the CSV name.

import os
import time
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residues appended as the C-terminus by parse_fasta_and_export_with_protparam
# when its c_terminus argument is exactly "Fc".
# NOTE(review): presumably an antibody Fc-region sequence - confirm its source/variant.
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Compute the charge of ``sequence`` at pH 7.0 via ProteinAnalysis.charge_at_pH."""
    protein = ProteinAnalysis(sequence)
    net_charge = protein.charge_at_pH(7.0)
    return net_charge

def calculate_percent_charged(sequence):
    """Return the share of D, E, H, K and R residues in ``sequence`` as a percentage.

    Matching is case-insensitive. An empty sequence returns 0.0 rather than
    raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if not total_aa:
        return 0.0
    charged_aa = sum(1 for aa in sequence.upper() if aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the share of A, I, L, M, F, W, Y and V residues in ``sequence`` as a percentage.

    Matching is case-insensitive. An empty sequence returns 0.0 rather than
    raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if not total_aa:
        return 0.0
    hydrophobic_aa = sum(1 for aa in sequence.upper() if aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the value at index 1 of ProteinAnalysis.molar_extinction_coefficient().

    The rest of this module reports that value as the "oxidized" coefficient.
    """
    return ProteinAnalysis(sequence).molar_extinction_coefficient()[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return oxidized extinction coefficient / molecular weight for ``sequence``."""
    epsilon_oxidized = calculate_molar_extinction_coefficient_oxidized(sequence)
    mass = ProteinAnalysis(sequence).molecular_weight()
    ratio = epsilon_oxidized / mass
    return ratio

def parse_fasta_and_export_with_protparam(input_files, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every record of the given FASTA files and export the results to a CSV.

    Each record is analysed as
    ``n_terminus + sequence * multimer_count + C-terminus``. The CSV is written
    next to the first input file, its path is printed, and the function then
    waits for <ENTER>.

    Args:
        input_files: Non-empty list of FASTA file paths.
        c_terminus: Residues appended to every sequence. The exact string "Fc"
            selects the module-level ``Fc_sequence`` and adds an "Fc added" comment.
        n_terminus: Residues prepended to every sequence.
        multimer_count: How many times each sequence is repeated.

    With a single input file the CSV is named after the first parsed sequence,
    otherwise after the directory containing the first file. When a single
    file yields no record at all, the directory name is used as well.
    """
    date_time = time.strftime("%Y%m%d_%H%M%S")

    if c_terminus == "Fc":
        c_terminus_seq = Fc_sequence
        comment = "Fc added"
    else:
        c_terminus_seq = c_terminus
        comment = ""
    c_term_display = c_terminus_seq if c_terminus_seq else "No C-terminus added"
    n_term_display = n_terminus if n_terminus else "No N-terminus added"

    rows = []

    def add_record(sequence_id, sequence_lines):
        # Shared by the "next header" and "end of file" cases.
        if sequence_id is None or not sequence_lines:
            return
        sequence = ''.join(sequence_lines)
        sequence_with_modifications = n_terminus + (sequence * multimer_count) + c_terminus_seq
        analysis = ProteinAnalysis(sequence_with_modifications)
        rows.append([
            sequence_id,
            calculate_percent_charged(sequence_with_modifications),
            calculate_percent_hydrophobic(sequence_with_modifications),
            calculate_charge_at_pH7(sequence_with_modifications),
            analysis.isoelectric_point(),
            calculate_molar_extinction_coefficient_oxidized(sequence_with_modifications),
            analysis.molecular_weight(),
            calculate_abs_0_1_percent_oxidized(sequence_with_modifications),
            sequence,
            c_term_display,
            n_term_display,
            multimer_count,
            sequence_with_modifications,
            comment,
        ])

    for input_file in input_files:
        sequence_id = None
        sequence_lines = []
        with open(input_file, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Blank lines carry no residues.
                    continue
                if line.startswith('>'):
                    add_record(sequence_id, sequence_lines)
                    sequence_id = line[1:]
                    sequence_lines = []
                else:
                    sequence_lines.append(line)
        add_record(sequence_id, sequence_lines)

    # abspath so that a bare relative file name still yields a real parent directory name.
    output_dir = os.path.dirname(os.path.abspath(input_files[0]))

    # The first sequence name goes into the file name; replace characters that
    # file systems (notably Windows) reject, e.g. the "|" common in FASTA headers.
    safe_name = ''
    if len(input_files) == 1 and rows:
        safe_name = ''.join(
            '_' if ch in '<>:"/\\|?*' or ord(ch) < 32 else ch
            for ch in rows[0][0]
        ).strip(' .')

    if safe_name:
        # If only one file, use the sequence name
        output_csv = os.path.join(output_dir, f'{safe_name}_ProtParam_Results_{date_time}.csv')
    else:
        # If multiple files (or nothing usable was parsed), use the parent directory name
        parent_dir_name = os.path.basename(output_dir)
        output_csv = os.path.join(output_dir, f'{parent_dir_name}_ProtParam_Results_{date_time}.csv')

    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus', 'Multimer Count', 'Analyzed Sequence', 'Comments'])
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_files(root_folder):
    """Return the paths of all ".fasta" files found recursively under ``root_folder``.

    The extension is compared case-insensitively (".FASTA" matches), and
    directories and files are visited in sorted order so the result is the
    same on every run. An empty list is returned when nothing matches.
    """
    fasta_files = []
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # os.walk descends in the order left in this list
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                fasta_files.append(os.path.join(root, file))
    return fasta_files

# Interactive driver: ask for a folder, collect its FASTA files, ask for the
# sequence modifications, then run the analysis and CSV export.
print("The directory must contain .fasta files to be used.")
# Drop surrounding whitespace and the double quotes that a pasted Windows path may carry.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"')
input_files = find_fasta_files(root_folder)

if input_files:
    print(f"Found FASTA files: {input_files}")

    # Re-prompt until the answer is blank (meaning 1) or a positive whole number,
    # instead of crashing with ValueError on a typo or analysing zero copies.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of 1 or more.")

    # Stray spaces are not residues, so strip them from the typed termini.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_files, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA files found in the specified root folder.")


The directory must contain .fasta files to be used.
Please enter the path to the root folder: C:\Users\JoelTencer\OneDrive - Attovia\ATTO-007\All Reference Antibody Sequences
Found FASTA files: ['C:\\Users\\JoelTencer\\OneDrive - Attovia\\ATTO-007\\All Reference Antibody Sequences\\All Reference Antibodies.fasta']
Enter the multimer count (Example: dimer = 2): 2
Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): 
Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): Fc
Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-007\All Reference Antibody Sequences\Vonlerolizumab_ProtParam_Results_20240816_120421.csv'
Press the <ENTER> key to continue...


In [None]:
# v 1.0
# Added ability to analyze one fasta file path and loop through the directory for multiple fasta files
# If Fc is added by user input, add comment that this has been added

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residues appended as the C-terminus by parse_fasta_and_export_with_protparam
# when its c_terminus argument is exactly "Fc".
# NOTE(review): presumably an antibody Fc-region sequence - confirm its source/variant.
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return ProteinAnalysis(sequence).charge_at_pH(7.0)."""
    neutral_ph = 7.0
    return ProteinAnalysis(sequence).charge_at_pH(neutral_ph)

def calculate_percent_charged(sequence):
    """Return the percentage of D/E/H/K/R residues in ``sequence``.

    Letters are compared case-insensitively. For an empty sequence the result
    is 0.0 (the original division would raise ZeroDivisionError).
    """
    if len(sequence) == 0:
        return 0.0
    upper_sequence = sequence.upper()
    charged_aa = sum(upper_sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of A/I/L/M/F/W/Y/V residues in ``sequence``.

    Letters are compared case-insensitively. For an empty sequence the result
    is 0.0 (the original division would raise ZeroDivisionError).
    """
    if len(sequence) == 0:
        return 0.0
    upper_sequence = sequence.upper()
    hydrophobic_aa = sum(upper_sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return item 1 of the molar extinction coefficients ProteinAnalysis computes.

    The CSV export labels this value as the "Oxidized" coefficient.
    """
    protein = ProteinAnalysis(sequence)
    return protein.molar_extinction_coefficient()[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Divide the oxidized extinction coefficient of ``sequence`` by its molecular weight."""
    protein = ProteinAnalysis(sequence)
    # Same value calculate_molar_extinction_coefficient_oxidized would return.
    epsilon_oxidized = protein.molar_extinction_coefficient()[1]
    return epsilon_oxidized / protein.molecular_weight()

def parse_fasta_and_export_with_protparam(input_files, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every record of the given FASTA files and write ProtParam_Results.csv.

    Each record is analysed as
    ``n_terminus + sequence * multimer_count + C-terminus``. The CSV is written
    into the directory of the first input file, replacing any existing file of
    that name. Its path is printed and the function then waits for <ENTER>.

    Args:
        input_files: Non-empty list of FASTA file paths.
        c_terminus: Residues appended to every sequence. The exact string "Fc"
            selects the module-level ``Fc_sequence`` and adds an "Fc added" comment.
        n_terminus: Residues prepended to every sequence.
        multimer_count: How many times each sequence is repeated.
    """
    if c_terminus == "Fc":
        c_terminus_seq = Fc_sequence
        comment = "Fc added"
    else:
        c_terminus_seq = c_terminus
        comment = ""
    c_term_display = c_terminus_seq if c_terminus_seq else "No C-terminus added"
    n_term_display = n_terminus if n_terminus else "No N-terminus added"

    rows = []

    def add_record(sequence_id, sequence_lines):
        # Shared by the "next header" and "end of file" cases.
        if sequence_id is None or not sequence_lines:
            return
        sequence = ''.join(sequence_lines)
        sequence_with_modifications = n_terminus + (sequence * multimer_count) + c_terminus_seq
        analysis = ProteinAnalysis(sequence_with_modifications)
        rows.append([
            sequence_id,
            calculate_percent_charged(sequence_with_modifications),
            calculate_percent_hydrophobic(sequence_with_modifications),
            calculate_charge_at_pH7(sequence_with_modifications),
            analysis.isoelectric_point(),
            calculate_molar_extinction_coefficient_oxidized(sequence_with_modifications),
            analysis.molecular_weight(),
            calculate_abs_0_1_percent_oxidized(sequence_with_modifications),
            sequence,
            c_term_display,
            n_term_display,
            multimer_count,
            sequence_with_modifications,
            comment,
        ])

    for input_file in input_files:
        sequence_id = None
        sequence_lines = []
        with open(input_file, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Blank lines carry no residues.
                    continue
                if line.startswith('>'):
                    add_record(sequence_id, sequence_lines)
                    sequence_id = line[1:]
                    sequence_lines = []
                else:
                    sequence_lines.append(line)
        add_record(sequence_id, sequence_lines)

    output_dir = os.path.dirname(input_files[0])
    output_csv = os.path.join(output_dir, 'ProtParam_Results.csv')

    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus', 'Multimer Count', 'Analyzed Sequence', 'Comments'])
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_files(root_folder):
    """Walk ``root_folder`` recursively and return every path whose name ends in ".fasta".

    The suffix check ignores case, and both directories and file names are
    processed in sorted order so repeated runs return the same list. Returns
    an empty list when no file matches.
    """
    fasta_files = []
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # fixes the descent order of os.walk
        matches = [name for name in sorted(files) if name.lower().endswith('.fasta')]
        fasta_files += [os.path.join(root, name) for name in matches]
    return fasta_files

# Interactive driver: ask for a folder, collect its FASTA files, ask for the
# sequence modifications, then run the analysis and CSV export.
print("The directory must contain .fasta files to be used.")
# Drop surrounding whitespace and the double quotes that a pasted Windows path may carry.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"')
input_files = find_fasta_files(root_folder)

if input_files:
    print(f"Found FASTA files: {input_files}")

    # Re-prompt until the answer is blank (meaning 1) or a positive whole number,
    # instead of crashing with ValueError on a typo or analysing zero copies.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of 1 or more.")

    # Stray spaces are not residues, so strip them from the typed termini.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_files, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA files found in the specified root folder.")


In [None]:
# v 0.9
# Added: the CSV file name now contains "Fc_Added" when the Fc sequence is appended.

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residues appended as the C-terminus by parse_fasta_and_export_with_protparam
# when its c_terminus argument is exactly "Fc".
# NOTE(review): presumably an antibody Fc-region sequence - confirm its source/variant.
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return the charge at pH 7.0 that ProteinAnalysis computes for ``sequence``."""
    charge = ProteinAnalysis(sequence).charge_at_pH(7.0)
    return charge

def calculate_percent_charged(sequence):
    """Return what percentage of ``sequence`` consists of D, E, H, K or R.

    Case is ignored when counting. An empty sequence gives 0.0 instead of a
    ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    normalized = sequence.upper()
    charged_aa = sum(normalized.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return what percentage of ``sequence`` consists of A, I, L, M, F, W, Y or V.

    Case is ignored when counting. An empty sequence gives 0.0 instead of a
    ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    normalized = sequence.upper()
    hydrophobic_aa = sum(normalized.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return index 1 of ProteinAnalysis.molar_extinction_coefficient() for ``sequence``.

    This is the value the export writes to its "Oxidized" extinction column.
    """
    both_coefficients = ProteinAnalysis(sequence).molar_extinction_coefficient()
    oxidized = both_coefficients[1]
    return oxidized

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient per unit molecular weight of ``sequence``."""
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )

def parse_fasta_and_export_with_protparam(input_file, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every record of one FASTA file and export the results to a CSV.

    Each record is analysed as
    ``n_terminus + sequence * multimer_count + C-terminus``. The CSV is written
    next to ``input_file`` and named after it, with "_Fc_Added" inserted when
    the Fc sequence was appended. The path is printed and the function then
    waits for <ENTER>.

    Args:
        input_file: Path of the FASTA file to read.
        c_terminus: Residues appended to every sequence. The exact string "Fc"
            selects the module-level ``Fc_sequence``.
        n_terminus: Residues prepended to every sequence.
        multimer_count: How many times each sequence is repeated.
    """
    if c_terminus == "Fc":
        c_terminus_seq = Fc_sequence
    else:
        c_terminus_seq = c_terminus
    c_term_display = c_terminus_seq if c_terminus_seq else "No C-terminus added"
    n_term_display = n_terminus if n_terminus else "No N-terminus added"

    rows = []

    def add_record(sequence_id, sequence_lines):
        # Shared by the "next header" and "end of file" cases.
        if sequence_id is None or not sequence_lines:
            return
        sequence = ''.join(sequence_lines)
        sequence_with_modifications = n_terminus + (sequence * multimer_count) + c_terminus_seq
        analysis = ProteinAnalysis(sequence_with_modifications)
        rows.append([
            sequence_id,
            calculate_percent_charged(sequence_with_modifications),
            calculate_percent_hydrophobic(sequence_with_modifications),
            calculate_charge_at_pH7(sequence_with_modifications),
            analysis.isoelectric_point(),
            calculate_molar_extinction_coefficient_oxidized(sequence_with_modifications),
            analysis.molecular_weight(),
            calculate_abs_0_1_percent_oxidized(sequence_with_modifications),
            sequence,
            c_term_display,
            n_term_display,
            multimer_count,
            sequence_with_modifications,
        ])

    sequence_id = None
    sequence_lines = []
    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no residues.
                continue
            if line.startswith('>'):
                add_record(sequence_id, sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)
    add_record(sequence_id, sequence_lines)

    output_dir = os.path.dirname(input_file)
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    # The original if/else here was not valid Python (body not indented,
    # "else" without a colon); this is the intended branching.
    if c_terminus == "Fc":
        output_csv = os.path.join(output_dir, f'{base_name}_Fc_Added_ProtParam_Results.csv')
    else:
        output_csv = os.path.join(output_dir, f'{base_name}_ProtParam_Results.csv')

    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus', 'Multimer Count', 'Analyzed Sequence'])
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Return the path of the first ".fasta" file found under ``root_folder``, or None.

    The suffix check ignores case (".FASTA" matches). Directories and file
    names are visited in sorted order, so which file counts as "first" is the
    same on every run. Files in a directory are checked before its subdirectories.
    """
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # fixes the descent order of os.walk
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                return os.path.join(root, file)
    return None

# Interactive driver: ask for a folder, pick a FASTA file from it, ask for the
# sequence modifications, then run the analysis and CSV export.
print("The directory must contain a .fasta file to be used.")
# Drop surrounding whitespace and the double quotes that a pasted Windows path may carry.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"')
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")

    # Re-prompt until the answer is blank (meaning 1) or a positive whole number,
    # instead of crashing with ValueError on a typo or analysing zero copies.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of 1 or more.")

    # Stray spaces are not residues, so strip them from the typed termini.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_file, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA file found in the specified root folder.")


In [None]:
# v 0.8
# Adjusted to place N-terminus at the beginning and C-terminus at the end
# Added the option to enter "Fc" for the C-terminus

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residue string used as the C-terminal extension when the user enters "Fc"
# (parse_fasta_and_export_with_protparam substitutes it for c_terminus == "Fc").
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return the charge of *sequence* at pH 7.0 as reported by ProteinAnalysis."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the second value of ProteinAnalysis.molar_extinction_coefficient().

    Biopython returns a pair; the second entry is the one used throughout this
    script as the "oxidized" coefficient.
    """
    _reduced, oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return oxidized

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient divided by the molecular weight.

    This is the value reported in the CSV column "Abs 0.1% (1g/L) Oxidized".
    """
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )

def parse_fasta_and_export_with_protparam(input_file, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every record of a FASTA file and write the results to a CSV file.

    For each record the sequence is repeated ``multimer_count`` times, then
    ``n_terminus`` is placed in front and the C-terminal extension behind, and
    the resulting construct is passed to the ProtParam helper functions.

    Args:
        input_file: Path of the FASTA file to read.
        c_terminus: Residues appended to the construct; the literal string
            "Fc" selects the module-level ``Fc_sequence``.
        n_terminus: Residues placed in front of the construct.
        multimer_count: Number of times the record's sequence is repeated.

    The CSV is written next to ``input_file`` as
    ``<base name>_ProtParam_Results.csv``. Afterwards the output path is
    printed and the function waits for the user to press ENTER.
    """
    rows = []

    def _analyse(record_name, chunks):
        # Turn one buffered FASTA record into a finished CSV row.
        raw = ''.join(chunks)
        tail = Fc_sequence if c_terminus == "Fc" else c_terminus
        construct = n_terminus + (raw * multimer_count) + tail

        net_charge = calculate_charge_at_pH7(construct)
        charged_pct = calculate_percent_charged(construct)
        hydrophobic_pct = calculate_percent_hydrophobic(construct)
        epsilon = calculate_molar_extinction_coefficient_oxidized(construct)
        abs_01 = calculate_abs_0_1_percent_oxidized(construct)
        iso_point = ProteinAnalysis(construct).isoelectric_point()
        mass = ProteinAnalysis(construct).molecular_weight()

        rows.append([
            record_name, charged_pct, hydrophobic_pct, net_charge, iso_point,
            epsilon, mass, abs_01, raw,
            tail or "No C-terminus added",
            n_terminus or "No N-terminus added",
            multimer_count, construct,
        ])

    with open(input_file, 'r') as handle:
        current_id = None
        buffered = []

        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                buffered.append(text)
                continue
            # A new header closes the record collected so far.
            if current_id is not None and buffered:
                _analyse(current_id, buffered)
            current_id = text[1:]
            buffered = []

        # The last record has no following header to close it.
        if current_id is not None and buffered:
            _analyse(current_id, buffered)

    folder = os.path.dirname(input_file)
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    output_csv = os.path.join(folder, f'{stem}_ProtParam_Results.csv')

    column_names = [
        'Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids',
        'Charge at pH 7', 'Isoelectric Point',
        'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons',
        'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus',
        'Multimer Count', 'Analyzed Sequence',
    ]
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Locate the first ``.fasta`` file below *root_folder*.

    Walks the tree with ``os.walk`` and returns the joined path of the first
    file name ending in ``.fasta`` (case-sensitive), or ``None`` if the walk
    finishes without a match.
    """
    for current_dir, _subdirs, names in os.walk(root_folder):
        hits = [name for name in names if name.endswith('.fasta')]
        if hits:
            return os.path.join(current_dir, hits[0])
    return None

# Interactive entry point: locate a FASTA file, collect the construct options
# from the user and run the export.
print("The directory must contain a .fasta file to be used.")
# Stray whitespace around a typed or pasted path would make the search find nothing.
root_folder = input("Please enter the path to the root folder: ").strip()
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")

    # Keep asking until the count is blank (meaning 1) or a whole number >= 1.
    # int() on anything else would abort with ValueError, and a count below 1
    # would repeat the sequence zero times.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of at least 1.")

    # Surrounding whitespace is not part of a residue string, so drop it.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_file, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA file found in the specified root folder.")


In [3]:
# v 0.7
# Added the option to append an N-terminus and a C-terminus
# Added the option to make a multimer (dimer = 2)

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residue string used as the C-terminal extension when the user enters "Fc"
# (parse_fasta_and_export_with_protparam substitutes it for c_terminus == "Fc").
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
               "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return the charge of *sequence* at pH 7.0 as reported by ProteinAnalysis."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the second value of ProteinAnalysis.molar_extinction_coefficient().

    Biopython returns a pair; the second entry is the one used throughout this
    script as the "oxidized" coefficient.
    """
    _reduced, oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return oxidized

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient divided by the molecular weight.

    This is the value reported in the CSV column "Abs 0.1% (1g/L) Oxidized".
    """
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )

def parse_fasta_and_export_with_protparam(input_file, n_terminus="", c_terminus="", multimer_count=1):
    """Analyse every record of a FASTA file and write the results to a CSV file.

    For each record, ``n_terminus``, the record's sequence and the C-terminal
    extension are joined, and that whole unit is repeated ``multimer_count``
    times before being passed to the ProtParam helper functions.

    Args:
        input_file: Path of the FASTA file to read.
        n_terminus: Residues placed in front of the sequence.
        c_terminus: Residues placed behind the sequence; the literal string
            "Fc" selects the module-level ``Fc_sequence``.
        multimer_count: Number of times the joined unit is repeated.

    The CSV is written next to ``input_file`` as
    ``<base name>_ProtParam_Results.csv``. Afterwards the output path is
    printed and the function waits for the user to press ENTER.
    """
    rows = []

    def _analyse(record_name, chunks):
        # Turn one buffered FASTA record into a finished CSV row.
        raw = ''.join(chunks)
        tail = Fc_sequence if c_terminus == "Fc" else c_terminus
        construct = (n_terminus + raw + tail) * multimer_count

        net_charge = calculate_charge_at_pH7(construct)
        charged_pct = calculate_percent_charged(construct)
        hydrophobic_pct = calculate_percent_hydrophobic(construct)
        epsilon = calculate_molar_extinction_coefficient_oxidized(construct)
        abs_01 = calculate_abs_0_1_percent_oxidized(construct)
        iso_point = ProteinAnalysis(construct).isoelectric_point()
        mass = ProteinAnalysis(construct).molecular_weight()

        rows.append([
            record_name, charged_pct, hydrophobic_pct, net_charge, iso_point,
            epsilon, mass, abs_01, raw,
            n_terminus or "No N-terminus added",
            tail or "No C-terminus added",
            multimer_count, construct,
        ])

    with open(input_file, 'r') as handle:
        current_id = None
        buffered = []

        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                buffered.append(text)
                continue
            # A new header closes the record collected so far.
            if current_id is not None and buffered:
                _analyse(current_id, buffered)
            current_id = text[1:]
            buffered = []

        # The last record has no following header to close it.
        if current_id is not None and buffered:
            _analyse(current_id, buffered)

    folder = os.path.dirname(input_file)
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    output_csv = os.path.join(folder, f'{stem}_ProtParam_Results.csv')

    column_names = [
        'Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids',
        'Charge at pH 7', 'Isoelectric Point',
        'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons',
        'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'N-Terminus', 'C-Terminus',
        'Multimer Count', 'Analyzed Sequence',
    ]
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Return the path of the first ``.fasta`` file found under *root_folder*.

    The directory tree is traversed with ``os.walk`` and the first file whose
    name ends with ``.fasta`` (case-sensitive) is returned. ``None`` is
    returned when no such file exists.
    """
    matches = (
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(root_folder)
        for name in names
        if name.endswith('.fasta')
    )
    return next(matches, None)

# Interactive entry point: locate a FASTA file, collect the construct options
# from the user and run the export.
print("The directory must contain a .fasta file to be used.")
# Stray whitespace around a typed or pasted path would make the search find nothing.
root_folder = input("Please enter the path to the root folder: ").strip()
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")

    # Keep asking until the count is blank (meaning 1) or a whole number >= 1.
    # int() on anything else would abort with ValueError, and a count below 1
    # would repeat the construct zero times.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of at least 1.")

    # Surrounding whitespace is not part of a residue string, so drop it.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_file, n_terminus, c_terminus, multimer_count)
else:
    print("No FASTA file found in the specified root folder.")


The directory must contain a .fasta file to be used.
Please enter the path to the root folder: C:\Users\JoelTencer\OneDrive - Attovia\ATTO-006
Found FASTA file: C:\Users\JoelTencer\OneDrive - Attovia\ATTO-006\Revdofilimab Sequence.fasta
Enter the multimer count (Example: dimer = 2): 2
Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): 
Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): Fc
Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-006\Revdofilimab Sequence_ProtParam_Results.csv'
Press the <ENTER> key to continue...


In [1]:
# v0.9

# v 0.8
# Added the option to append an N-terminus and a C-terminus
# Added the option to make a multimer (dimer = 2)

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residue string used as the C-terminal extension when the user enters "Fc"
# (parse_fasta_and_export_with_protparam substitutes it for c_terminus == "Fc").
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
               "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return the charge of *sequence* at pH 7.0 as reported by ProteinAnalysis."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the second value of ProteinAnalysis.molar_extinction_coefficient().

    Biopython returns a pair; the second entry is the one used throughout this
    script as the "oxidized" coefficient.
    """
    _reduced, oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return oxidized

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient divided by the molecular weight.

    This is the value reported in the CSV column "Abs 0.1% (1g/L) Oxidized".
    """
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )

def parse_fasta_and_export_with_protparam(input_file, n_terminus="", c_terminus="", multimer_count=1):
    """Analyse every record of a FASTA file and write the results to a CSV file.

    For each record, ``n_terminus``, the record's sequence and the C-terminal
    extension are joined, and that whole unit is repeated ``multimer_count``
    times before being passed to the ProtParam helper functions.

    Args:
        input_file: Path of the FASTA file to read.
        n_terminus: Residues placed in front of the sequence.
        c_terminus: Residues placed behind the sequence; the literal string
            "Fc" selects the module-level ``Fc_sequence``.
        multimer_count: Number of times the joined unit is repeated.

    The CSV is written next to ``input_file`` as
    ``<base name>_ProtParam_Results.csv`` with the C-terminus column before
    the N-terminus column. Afterwards the output path is printed and the
    function waits for the user to press ENTER.
    """
    rows = []

    def _analyse(record_name, chunks):
        # Turn one buffered FASTA record into a finished CSV row.
        raw = ''.join(chunks)
        tail = Fc_sequence if c_terminus == "Fc" else c_terminus
        construct = (n_terminus + raw + tail) * multimer_count

        net_charge = calculate_charge_at_pH7(construct)
        charged_pct = calculate_percent_charged(construct)
        hydrophobic_pct = calculate_percent_hydrophobic(construct)
        epsilon = calculate_molar_extinction_coefficient_oxidized(construct)
        abs_01 = calculate_abs_0_1_percent_oxidized(construct)
        iso_point = ProteinAnalysis(construct).isoelectric_point()
        mass = ProteinAnalysis(construct).molecular_weight()

        rows.append([
            record_name, charged_pct, hydrophobic_pct, net_charge, iso_point,
            epsilon, mass, abs_01, raw,
            tail or "No C-terminus added",
            n_terminus or "No N-terminus added",
            multimer_count, construct,
        ])

    with open(input_file, 'r') as handle:
        current_id = None
        buffered = []

        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                buffered.append(text)
                continue
            # A new header closes the record collected so far.
            if current_id is not None and buffered:
                _analyse(current_id, buffered)
            current_id = text[1:]
            buffered = []

        # The last record has no following header to close it.
        if current_id is not None and buffered:
            _analyse(current_id, buffered)

    folder = os.path.dirname(input_file)
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    output_csv = os.path.join(folder, f'{stem}_ProtParam_Results.csv')

    column_names = [
        'Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids',
        'Charge at pH 7', 'Isoelectric Point',
        'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons',
        'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus',
        'Multimer Count', 'Analyzed Sequence',
    ]
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Locate the first ``.fasta`` file below *root_folder*.

    Walks the tree with ``os.walk`` and returns the joined path of the first
    file name ending in ``.fasta`` (case-sensitive), or ``None`` if the walk
    finishes without a match.
    """
    for current_dir, _subdirs, names in os.walk(root_folder):
        hits = [name for name in names if name.endswith('.fasta')]
        if hits:
            return os.path.join(current_dir, hits[0])
    return None

# Interactive entry point: locate a FASTA file, collect the construct options
# from the user and run the export.
print("The directory must contain a .fasta file to be used.")
# Stray whitespace around a typed or pasted path would make the search find nothing.
root_folder = input("Please enter the path to the root folder: ").strip()
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")

    # Keep asking until the count is blank (meaning 1) or a whole number >= 1.
    # int() on anything else would abort with ValueError, and a count below 1
    # would repeat the construct zero times.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of at least 1.")

    # Surrounding whitespace is not part of a residue string, so drop it.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_file, n_terminus, c_terminus, multimer_count)
else:
    print("No FASTA file found in the specified root folder.")




The directory must contain a .fasta file to be used.
Please enter the path to the root folder: C:\Users\JoelTencer\OneDrive - Attovia\ATTO-006
Found FASTA file: C:\Users\JoelTencer\OneDrive - Attovia\ATTO-006\Revdofilimab Sequence.fasta
Enter the multimer count (Example: dimer = 2): 2
Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): 
Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): Fc
Sequences exported to 'C:\Users\JoelTencer\OneDrive - Attovia\ATTO-006\Revdofilimab Sequence_ProtParam_Results.csv'
Press the <ENTER> key to continue...


In [None]:
# v 0.8
# Adjusted to place N-terminus at the beginning and C-terminus at the end
# Added the option to enter "Fc" for the C-terminus

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Residue string used as the C-terminal extension when the user enters "Fc"
# (parse_fasta_and_export_with_protparam substitutes it for c_terminus == "Fc").
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return the charge of *sequence* at pH 7.0 as reported by ProteinAnalysis."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the second value of ProteinAnalysis.molar_extinction_coefficient().

    Biopython returns a pair; the second entry is the one used throughout this
    script as the "oxidized" coefficient.
    """
    _reduced, oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return oxidized

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient divided by the molecular weight.

    This is the value reported in the CSV column "Abs 0.1% (1g/L) Oxidized".
    """
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )

def parse_fasta_and_export_with_protparam(input_file, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every record of a FASTA file and write the results to a CSV file.

    For each record the sequence is repeated ``multimer_count`` times, then
    ``n_terminus`` is placed in front and the C-terminal extension behind, and
    the resulting construct is passed to the ProtParam helper functions.

    Args:
        input_file: Path of the FASTA file to read.
        c_terminus: Residues appended to the construct; the literal string
            "Fc" selects the module-level ``Fc_sequence``.
        n_terminus: Residues placed in front of the construct.
        multimer_count: Number of times the record's sequence is repeated.

    The CSV is written next to ``input_file`` as
    ``<base name>_ProtParam_Results.csv`` with the N-terminus column before
    the C-terminus column. Afterwards the output path is printed and the
    function waits for the user to press ENTER.
    """
    rows = []

    def _analyse(record_name, chunks):
        # Turn one buffered FASTA record into a finished CSV row.
        raw = ''.join(chunks)
        tail = Fc_sequence if c_terminus == "Fc" else c_terminus
        construct = n_terminus + (raw * multimer_count) + tail

        net_charge = calculate_charge_at_pH7(construct)
        charged_pct = calculate_percent_charged(construct)
        hydrophobic_pct = calculate_percent_hydrophobic(construct)
        epsilon = calculate_molar_extinction_coefficient_oxidized(construct)
        abs_01 = calculate_abs_0_1_percent_oxidized(construct)
        iso_point = ProteinAnalysis(construct).isoelectric_point()
        mass = ProteinAnalysis(construct).molecular_weight()

        # Value order must follow the header below: N-terminus first, then
        # C-terminus, so each value lands under its own column name.
        rows.append([
            record_name, charged_pct, hydrophobic_pct, net_charge, iso_point,
            epsilon, mass, abs_01, raw,
            n_terminus or "No N-terminus added",
            tail or "No C-terminus added",
            multimer_count, construct,
        ])

    with open(input_file, 'r') as handle:
        current_id = None
        buffered = []

        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                buffered.append(text)
                continue
            # A new header closes the record collected so far.
            if current_id is not None and buffered:
                _analyse(current_id, buffered)
            current_id = text[1:]
            buffered = []

        # The last record has no following header to close it.
        if current_id is not None and buffered:
            _analyse(current_id, buffered)

    folder = os.path.dirname(input_file)
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    output_csv = os.path.join(folder, f'{stem}_ProtParam_Results.csv')

    column_names = [
        'Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids',
        'Charge at pH 7', 'Isoelectric Point',
        'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons',
        'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'N-Terminus', 'C-Terminus',
        'Multimer Count', 'Analyzed Sequence',
    ]
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Return the path of the first ``.fasta`` file found under *root_folder*.

    The directory tree is traversed with ``os.walk`` and the first file whose
    name ends with ``.fasta`` (case-sensitive) is returned. ``None`` is
    returned when no such file exists.
    """
    matches = (
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(root_folder)
        for name in names
        if name.endswith('.fasta')
    )
    return next(matches, None)

# Interactive entry point: locate a FASTA file, collect the construct options
# from the user and run the export.
print("The directory must contain a .fasta file to be used.")
# Stray whitespace around a typed or pasted path would make the search find nothing.
root_folder = input("Please enter the path to the root folder: ").strip()
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")

    # Keep asking until the count is blank (meaning 1) or a whole number >= 1.
    # int() on anything else would abort with ValueError, and a count below 1
    # would repeat the sequence zero times.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of at least 1.")

    # Surrounding whitespace is not part of a residue string, so drop it.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()

    parse_fasta_and_export_with_protparam(input_file, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA file found in the specified root folder.")


In [None]:
# v 0.7
# Added the option to append an N-terminus and a C-terminus
# Added the option to make a multimer (dimer = 2)

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Return the charge of *sequence* at pH 7.0 as reported by ProteinAnalysis."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Matching is case-sensitive, so only upper-case one-letter codes count.
    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # No residues at all: report 0 % instead of dividing by zero.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the second value of ProteinAnalysis.molar_extinction_coefficient().

    Biopython returns a pair; the second entry is the one used throughout this
    script as the "oxidized" coefficient.
    """
    _reduced, oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return oxidized

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient divided by the molecular weight.

    This is the value reported in the CSV column "Abs 0.1% (1g/L) Oxidized".
    """
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )

def parse_fasta_and_export_with_protparam(input_file, n_terminus="", c_terminus="", multimer_count=1):
    """Analyse every record of a FASTA file and write the results to a CSV file.

    For each record, ``n_terminus``, the record's sequence and ``c_terminus``
    are joined, and that whole unit is repeated ``multimer_count`` times
    before being passed to the ProtParam helper functions.

    Args:
        input_file: Path of the FASTA file to read.
        n_terminus: Residues placed in front of the sequence.
        c_terminus: Residues placed behind the sequence.
        multimer_count: Number of times the joined unit is repeated.

    The CSV is written next to ``input_file`` as
    ``<base name>_ProtParam_Results.csv`` with the C-terminus column before
    the N-terminus column. Afterwards the output path is printed and the
    function waits for the user to press ENTER.
    """
    rows = []

    def _analyse(record_name, chunks):
        # Single code path for every record, including the last one in the
        # file, so all rows share the same construct formula and column order.
        raw = ''.join(chunks)
        construct = (n_terminus + raw + c_terminus) * multimer_count

        net_charge = calculate_charge_at_pH7(construct)
        charged_pct = calculate_percent_charged(construct)
        hydrophobic_pct = calculate_percent_hydrophobic(construct)
        epsilon = calculate_molar_extinction_coefficient_oxidized(construct)
        abs_01 = calculate_abs_0_1_percent_oxidized(construct)
        iso_point = ProteinAnalysis(construct).isoelectric_point()
        mass = ProteinAnalysis(construct).molecular_weight()

        # Value order follows the header below: C-terminus, then N-terminus.
        rows.append([
            record_name, charged_pct, hydrophobic_pct, net_charge, iso_point,
            epsilon, mass, abs_01, raw,
            c_terminus or "No C-terminus added",
            n_terminus or "No N-terminus added",
            multimer_count, construct,
        ])

    with open(input_file, 'r') as handle:
        current_id = None
        buffered = []

        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                buffered.append(text)
                continue
            # A new header closes the record collected so far.
            if current_id is not None and buffered:
                _analyse(current_id, buffered)
            current_id = text[1:]
            buffered = []

        # The last record has no following header to close it.
        if current_id is not None and buffered:
            _analyse(current_id, buffered)

    folder = os.path.dirname(input_file)
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    output_csv = os.path.join(folder, f'{stem}_ProtParam_Results.csv')

    column_names = [
        'Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids',
        'Charge at pH 7', 'Isoelectric Point',
        'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons',
        'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus',
        'Multimer Count', 'Analyzed Sequence',
    ]
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        writer.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Return the path of the first ``.fasta`` file found under *root_folder*.

    The tree is walked top-down with ``os.walk``. The extension check is
    case-insensitive so names such as ``SEQS.FASTA`` are found too, and
    directory and file names are visited in sorted order so the same file is
    selected on every run.

    Returns ``None`` when no matching file is found.
    """
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                return os.path.join(root, file)
    return None

print("The directory must contain a .fasta file to be used.")
# Windows "Copy as path" wraps the path in double quotes; strip those and any
# stray whitespace so the folder can actually be walked.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"\'')
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")
    
    # Pressing Enter without a value keeps the default of a single copy.
    multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
    multimer_count = int(multimer_count_input) if multimer_count_input else 1
    # Each prompt's hint now names the terminus it is actually asking for
    # (the two hints were previously swapped).
    n_terminus = input("Enter the N-Terminus to concatenate at the beginning (Press enter for no N-Terminus): ")
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus): ")
    
    parse_fasta_and_export_with_protparam(input_file, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA file found in the specified root folder.")


In [None]:
# v 0_6
# Removed Cterm appending
# Added input option for dimer, trimer etc. for calculation

import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Return ``ProteinAnalysis(sequence).charge_at_pH(7.0)`` for *sequence*."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of *sequence*.

    ``molar_extinction_coefficient()`` returns a [reduced, oxidized] pair;
    the element at index 1 is returned.
    """
    reduced_and_oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return reduced_and_oxidized[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    mass = ProteinAnalysis(sequence).molecular_weight()
    return calculate_molar_extinction_coefficient_oxidized(sequence) / mass

def _read_fasta_records(input_file):
    """Yield ``(sequence_id, sequence)`` pairs read from the FASTA file *input_file*.

    Blank lines are skipped so they neither use up the line budget nor create
    empty sequences. Lines that appear before the first ``>`` header are
    discarded, and a header with no sequence lines produces no record.

    NOTE(review): only the first two non-blank lines after each header are
    kept, so longer wrapped records are truncated - confirm this matches the
    expected input files.
    """
    sequence_id = None
    sequence_lines = []
    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if sequence_id is not None and sequence_lines:
                    yield sequence_id, ''.join(sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                sequence_lines.append(line)
    # The last record has no following header to close it.
    if sequence_id is not None and sequence_lines:
        yield sequence_id, ''.join(sequence_lines)


def _build_protparam_record(sequence_id, sequence, append_str):
    """Return one CSV row (as a tuple) for *sequence* analysed with *append_str* appended."""
    sequence_with_append = sequence + append_str
    analysis = ProteinAnalysis(sequence_with_append)
    return (
        sequence_id,
        calculate_percent_charged(sequence_with_append),
        calculate_percent_hydrophobic(sequence_with_append),
        calculate_charge_at_pH7(sequence_with_append),
        analysis.isoelectric_point(),
        calculate_molar_extinction_coefficient_oxidized(sequence_with_append),
        analysis.molecular_weight(),
        calculate_abs_0_1_percent_oxidized(sequence_with_append),
        sequence,
        sequence_with_append,
    )


def parse_fasta_and_export_with_protparam(input_file, append_str=""):
    """Analyse every record of a FASTA file and export the results to CSV.

    Each sequence has *append_str* concatenated to its end before the
    parameters are calculated. The CSV is written next to *input_file* as
    ``<input base name>_ProtParam_Results.csv``; the path is printed and the
    function then waits for the user to press Enter.

    Args:
        input_file: Path of the FASTA file to read.
        append_str: Sequence appended to every record before analysis.
    """
    sequences = [
        _build_protparam_record(sequence_id, sequence, append_str)
        for sequence_id, sequence in _read_fasta_records(input_file)
    ]

    # Get the directory and base name of the input file
    output_dir = os.path.dirname(input_file)
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    # Set the output CSV file path
    output_csv = os.path.join(output_dir, f'{base_name}_ProtParam_Results.csv')

    # Write to CSV; each record tuple is already in header column order
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

    # Input pause so user can see result path
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Return the path of the first ``.fasta`` file found under *root_folder*.

    The tree is walked top-down with ``os.walk``. The extension check is
    case-insensitive so names such as ``SEQS.FASTA`` are found too, and
    directory and file names are visited in sorted order so the same file is
    selected on every run.

    Returns ``None`` when no matching file is found.
    """
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                return os.path.join(root, file)
    return None

# Main script
print("The directory must contain a .fasta file to be used.")
# Windows "Copy as path" wraps the path in double quotes; strip those and any
# stray whitespace so the folder can actually be walked.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"\'')
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")
    
    # Ask user for multimer count; pressing Enter without a value means 1
    # instead of crashing on int('').
    multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
    multimer_count = int(multimer_count_input) if multimer_count_input else 1
    
    # Get the sequence to append
    # NOTE(review): only the appended sequence is repeated multimer_count
    # times; the FASTA sequence itself is used once - confirm this is the
    # intended multimer model.
    append_str = input("Enter the sequence to append: ")
    append_str = append_str * multimer_count
    
    parse_fasta_and_export_with_protparam(input_file, append_str)
else:
    print("No FASTA file found in the specified root folder.")


In [None]:
# v0_54
# csv output does not need to be specified, script will save to same location as file input.
# File path 'not' entered by user

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv
import os

# To add Fc at the C-terminus (or another sequence), define append_str with the sequence in quotes.
# The sequence must not contain whitespace: a stray space is counted by len()
# in the percentage calculations and is written into the CSV output.
append_str = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return ``ProteinAnalysis(sequence).charge_at_pH(7.0)`` for *sequence*."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of *sequence*.

    ``molar_extinction_coefficient()`` returns a [reduced, oxidized] pair;
    the element at index 1 is returned.
    """
    reduced_and_oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return reduced_and_oxidized[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    mass = ProteinAnalysis(sequence).molecular_weight()
    return calculate_molar_extinction_coefficient_oxidized(sequence) / mass

def _read_fasta_records(input_file):
    """Yield ``(sequence_id, sequence)`` pairs read from the FASTA file *input_file*.

    Blank lines are skipped so they neither use up the line budget nor create
    empty sequences. Lines that appear before the first ``>`` header are
    discarded, and a header with no sequence lines produces no record.

    NOTE(review): only the first two non-blank lines after each header are
    kept, so longer wrapped records are truncated - confirm this matches the
    expected input files.
    """
    sequence_id = None
    sequence_lines = []
    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if sequence_id is not None and sequence_lines:
                    yield sequence_id, ''.join(sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                sequence_lines.append(line)
    # The last record has no following header to close it.
    if sequence_id is not None and sequence_lines:
        yield sequence_id, ''.join(sequence_lines)


def _build_protparam_record(sequence_id, sequence, append_str):
    """Return one CSV row (as a tuple) for *sequence* analysed with *append_str* appended."""
    sequence_with_append = sequence + append_str
    analysis = ProteinAnalysis(sequence_with_append)
    return (
        sequence_id,
        calculate_percent_charged(sequence_with_append),
        calculate_percent_hydrophobic(sequence_with_append),
        calculate_charge_at_pH7(sequence_with_append),
        analysis.isoelectric_point(),
        calculate_molar_extinction_coefficient_oxidized(sequence_with_append),
        analysis.molecular_weight(),
        calculate_abs_0_1_percent_oxidized(sequence_with_append),
        sequence,
        sequence_with_append,
    )


def parse_fasta_and_export_with_protparam(input_file, append_str=""):
    """Analyse every record of a FASTA file and export the results to CSV.

    Each sequence has *append_str* concatenated to its end before the
    parameters are calculated. The CSV is written next to *input_file* as
    ``<input base name>_ProtParam_Results.csv``; the path is printed and the
    function then waits for the user to press Enter.

    Args:
        input_file: Path of the FASTA file to read.
        append_str: Sequence appended to every record before analysis.
    """
    sequences = [
        _build_protparam_record(sequence_id, sequence, append_str)
        for sequence_id, sequence in _read_fasta_records(input_file)
    ]

    # Get the directory and base name of the input file
    output_dir = os.path.dirname(input_file)
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    # Set the output CSV file path
    output_csv = os.path.join(output_dir, f'{base_name}_ProtParam_Results.csv')

    # Write to CSV; each record tuple is already in header column order
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

    # Input pause so user can see result path
    input("Press the <ENTER> key to continue...")

def find_fasta_file(root_folder):
    """Return the path of the first ``.fasta`` file found under *root_folder*.

    The tree is walked top-down with ``os.walk``. The extension check is
    case-insensitive so names such as ``SEQS.FASTA`` are found too, and
    directory and file names are visited in sorted order so the same file is
    selected on every run.

    Returns ``None`` when no matching file is found.
    """
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                return os.path.join(root, file)
    return None

# Usage
# Windows "Copy as path" wraps the path in double quotes; strip those and any
# stray whitespace so a pasted path does not end in a misleading
# "No FASTA file found" message.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"\'')
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")
    parse_fasta_and_export_with_protparam(input_file, append_str)
else:
    print("No FASTA file found in the specified root folder.")


In [None]:
# v0_53
# Script will ask user for folder path of .fasta file and then look for a .fasta file to analyze
# csv output does not need to be specified, script will save to same location as file input.

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv
import os

# To add Fc at the C-terminus (or another sequence), define append_str with the sequence in quotes.
# The sequence must not contain whitespace: a stray space is counted by len()
# in the percentage calculations and is written into the CSV output.
append_str = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return ``ProteinAnalysis(sequence).charge_at_pH(7.0)`` for *sequence*."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of *sequence*.

    ``molar_extinction_coefficient()`` returns a [reduced, oxidized] pair;
    the element at index 1 is returned.
    """
    reduced_and_oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return reduced_and_oxidized[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    mass = ProteinAnalysis(sequence).molecular_weight()
    return calculate_molar_extinction_coefficient_oxidized(sequence) / mass

def _read_fasta_records(input_file):
    """Yield ``(sequence_id, sequence)`` pairs read from the FASTA file *input_file*.

    Blank lines are skipped so they neither use up the line budget nor create
    empty sequences. Lines that appear before the first ``>`` header are
    discarded, and a header with no sequence lines produces no record.

    NOTE(review): only the first two non-blank lines after each header are
    kept, so longer wrapped records are truncated - confirm this matches the
    expected input files.
    """
    sequence_id = None
    sequence_lines = []
    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if sequence_id is not None and sequence_lines:
                    yield sequence_id, ''.join(sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                sequence_lines.append(line)
    # The last record has no following header to close it.
    if sequence_id is not None and sequence_lines:
        yield sequence_id, ''.join(sequence_lines)


def _build_protparam_record(sequence_id, sequence, append_str):
    """Return one CSV row (as a tuple) for *sequence* analysed with *append_str* appended."""
    sequence_with_append = sequence + append_str
    analysis = ProteinAnalysis(sequence_with_append)
    return (
        sequence_id,
        calculate_percent_charged(sequence_with_append),
        calculate_percent_hydrophobic(sequence_with_append),
        calculate_charge_at_pH7(sequence_with_append),
        analysis.isoelectric_point(),
        calculate_molar_extinction_coefficient_oxidized(sequence_with_append),
        analysis.molecular_weight(),
        calculate_abs_0_1_percent_oxidized(sequence_with_append),
        sequence,
        sequence_with_append,
    )


def parse_fasta_and_export_with_protparam(input_file, append_str=""):
    """Analyse every record of a FASTA file and export the results to CSV.

    Each sequence has *append_str* concatenated to its end before the
    parameters are calculated. The CSV is written as ``ProtParam_Results.csv``
    in the same directory as *input_file*, and its path is printed.

    Args:
        input_file: Path of the FASTA file to read.
        append_str: Sequence appended to every record before analysis.
    """
    sequences = [
        _build_protparam_record(sequence_id, sequence, append_str)
        for sequence_id, sequence in _read_fasta_records(input_file)
    ]

    # Get the directory of the input file
    output_dir = os.path.dirname(input_file)
    # Set the output CSV file path
    output_csv = os.path.join(output_dir,'ProtParam_Results.csv')

    # Write to CSV; each record tuple is already in header column order
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

def find_fasta_file(root_folder):
    """Return the path of the first ``.fasta`` file found under *root_folder*.

    The tree is walked top-down with ``os.walk``. The extension check is
    case-insensitive so names such as ``SEQS.FASTA`` are found too, and
    directory and file names are visited in sorted order so the same file is
    selected on every run.

    Returns ``None`` when no matching file is found.
    """
    for root, dirs, files in os.walk(root_folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                return os.path.join(root, file)
    return None

# Usage
# Windows "Copy as path" wraps the path in double quotes; strip those and any
# stray whitespace so a pasted path does not end in a misleading
# "No FASTA file found" message.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"\'')
input_file = find_fasta_file(root_folder)

if input_file:
    print(f"Found FASTA file: {input_file}")
    parse_fasta_and_export_with_protparam(input_file, append_str)
else:
    print("No FASTA file found in the specified root folder.")


In [None]:
# v0_52
# csv output does not need to be specified, script will save to same location as file input.
# File path 'not' entered by user

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv
import os

# NOTE(review): machine-specific absolute path; edit before running elsewhere.
input_file = r"C:\Users\JoelTencer\Documents\TL1A Nanobody Selected for Jonathan 1-19 text.fasta"

# Get the directory of the input file
output_dir = os.path.dirname(input_file)

# Set the output CSV file path
output_csv = os.path.join(output_dir, 'output_sequences_with_protparam.csv')

# To add Fc at the C-terminus (or another sequence), define append_str with the sequence in quotes.
# The sequence must not contain whitespace: a stray space is counted by len()
# in the percentage calculations and is written into the CSV output.
append_str = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return ``ProteinAnalysis(sequence).charge_at_pH(7.0)`` for *sequence*."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of *sequence*.

    ``molar_extinction_coefficient()`` returns a [reduced, oxidized] pair;
    the element at index 1 is returned.
    """
    reduced_and_oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return reduced_and_oxidized[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    mass = ProteinAnalysis(sequence).molecular_weight()
    return calculate_molar_extinction_coefficient_oxidized(sequence) / mass

def _read_fasta_records(input_file):
    """Yield ``(sequence_id, sequence)`` pairs read from the FASTA file *input_file*.

    Blank lines are skipped so they neither use up the line budget nor create
    empty sequences. Lines that appear before the first ``>`` header are
    discarded, and a header with no sequence lines produces no record.

    NOTE(review): only the first two non-blank lines after each header are
    kept, so longer wrapped records are truncated - confirm this matches the
    expected input files.
    """
    sequence_id = None
    sequence_lines = []
    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if sequence_id is not None and sequence_lines:
                    yield sequence_id, ''.join(sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                sequence_lines.append(line)
    # The last record has no following header to close it.
    if sequence_id is not None and sequence_lines:
        yield sequence_id, ''.join(sequence_lines)


def _build_protparam_record(sequence_id, sequence, append_str):
    """Return one CSV row (as a tuple) for *sequence* analysed with *append_str* appended."""
    sequence_with_append = sequence + append_str
    analysis = ProteinAnalysis(sequence_with_append)
    return (
        sequence_id,
        calculate_percent_charged(sequence_with_append),
        calculate_percent_hydrophobic(sequence_with_append),
        calculate_charge_at_pH7(sequence_with_append),
        analysis.isoelectric_point(),
        calculate_molar_extinction_coefficient_oxidized(sequence_with_append),
        analysis.molecular_weight(),
        calculate_abs_0_1_percent_oxidized(sequence_with_append),
        sequence,
        sequence_with_append,
    )


def parse_fasta_and_export_with_protparam(input_file, append_str=""):
    """Analyse every record of a FASTA file and export the results to CSV.

    Each sequence has *append_str* concatenated to its end before the
    parameters are calculated. The CSV is written to the module-level
    ``output_csv`` path (not derived from *input_file* here), and that path
    is printed afterwards.

    Args:
        input_file: Path of the FASTA file to read.
        append_str: Sequence appended to every record before analysis.
    """
    sequences = [
        _build_protparam_record(sequence_id, sequence, append_str)
        for sequence_id, sequence in _read_fasta_records(input_file)
    ]

    # Write to CSV; each record tuple is already in header column order
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Run the export for the hard-coded input file, appending append_str to every sequence.
parse_fasta_and_export_with_protparam(input_file, append_str=append_str)


In [None]:
# v0_51
# Now asks the user for the file path and save csv to the same folder
# Works but Windows 11 defaults hide folder paths making this confusing...

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv
import os

# To add Fc at the C-terminus (or another sequence), define append_str with the sequence in quotes.
# The sequence must not contain whitespace: a stray space is counted by len()
# in the percentage calculations and is written into the CSV output.
append_str = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
              "SVMHEALHNHYTQKSLSLSPGK")

def calculate_charge_at_pH7(sequence):
    """Return ``ProteinAnalysis(sequence).charge_at_pH(7.0)`` for *sequence*."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are D, E, H, K or R.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are A, I, L, M, F, W, Y or V.

    Residues are counted case-insensitively. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of *sequence*.

    ``molar_extinction_coefficient()`` returns a [reduced, oxidized] pair;
    the element at index 1 is returned.
    """
    reduced_and_oxidized = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return reduced_and_oxidized[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    mass = ProteinAnalysis(sequence).molecular_weight()
    return calculate_molar_extinction_coefficient_oxidized(sequence) / mass

def _read_fasta_records(input_file):
    """Yield ``(sequence_id, sequence)`` pairs read from the FASTA file *input_file*.

    Blank lines are skipped so they neither use up the line budget nor create
    empty sequences. Lines that appear before the first ``>`` header are
    discarded, and a header with no sequence lines produces no record.

    NOTE(review): only the first two non-blank lines after each header are
    kept, so longer wrapped records are truncated - confirm this matches the
    expected input files.
    """
    sequence_id = None
    sequence_lines = []
    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if sequence_id is not None and sequence_lines:
                    yield sequence_id, ''.join(sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                sequence_lines.append(line)
    # The last record has no following header to close it.
    if sequence_id is not None and sequence_lines:
        yield sequence_id, ''.join(sequence_lines)


def _build_protparam_record(sequence_id, sequence, append_str):
    """Return one CSV row (as a tuple) for *sequence* analysed with *append_str* appended."""
    sequence_with_append = sequence + append_str
    analysis = ProteinAnalysis(sequence_with_append)
    return (
        sequence_id,
        calculate_percent_charged(sequence_with_append),
        calculate_percent_hydrophobic(sequence_with_append),
        calculate_charge_at_pH7(sequence_with_append),
        analysis.isoelectric_point(),
        calculate_molar_extinction_coefficient_oxidized(sequence_with_append),
        analysis.molecular_weight(),
        calculate_abs_0_1_percent_oxidized(sequence_with_append),
        sequence,
        sequence_with_append,
    )


def parse_fasta_and_export_with_protparam(input_file, append_str=""):
    """Analyse every record of a FASTA file and export the results to CSV.

    Each sequence has *append_str* concatenated to its end before the
    parameters are calculated. The CSV is written as
    ``output_sequences_with_protparam.csv`` in the same directory as
    *input_file*, and its path is printed.

    Args:
        input_file: Path of the FASTA file to read.
        append_str: Sequence appended to every record before analysis.
    """
    sequences = [
        _build_protparam_record(sequence_id, sequence, append_str)
        for sequence_id, sequence in _read_fasta_records(input_file)
    ]

    # Get the directory of the input file
    output_dir = os.path.dirname(input_file)
    # Set the output CSV file path
    output_csv = os.path.join(output_dir, 'output_sequences_with_protparam.csv')

    # Write to CSV; each record tuple is already in header column order
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Usage
# Windows "Copy as path" wraps the path in double quotes; strip those and any
# stray whitespace so open() receives a usable path.
input_file = input("Please enter the path to your FASTA file: ").strip().strip('"\'')
parse_fasta_and_export_with_protparam(input_file, append_str)


In [None]:
# v0_5
# Columns moved around

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"

# To Add Fc at C-terminus or another sequence define append_str with the sequence in quotes.
# The literal is split across two adjacent strings (joined by Python with no
# separator) so that no stray whitespace ends up inside the sequence; a space
# would be exported in the CSV and counted by len() in the percentage helpers.
append_str = (
    "EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
    "SVMHEALHNHYTQKSLSLSPGK"
)

def calculate_charge_at_pH7(sequence):
    """Return the value of ``ProteinAnalysis(sequence).charge_at_pH(7.0)``."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in ``sequence`` that are D, E, H, K or R.

    Matching is case-insensitive so lower-case residues are counted too.
    An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in ``sequence`` that are A, I, L, M, F, W, Y or V.

    Matching is case-insensitive so lower-case residues are counted too.
    An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of ``sequence``.

    ``ProteinAnalysis.molar_extinction_coefficient()`` yields a
    (reduced, oxidized) pair; index 1 is the oxidized value.
    """
    return ProteinAnalysis(sequence).molar_extinction_coefficient()[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    epsilon_oxidized = calculate_molar_extinction_coefficient_oxidized(sequence)
    return epsilon_oxidized / ProteinAnalysis(sequence).molecular_weight()

def parse_fasta_and_export_with_protparam(input_file, output_csv, append_str=""):
    """Read FASTA-style records, compute ProtParam metrics, and export them as CSV.

    Args:
        input_file: Path of the text file to read. A line starting with '>'
            opens a record; the remainder of that line is the sequence name.
        output_csv: Path of the CSV file to write (opened in 'w' mode).
        append_str: Residues added to the end of every sequence. All metrics
            are computed on the sequence with this suffix attached.

    Only the first two non-header lines following each header are kept.
    NOTE(review): records spanning more than two lines are therefore cut
    short without warning - confirm this suits the input files in use.
    """

    def build_row(sequence_id, sequence_lines):
        # Turn one parsed record into a CSV row; metrics use the appended sequence.
        sequence = ''.join(sequence_lines)
        sequence_with_append = sequence + append_str

        charge_at_pH7 = calculate_charge_at_pH7(sequence_with_append)
        percent_charged = calculate_percent_charged(sequence_with_append)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_append)
        extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_append)
        abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_append)
        # One analysis object serves both remaining look-ups.
        analysis = ProteinAnalysis(sequence_with_append)
        isoelectric_point = analysis.isoelectric_point()
        molecular_weight = analysis.molecular_weight()

        return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, isoelectric_point, extinction_coefficient, molecular_weight, abs_0_1_percent, sequence, sequence_with_append)

    sequences = []

    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if sequence_id is not None and sequence_lines:
                    sequences.append(build_row(sequence_id, sequence_lines))
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Collect up to two lines of the sequence
                sequence_lines.append(line)

        # The final record has no following header to trigger the save.
        if sequence_id is not None and sequence_lines:
            sequences.append(build_row(sequence_id, sequence_lines))

    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'Sequence Appended'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Usage
# Runs the export with the input path, output path and suffix assigned at the top of this cell.
parse_fasta_and_export_with_protparam(input_file, output_csv, append_str)


In [None]:
# v0_4
# Output both binder sequence and sequence with Fc, but only analyze the sequence + appended.

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"


# To Add Fc at C-terminus or another sequence define append_str with the sequence in quotes.
# Written as two adjacent literals (concatenated with no separator) so the
# sequence contains no embedded space; a space would appear in the exported
# sequence and be counted by len() when percentages are computed.
append_str = (
    "EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
    "SVMHEALHNHYTQKSLSLSPGK"
)


def calculate_charge_at_pH7(sequence):
    """Return the charge ``ProteinAnalysis`` reports for ``sequence`` at pH 7.0."""
    neutral_ph = 7.0
    return ProteinAnalysis(sequence).charge_at_pH(neutral_ph)

def calculate_percent_charged(sequence):
    """Return the percentage of charged residues (D, E, H, K, R) in ``sequence``.

    Lower-case residues are counted as well. Returns 0.0 for an empty
    sequence rather than dividing by zero.
    """
    if not sequence:
        return 0.0
    charged_aa = sum(1 for aa in sequence.upper() if aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of hydrophobic residues (A, I, L, M, F, W, Y, V) in ``sequence``.

    Lower-case residues are counted as well. Returns 0.0 for an empty
    sequence rather than dividing by zero.
    """
    if not sequence:
        return 0.0
    hydrophobic_aa = sum(1 for aa in sequence.upper() if aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized entry (index 1) of ``ProteinAnalysis.molar_extinction_coefficient()``."""
    coefficients = ProteinAnalysis(sequence).molar_extinction_coefficient()  # (reduced, oxidized)
    return coefficients[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized), i.e. oxidized extinction coefficient / molecular weight."""
    epsilon_oxidized = calculate_molar_extinction_coefficient_oxidized(sequence)
    mass = ProteinAnalysis(sequence).molecular_weight()
    return epsilon_oxidized / mass

def parse_fasta_and_export_with_protparam(input_file, output_csv, append_str=""):
    """Read FASTA-style records, compute ProtParam metrics, and export them as CSV.

    Args:
        input_file: Path of the text file to read. A line starting with '>'
            opens a record; the remainder of that line is the sequence name.
        output_csv: Path of the CSV file to write (opened in 'w' mode).
        append_str: Residues added to the end of every sequence. Metrics are
            computed on the appended sequence; both the original and the
            appended sequence are written to the CSV.

    Only the first two non-header lines following each header are kept.
    NOTE(review): records spanning more than two lines are therefore cut
    short without warning - confirm this suits the input files in use.
    """

    def build_row(sequence_id, sequence_lines):
        # Turn one parsed record into a CSV row; metrics use the appended sequence.
        sequence = ''.join(sequence_lines)
        sequence_with_append = sequence + append_str

        charge_at_pH7 = calculate_charge_at_pH7(sequence_with_append)
        percent_charged = calculate_percent_charged(sequence_with_append)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_append)
        extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_append)
        abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_append)
        # One analysis object serves both remaining look-ups.
        analysis = ProteinAnalysis(sequence_with_append)
        isoelectric_point = analysis.isoelectric_point()
        molecular_weight = analysis.molecular_weight()

        return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence, sequence_with_append)

    sequences = []

    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if sequence_id is not None and sequence_lines:
                    sequences.append(build_row(sequence_id, sequence_lines))
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Collect up to two lines of the sequence
                sequence_lines.append(line)

        # The final record has no following header to trigger the save.
        if sequence_id is not None and sequence_lines:
            sequences.append(build_row(sequence_id, sequence_lines))

    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Original Sequence', 'Appended Sequence'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Usage
# Runs the export with the input path, output path and suffix assigned at the top of this cell.
parse_fasta_and_export_with_protparam(input_file, output_csv, append_str)


In [None]:
# v0_3
# Added ability to append a string (e.g. the Fc sequence) to each sequence

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"

# To Add Fc at C-terminus or another sequence define append_str with the sequence in quotes.
# Two adjacent literals are joined by Python with no separator, which keeps
# the sequence free of embedded whitespace; a space would be exported in the
# CSV and counted by len() when percentages are computed.
append_str = (
    "EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
    "SVMHEALHNHYTQKSLSLSPGK"
)


def calculate_charge_at_pH7(sequence):
    """Return the value of ``ProteinAnalysis(sequence).charge_at_pH(7.0)``."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in ``sequence`` that are D, E, H, K or R.

    Matching is case-insensitive so lower-case residues are counted too.
    An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in ``sequence`` that are A, I, L, M, F, W, Y or V.

    Matching is case-insensitive so lower-case residues are counted too.
    An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of ``sequence``.

    ``ProteinAnalysis.molar_extinction_coefficient()`` yields a
    (reduced, oxidized) pair; index 1 is the oxidized value.
    """
    return ProteinAnalysis(sequence).molar_extinction_coefficient()[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    epsilon_oxidized = calculate_molar_extinction_coefficient_oxidized(sequence)
    return epsilon_oxidized / ProteinAnalysis(sequence).molecular_weight()

def parse_fasta_and_export_with_protparam(input_file, output_csv, append_str):
    """Read FASTA-style records, compute ProtParam metrics, and export them as CSV.

    Args:
        input_file: Path of the text file to read. A line starting with '>'
            opens a record; the remainder of that line is the sequence name.
        output_csv: Path of the CSV file to write (opened in 'w' mode).
        append_str: Residues added to the end of every sequence. The metrics
            and the exported 'Sequence' column both use the appended sequence.

    Only the first two non-header lines following each header are kept.
    NOTE(review): records spanning more than two lines are therefore cut
    short without warning - confirm this suits the input files in use.
    """

    def build_row(sequence_id, sequence_lines):
        # Turn one parsed record into a CSV row for the appended sequence.
        sequence = ''.join(sequence_lines) + append_str

        charge_at_pH7 = calculate_charge_at_pH7(sequence)
        percent_charged = calculate_percent_charged(sequence)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence)
        extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
        abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
        # One analysis object serves both remaining look-ups.
        analysis = ProteinAnalysis(sequence)
        isoelectric_point = analysis.isoelectric_point()
        molecular_weight = analysis.molecular_weight()

        return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence)

    sequences = []

    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if sequence_id is not None and sequence_lines:
                    sequences.append(build_row(sequence_id, sequence_lines))
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Collect up to two lines of the sequence
                sequence_lines.append(line)

        # The final record has no following header to trigger the save.
        if sequence_id is not None and sequence_lines:
            sequences.append(build_row(sequence_id, sequence_lines))

    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Usage
# Runs the export with the input path, output path and suffix assigned at the top of this cell.
parse_fasta_and_export_with_protparam(input_file, output_csv, append_str)


In [None]:
#v0_2

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

# Set input file path and output csv names
# Both are absolute, machine-specific paths; edit them before running elsewhere.
input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output_sequences_with_protparam.csv"


def calculate_charge_at_pH7(sequence):
    """Return the charge ``ProteinAnalysis`` reports for ``sequence`` at pH 7.0."""
    neutral_ph = 7.0
    return ProteinAnalysis(sequence).charge_at_pH(neutral_ph)

def calculate_percent_charged(sequence):
    """Return the percentage of charged residues (D, E, H, K, R) in ``sequence``.

    Lower-case residues are counted as well. Returns 0.0 for an empty
    sequence rather than dividing by zero.
    """
    if not sequence:
        return 0.0
    charged_aa = sum(1 for aa in sequence.upper() if aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of hydrophobic residues (A, I, L, M, F, W, Y, V) in ``sequence``.

    Lower-case residues are counted as well. Returns 0.0 for an empty
    sequence rather than dividing by zero.
    """
    if not sequence:
        return 0.0
    hydrophobic_aa = sum(1 for aa in sequence.upper() if aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized entry (index 1) of ``ProteinAnalysis.molar_extinction_coefficient()``."""
    coefficients = ProteinAnalysis(sequence).molar_extinction_coefficient()  # (reduced, oxidized)
    return coefficients[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized), i.e. oxidized extinction coefficient / molecular weight."""
    epsilon_oxidized = calculate_molar_extinction_coefficient_oxidized(sequence)
    mass = ProteinAnalysis(sequence).molecular_weight()
    return epsilon_oxidized / mass

def parse_fasta_and_export_with_protparam(input_file, output_csv):
    """Read FASTA-style records, compute ProtParam metrics, and export them as CSV.

    Args:
        input_file: Path of the text file to read. A line starting with '>'
            opens a record; the remainder of that line is the sequence name.
        output_csv: Path of the CSV file to write (opened in 'w' mode).

    Only the first two non-header lines following each header are kept.
    NOTE(review): records spanning more than two lines are therefore cut
    short without warning - confirm this suits the input files in use.
    """

    def build_row(sequence_id, sequence_lines):
        # Turn one parsed record into a CSV row.
        sequence = ''.join(sequence_lines)

        charge_at_pH7 = calculate_charge_at_pH7(sequence)
        percent_charged = calculate_percent_charged(sequence)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence)
        extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
        abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
        # One analysis object serves both remaining look-ups.
        analysis = ProteinAnalysis(sequence)
        isoelectric_point = analysis.isoelectric_point()
        molecular_weight = analysis.molecular_weight()

        return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence)

    sequences = []

    with open(input_file, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if sequence_id is not None and sequence_lines:
                    sequences.append(build_row(sequence_id, sequence_lines))
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Collect up to two lines of the sequence
                sequence_lines.append(line)

        # The final record has no following header to trigger the save.
        if sequence_id is not None and sequence_lines:
            sequences.append(build_row(sequence_id, sequence_lines))

    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Usage
# Runs the export with the input and output paths assigned at the top of this cell.
parse_fasta_and_export_with_protparam(input_file, output_csv)


In [None]:
# Added Abs 0.1% oxidized by dividing molar extinction coeff by molecular weight (oxidized)

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Return the value of ``ProteinAnalysis(sequence).charge_at_pH(7.0)``."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues in ``sequence`` that are D, E, H, K or R.

    Matching is case-insensitive so lower-case residues are counted too.
    An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in ``sequence`` that are A, I, L, M, F, W, Y or V.

    Matching is case-insensitive so lower-case residues are counted too.
    An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    total_aa = len(sequence)
    if total_aa == 0:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized-form molar extinction coefficient of ``sequence``.

    ``ProteinAnalysis.molar_extinction_coefficient()`` yields a
    (reduced, oxidized) pair; index 1 is the oxidized value.
    """
    return ProteinAnalysis(sequence).molar_extinction_coefficient()[1]

def calculate_abs_0_1_percent_oxidized(sequence):
    """Return Abs 0.1% (oxidized): the oxidized extinction coefficient divided by the molecular weight."""
    epsilon_oxidized = calculate_molar_extinction_coefficient_oxidized(sequence)
    return epsilon_oxidized / ProteinAnalysis(sequence).molecular_weight()

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Read FASTA-style records, compute ProtParam metrics, and export them as CSV.

    Args:
        file_path: Path of the text file to read. A line starting with '>'
            opens a record; the remainder of that line is the sequence name.
        output_csv: Path of the CSV file to write (opened in 'w' mode).

    Only the first two non-header lines following each header are kept.
    NOTE(review): records spanning more than two lines are therefore cut
    short without warning - confirm this suits the input files in use.
    """

    def build_row(sequence_id, sequence_lines):
        # Turn one parsed record into a CSV row.
        sequence = ''.join(sequence_lines)

        charge_at_pH7 = calculate_charge_at_pH7(sequence)
        percent_charged = calculate_percent_charged(sequence)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence)
        extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
        abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence)
        # One analysis object serves both remaining look-ups.
        analysis = ProteinAnalysis(sequence)
        isoelectric_point = analysis.isoelectric_point()
        molecular_weight = analysis.molecular_weight()

        return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, abs_0_1_percent, isoelectric_point, molecular_weight, sequence)

    sequences = []

    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if sequence_id is not None and sequence_lines:
                    sequences.append(build_row(sequence_id, sequence_lines))
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Collect up to two lines of the sequence
                sequence_lines.append(line)

        # The final record has no following header to trigger the save.
        if sequence_id is not None and sequence_lines:
            sequences.append(build_row(sequence_id, sequence_lines))

    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Abs 0.1% Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Example usage
# file_path is an absolute, machine-specific path. output_csv is a bare file
# name, so the CSV is written relative to the current working directory.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT
# Changed molar extinction coefficient to use Biopython's function and return the oxidized value

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Return the charge ``ProteinAnalysis`` reports for ``sequence`` at pH 7.0."""
    neutral_ph = 7.0
    return ProteinAnalysis(sequence).charge_at_pH(neutral_ph)

def calculate_percent_charged(sequence):
    """Return the percentage of charged residues (D, E, H, K, R) in ``sequence``.

    Lower-case residues are counted as well. Returns 0.0 for an empty
    sequence rather than dividing by zero.
    """
    if not sequence:
        return 0.0
    charged_aa = sum(1 for aa in sequence.upper() if aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of hydrophobic residues (A, I, L, M, F, W, Y, V) in ``sequence``.

    Lower-case residues are counted as well. Returns 0.0 for an empty
    sequence rather than dividing by zero.
    """
    if not sequence:
        return 0.0
    hydrophobic_aa = sum(1 for aa in sequence.upper() if aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the oxidized entry (index 1) of ``ProteinAnalysis.molar_extinction_coefficient()``."""
    coefficients = ProteinAnalysis(sequence).molar_extinction_coefficient()  # (reduced, oxidized)
    return coefficients[1]

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Read FASTA-style records, compute ProtParam metrics, and export them as CSV.

    Args:
        file_path: Path of the text file to read. A line starting with '>'
            opens a record; the remainder of that line is the sequence name.
        output_csv: Path of the CSV file to write (opened in 'w' mode).

    Only the first two non-header lines following each header are kept.
    NOTE(review): records spanning more than two lines are therefore cut
    short without warning - confirm this suits the input files in use.
    """

    def build_row(sequence_id, sequence_lines):
        # Turn one parsed record into a CSV row.
        sequence = ''.join(sequence_lines)

        charge_at_pH7 = calculate_charge_at_pH7(sequence)
        percent_charged = calculate_percent_charged(sequence)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence)
        extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence)
        # One analysis object serves both remaining look-ups.
        analysis = ProteinAnalysis(sequence)
        isoelectric_point = analysis.isoelectric_point()
        molecular_weight = analysis.molecular_weight()

        return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7, extinction_coefficient, isoelectric_point, molecular_weight, sequence)

    sequences = []

    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for raw_line in file:
            line = raw_line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if sequence_id is not None and sequence_lines:
                    sequences.append(build_row(sequence_id, sequence_lines))
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Collect up to two lines of the sequence
                sequence_lines.append(line)

        # The final record has no following header to trigger the save.
        if sequence_id is not None and sequence_lines:
            sequences.append(build_row(sequence_id, sequence_lines))

    # Write to CSV
    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Molecular Extinction Coefficient Oxidized', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Example usage
# file_path is an absolute, machine-specific path. output_csv is a bare file
# name, so the CSV is written relative to the current working directory.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)




In [None]:
#ChatGPT
#fixed column order to match Lam's
#working!!!

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Return the net charge of ``sequence`` at pH 7.0.

    The value is produced by Biopython's ``ProteinAnalysis.charge_at_pH``.
    """
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the percentage of residues that are charged (D, E, H, K or R).

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float, or 0.0 for an empty sequence (which
        previously raised ZeroDivisionError).
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues counted as hydrophobic (A, I, L, M, F, W, Y, V).

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float, or 0.0 for an empty sequence (which
        previously raised ZeroDivisionError).
    """
    total_aa = len(sequence)
    if total_aa == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient(sequence):
    """Estimate the molar extinction coefficient (M^-1 cm^-1) of a protein.

    Sums 5500 per tryptophan, 1490 per tyrosine and 125 per cystine. A
    cystine is a pair of cysteines, so the 125 term is applied once per two
    'C' residues (all cysteines are assumed to be paired). The previous code
    applied 125 to every single cysteine, which overestimates the value
    compared with ExPASy ProtParam / Biopython's oxidized coefficient.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The estimated coefficient as an int.
    """
    tryptophan_term = sequence.count('W') * 5500
    tyrosine_term = sequence.count('Y') * 1490
    # Integer division: an unpaired cysteine contributes nothing.
    cystine_term = (sequence.count('C') // 2) * 125
    return tryptophan_term + tyrosine_term + cystine_term

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Parse a FASTA-style text file and export per-sequence properties to CSV.

    A record starts at a line beginning with ``>``; the text after ``>`` is
    used as the sequence name. Only the first two lines that follow a header
    are joined to form the sequence; later lines of that record are ignored.
    Records with no lines after the header are skipped.

    Args:
        file_path: Path of the text file to read.
        output_csv: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    sequences = []

    def save_record(sequence_id, sequence_lines):
        """Analyse one collected record and append its CSV row."""
        if sequence_id is None or not sequence_lines:
            return
        sequence = ''.join(sequence_lines)

        charge_at_pH7 = calculate_charge_at_pH7(sequence)
        percent_charged = calculate_percent_charged(sequence)
        percent_hydrophobic = calculate_percent_hydrophobic(sequence)
        extinction_coefficient = calculate_molar_extinction_coefficient(sequence)
        # Build the analysis object here: the previous code read an
        # ``analysed_seq`` name that was never defined in this function.
        analysed_seq = ProteinAnalysis(sequence)
        isoelectric_point = analysed_seq.isoelectric_point()
        molecular_weight = analysed_seq.molecular_weight()

        sequences.append((sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7,
                          extinction_coefficient, isoelectric_point, molecular_weight, sequence))

    with open(file_path, 'r') as file:
        sequence_id = None
        sequence_lines = []

        for line in file:
            line = line.strip()

            if line.startswith('>'):
                # A new header closes the record collected so far.
                save_record(sequence_id, sequence_lines)
                sequence_id = line[1:]
                sequence_lines = []
            elif len(sequence_lines) < 2:
                # Keep at most two lines per record.
                sequence_lines.append(line)

        # The last record has no following header to trigger the save.
        save_record(sequence_id, sequence_lines)

    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Extinction Coefficient M-1cm-1', 'Isoelectric Point', 'MW in daltons', 'Sequence'])
        writer.writerows(sequences)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT
#Working, but results are slightly different from ExPASy, same as I found previously


from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Compute the net charge of a protein sequence at pH 7.0.

    Wraps ``ProteinAnalysis(sequence).charge_at_pH(7.0)``.
    """
    protein = ProteinAnalysis(sequence)
    return protein.charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return the share of charged residues (D, E, H, K, R) as a percentage.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float; 0.0 when ``sequence`` is empty, instead of
        the ZeroDivisionError raised previously.
    """
    if not sequence:
        # An empty sequence has no residues to divide by.
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the share of hydrophobic residues (A, I, L, M, F, W, Y, V) as a percentage.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float; 0.0 when ``sequence`` is empty, instead of
        the ZeroDivisionError raised previously.
    """
    if not sequence:
        # An empty sequence has no residues to divide by.
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100

def calculate_molar_extinction_coefficient(sequence):
    """Estimate the molar extinction coefficient (M^-1 cm^-1) of a protein.

    Uses 5500 per tryptophan, 1490 per tyrosine and 125 per cystine. The 125
    term belongs to a cystine (two paired cysteines), so it is applied once
    per pair of 'C' residues, assuming every pair is oxidized. Counting 125
    for each individual cysteine, as before, overestimates the result
    relative to ExPASy ProtParam / Biopython.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The estimated coefficient as an int.
    """
    aromatic_extinction = {'W': 5500, 'Y': 1490}
    extinction_coefficient = sum(
        sequence.count(aa) * extinction for aa, extinction in aromatic_extinction.items()
    )
    # One cystine per two cysteines; a leftover cysteine adds nothing.
    extinction_coefficient += (sequence.count('C') // 2) * 125
    return extinction_coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Read a FASTA-style text file and write per-sequence properties to CSV.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. At most the first two following lines are joined into the
    sequence; headers with no following lines produce no row.

    Args:
        file_path: Path of the text file to read.
        output_csv: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    rows = []

    def store(name, parts):
        """Compute the properties of one collected record and queue its row."""
        if name is None or not parts:
            return
        residues = ''.join(parts)
        net_charge = calculate_charge_at_pH7(residues)
        charged_pct = calculate_percent_charged(residues)
        hydrophobic_pct = calculate_percent_hydrophobic(residues)
        epsilon = calculate_molar_extinction_coefficient(residues)
        pi_value = ProteinAnalysis(residues).isoelectric_point()
        rows.append((name, residues, charged_pct, hydrophobic_pct, net_charge, epsilon, pi_value))

    with open(file_path, 'r') as handle:
        name, parts = None, []
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                # A header ends the previous record and begins a new one.
                store(name, parts)
                name, parts = text[1:], []
            elif len(parts) < 2:
                # Only the first two lines of a record are kept.
                parts.append(text)
        # Flush the record still open at end of file.
        store(name, parts)

    with open(output_csv, 'w', newline='') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Extinction Coefficient M-1cm-1', 'Isoelectric Point'])
        out.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)



In [None]:
#ChatGPT
#it added the DNA sequences back in.....

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Return the charge Biopython's ``ProteinAnalysis`` reports for ``sequence`` at pH 7.0."""
    neutral_ph = 7.0
    return ProteinAnalysis(sequence).charge_at_pH(neutral_ph)

def calculate_percent_charged(sequence):
    """Return the percentage of charged residues (D, E, H, K, R) in ``sequence``.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float. An empty sequence yields 0.0 rather than
        the ZeroDivisionError the unguarded division used to raise.
    """
    total_aa = len(sequence)
    if not total_aa:
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return the percentage of hydrophobic residues (A, I, L, M, F, W, Y, V) in ``sequence``.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float. An empty sequence yields 0.0 rather than
        the ZeroDivisionError the unguarded division used to raise.
    """
    total_aa = len(sequence)
    if not total_aa:
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient(sequence):
    """Estimate the molar extinction coefficient (M^-1 cm^-1) of a protein.

    Adds 5500 per tryptophan, 1490 per tyrosine and 125 per cystine. Because
    a cystine is made of two cysteines, the 125 term is counted once per pair
    of 'C' residues (all pairs assumed oxidized); the earlier per-cysteine
    count overestimated the value compared with ExPASy ProtParam / Biopython.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The estimated coefficient as an int.
    """
    cystine_pairs = sequence.count('C') // 2  # unpaired cysteine is ignored
    return (
        sequence.count('W') * 5500
        + sequence.count('Y') * 1490
        + cystine_pairs * 125
    )

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Parse a FASTA file with ``SeqIO`` and export per-sequence properties to CSV.

    Records containing any character outside ``ACDEFGHIKLMNPQRSTVWY`` are
    reported on stdout and left out of the CSV.

    Args:
        file_path: Path of the FASTA file (read as UTF-8).
        output_csv: Path of the CSV file to write (UTF-8, ``'w'`` mode).
    """
    standard_residues = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name = record.id
            residues = str(record.seq)

            # Reject the record as soon as one unexpected character is seen.
            if any(symbol not in standard_residues for symbol in residues):
                print(f"Skipping sequence '{name}' due to invalid characters.")
                continue

            net_charge = calculate_charge_at_pH7(residues)
            charged_pct = calculate_percent_charged(residues)
            hydrophobic_pct = calculate_percent_hydrophobic(residues)
            epsilon = calculate_molar_extinction_coefficient(residues)
            pi_value = ProteinAnalysis(residues).isoelectric_point()

            rows.append((name, residues, charged_pct, hydrophobic_pct, net_charge, epsilon, pi_value))

    with open(output_csv, 'w', newline='', encoding='utf-8') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Extinction Coefficient M-1cm-1', 'Isoelectric Point'])
        out.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT
#Added pI

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def calculate_charge_at_pH7(sequence):
    """Net charge of ``sequence`` at pH 7.0, as computed by ``ProteinAnalysis.charge_at_pH``."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)

def calculate_percent_charged(sequence):
    """Return what percentage of ``sequence`` consists of D, E, H, K or R residues.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float; 0.0 for an empty sequence, where the
        division previously raised ZeroDivisionError.
    """
    if len(sequence) == 0:
        return 0.0
    charged_aa = sum(sequence.count(aa) for aa in 'DEHKR')
    total_aa = len(sequence)
    return (charged_aa / total_aa) * 100

def calculate_percent_hydrophobic(sequence):
    """Return what percentage of ``sequence`` consists of A, I, L, M, F, W, Y or V residues.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The percentage as a float; 0.0 for an empty sequence, where the
        division previously raised ZeroDivisionError.
    """
    if len(sequence) == 0:
        return 0.0
    hydrophobic_aa = sum(sequence.count(aa) for aa in 'AILMFWYV')
    total_aa = len(sequence)
    return (hydrophobic_aa / total_aa) * 100

def calculate_molar_extinction_coefficient(sequence):
    """Estimate the molar extinction coefficient (M^-1 cm^-1) of a protein.

    Contributions: 5500 per tryptophan, 1490 per tyrosine, 125 per cystine.
    The cystine value applies to a pair of cysteines, so it is counted once
    per two 'C' residues (assuming all pairs are oxidized). The earlier code
    counted 125 for every cysteine and therefore overestimated the result
    compared with ExPASy ProtParam / Biopython.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        The estimated coefficient as an int.
    """
    extinction_coefficient = 0
    extinction_coefficient += sequence.count('W') * 5500
    extinction_coefficient += sequence.count('Y') * 1490
    # Floor division drops a single unpaired cysteine.
    extinction_coefficient += (sequence.count('C') // 2) * 125
    return extinction_coefficient

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Parse a FASTA file with ``SeqIO`` and export charge/composition values to CSV.

    Records containing any character outside ``ACDEFGHIKLMNPQRSTVWY`` are
    reported on stdout and left out of the CSV.

    Args:
        file_path: Path of the FASTA file (read as UTF-8).
        output_csv: Path of the CSV file to write (UTF-8, ``'w'`` mode).
    """
    standard_residues = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name = record.id
            residues = str(record.seq)

            # Reject the record as soon as one unexpected character is seen.
            if any(symbol not in standard_residues for symbol in residues):
                print(f"Skipping sequence '{name}' due to invalid characters.")
                continue

            net_charge = calculate_charge_at_pH7(residues)
            charged_pct = calculate_percent_charged(residues)
            hydrophobic_pct = calculate_percent_hydrophobic(residues)
            epsilon = calculate_molar_extinction_coefficient(residues)

            rows.append((name, residues, net_charge, charged_pct, hydrophobic_pct, epsilon))

    with open(output_csv, 'w', newline='', encoding='utf-8') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence', 'Charge at pH 7', '% Charged AAs', '% Hydrophobic AAs', 'Molar Extinction Coefficient'])
        out.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Parse a FASTA file with ``SeqIO`` and export pI, molecular weight and cysteine count.

    Records containing any character outside ``ACDEFGHIKLMNPQRSTVWY`` are
    reported on stdout and left out of the CSV.

    Args:
        file_path: Path of the FASTA file (read as UTF-8).
        output_csv: Path of the CSV file to write (UTF-8, ``'w'`` mode).
    """
    standard_residues = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name = record.id
            residues = str(record.seq)

            # Reject the record as soon as one unexpected character is seen.
            if any(symbol not in standard_residues for symbol in residues):
                print(f"Skipping sequence '{name}' due to invalid characters.")
                continue

            # One analysis object serves both ProtParam queries.
            protein = ProteinAnalysis(residues)
            pi_value = protein.isoelectric_point()
            weight = protein.molecular_weight()
            rows.append((name, residues, pi_value, weight, residues.count('C')))

    with open(output_csv, 'w', newline='', encoding='utf-8') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence', 'Isoelectric Point (pI)', 'Molecular Weight (Da)', 'Cysteine Count'])
        out.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT
#broken

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Parse a FASTA file with ``SeqIO`` and export pI, molecular weight and cysteine count.

    Every record yielded by ``SeqIO.parse`` is analysed; no character
    validation is performed on the sequences.

    Args:
        file_path: Path of the FASTA file (read as UTF-8).
        output_csv: Path of the CSV file to write (UTF-8, ``'w'`` mode).
    """
    rows = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for record in SeqIO.parse(handle, "fasta"):
            residues = str(record.seq)

            # One analysis object serves both ProtParam queries.
            protein = ProteinAnalysis(residues)
            pi_value = protein.isoelectric_point()
            weight = protein.molecular_weight()
            rows.append((record.id, residues, pi_value, weight, residues.count('C')))

    with open(output_csv, 'w', newline='', encoding='utf-8') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence', 'Isoelectric Point (pI)', 'Molecular Weight (Da)', 'Cysteine Count'])
        out.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT script to parse fasta first 2 lines, then get protparam pI, MW, and abs 0.1% oxidized

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv

def parse_fasta_and_export_with_protparam(file_path, output_csv):
    """Parse a FASTA file with ``SeqIO`` and export pI, molecular weight and cysteine count.

    The path is handed straight to ``SeqIO.parse``; every record it yields is
    analysed without any character validation.

    Args:
        file_path: Path of the FASTA file to read.
        output_csv: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    rows = []

    for record in SeqIO.parse(file_path, "fasta"):
        residues = str(record.seq)

        # One analysis object serves both ProtParam queries.
        protein = ProteinAnalysis(residues)
        pi_value = protein.isoelectric_point()
        weight = protein.molecular_weight()
        rows.append((record.id, residues, pi_value, weight, residues.count('C')))

    with open(output_csv, 'w', newline='') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence', 'Isoelectric Point (pI)', 'Molecular Weight (Da)', 'Cysteine Count'])
        out.writerows(rows)

    print(f"Sequences exported to '{output_csv}'")

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences_with_protparam.csv"
parse_fasta_and_export_with_protparam(file_path, output_csv)


In [None]:
#ChatGPT Working script to parse and take only first 2 lines of the fasta sequence

from Bio import SeqIO
import csv

def parse_fasta_and_export(file_path, output_csv):
    """Read a FASTA-style text file and write name/sequence pairs to CSV.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. At most the first two following lines (blank lines included)
    are joined into the sequence; headers with no following lines are skipped.

    Args:
        file_path: Path of the text file to read.
        output_csv: Path of the CSV file to write (opened in ``'w'`` mode).
    """
    records = []

    def store(name, parts):
        """Queue the record if it has both a header and at least one line."""
        if name is not None and parts:
            records.append((name, ''.join(parts)))

    with open(file_path, 'r') as handle:
        name, parts = None, []
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                # A header ends the previous record and begins a new one.
                store(name, parts)
                name, parts = text[1:], []
            elif len(parts) < 2:
                # Only the first two lines of a record are kept.
                parts.append(text)
        # Flush the record still open at end of file.
        store(name, parts)

    with open(output_csv, 'w', newline='') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(['Sequence Name', 'Sequence'])
        out.writerows(records)

# Example usage: runs the exporter as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
# Bare file name, so the CSV is created relative to the current working directory.
output_csv = r"output_sequences.csv"
parse_fasta_and_export(file_path, output_csv)
# The function itself prints nothing, so report the output location here.
print(f"Sequences exported to '{output_csv}'")


In [None]:
#ChatGPT Working script, just takes the first 2 lines of the fasta sequence

from Bio import SeqIO
def parse_fasta_with_skip(file_path):
    """Parse a FASTA-style text file, keeping only the first two lines of each record.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. Blank lines count towards the two-line limit. Headers with no
    following lines are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order.
    """
    records = []

    def store(name, parts):
        """Queue the record if it has both a header and at least one line."""
        if name is not None and parts:
            records.append((name, ''.join(parts)))

    with open(file_path, 'r') as handle:
        name, parts = None, []
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                # A header ends the previous record and begins a new one.
                store(name, parts)
                name, parts = text[1:], []
            elif len(parts) < 2:
                # Lines beyond the second are dropped.
                parts.append(text)
        # Flush the record still open at end of file.
        store(name, parts)

    return records

# Example usage: runs as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
# Echo each record as a FASTA header, the sequence, then the sequence length.
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


In [None]:
def parse_fasta_with_skip(file_path):
    """Parse a FASTA-style text file, abandoning a record at the first empty line.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. A line that is exactly a newline discards the record being
    collected; lines that follow it, up to the next header, are not saved.
    Whitespace-only lines are not treated as empty: they are stripped and
    appended like any other line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order.
    """
    records = []
    name = None
    parts = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            if raw_line == '\n':
                # A single bare newline is enough to drop the open record.
                name, parts = None, []
                continue

            text = raw_line.strip()
            if not text.startswith('>'):
                parts.append(text)
                continue

            # Header line: close the previous record, then start a new one.
            if name is not None and parts:
                records.append((name, ''.join(parts)))
            name, parts = text[1:], []

    # Flush the record still open at end of file.
    if name is not None and parts:
        records.append((name, ''.join(parts)))

    return records

# Example usage: runs as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
# Echo each record as a FASTA header, the sequence, then the sequence length.
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


In [None]:
def parse_fasta_with_skip(file_path):
    """Parse a FASTA-style text file, abandoning a record after two blank lines in a row.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. Every stripped input line is echoed to stdout. The first blank
    line of a run is appended to the record as an empty string; a second
    consecutive blank line discards the record being collected, and the lines
    that follow it, up to the next header, are not saved.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order.
    """
    records = []
    name = None
    parts = []
    blank_seen = False  # True when the previous line was the first blank of a run

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            print(text)

            if text:
                blank_seen = False
            elif blank_seen:
                # Second blank in a row: drop the open record and start over.
                name, parts, blank_seen = None, [], False
                continue
            else:
                blank_seen = True

            if text.startswith('>'):
                # Header line: close the previous record, then start a new one.
                if name is not None and parts:
                    records.append((name, ''.join(parts)))
                name, parts = text[1:], []
            else:
                parts.append(text)

    # Flush the record still open at end of file.
    if name is not None and parts:
        records.append((name, ''.join(parts)))

    return records

# Example usage: runs as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
# Echo each record as a FASTA header, the sequence, then the sequence length.
# The former print(line) was removed: `line` is local to parse_fasta_with_skip,
# so at this level it raised NameError (or printed a stale value left by another cell).
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


In [None]:
# View the raw text file: print every line, then the last line split on single spaces.
# The context manager closes the handle, which the bare open() call never did.
with open(r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt") as Text_File_Import:
    Text_lines = Text_File_Import.readlines()

# Stays empty when the file has no lines, so the final print cannot hit a NameError.
User_Inputs = []
for line in Text_lines:
    # Each iteration overwrites the previous value; only the last line's tokens remain.
    User_Inputs = line.split(' ')


print(Text_lines)
print(User_Inputs)

In [None]:
from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    """Parse a FASTA-style text file, dropping records whose sequence lines contain a space.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. Blank lines are ignored. When a non-header line contains a
    space, the record being collected is discarded and nothing more is saved
    until the next header.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order.
    """
    records = []

    def store(name, parts, rejected):
        """Queue the record unless it is missing, empty or rejected."""
        if name is not None and parts and not rejected:
            records.append((name, ''.join(parts)))

    with open(file_path, 'r') as handle:
        name, parts, rejected = None, [], False

        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue

            if text.startswith('>'):
                # Header line: close the previous record, then start a clean one.
                store(name, parts, rejected)
                name, parts, rejected = text[1:], [], False
            elif ' ' in text:
                # A space marks the record as invalid; forget what was collected.
                name, parts, rejected = None, [], True
            else:
                parts.append(text)

        # Flush the record still open at end of file.
        store(name, parts, rejected)

    return records

# Example usage: runs as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
# Echo each record as a FASTA header, the sequence, then the sequence length.
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))


In [None]:
#chatgpt 3, skip if 2 spaces, doesn't work
from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    """Parse a FASTA-style text file, dropping records with two or more spaces in their lines.

    A line starting with ``>`` opens a record and the text after ``>`` becomes
    its name. Blank lines are ignored. Spaces are counted cumulatively over
    the non-header lines of a record; once the count reaches two, the record
    is discarded and nothing more is saved until the next header. A line seen
    before that point is kept as-is, including any single space it contains.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order.
    """
    records = []

    def store(name, parts, spaces):
        """Queue the record unless it is missing, empty or over the space limit."""
        if name is not None and parts and spaces < 2:
            records.append((name, ''.join(parts)))

    with open(file_path, 'r') as handle:
        name, parts, spaces = None, [], 0

        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue

            if text.startswith('>'):
                # Header line: close the previous record and reset the counter.
                store(name, parts, spaces)
                name, parts, spaces = text[1:], [], 0
                continue

            spaces += text.count(' ')
            if spaces < 2:
                parts.append(text)
            else:
                # Too many spaces: forget the record collected so far.
                name, parts = None, []

        # Flush the record still open at end of file.
        store(name, parts, spaces)

    return records

# Example usage: runs as soon as this cell/script is executed.
# The input is a hard-coded absolute Windows path specific to one machine.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
# Echo each record as a FASTA header, the sequence, then the sequence length.
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}')
    print(sequence)
    print(len(sequence))



In [None]:
# ChatGPT attempt 2: skip a record if 2 line breaks are detected
from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    """Read a FASTA-like text file into (id, sequence) tuples.

    Every line is stripped and blank lines are ignored, so a record may be
    spread over several lines, with blank lines in between, and is still
    joined into one sequence. A line starting with '>' opens a new record
    whose id is the rest of that line.

    NOTE: this function used to carry a line-break counter meant to skip
    records containing line breaks. The check looked for a newline inside a
    line that had already been stripped, so it could never fire and no record
    was ever skipped. The dead counter has been removed; the returned data is
    unchanged.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of (sequence_id, sequence) tuples in file order. Records
        without any sequence line are omitted, as are lines that appear
        before the first header.
    """
    sequences = []
    sequence_id = None
    sequence_lines = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line:
                # Blank lines carry no data and do not end a record.
                continue

            if line.startswith('>'):
                # A new header closes the record collected so far.
                if sequence_id is not None and sequence_lines:
                    sequences.append((sequence_id, ''.join(sequence_lines)))
                sequence_id = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)

    # The last record has no following header, so save it here.
    if sequence_id is not None and sequence_lines:
        sequences.append((sequence_id, ''.join(sequence_lines)))

    return sequences

# Example usage: parse the text file below, then echo each kept record as a
# FASTA header line, the joined sequence, and the sequence length.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}', sequence, len(sequence), sep='\n')


In [None]:
from Bio import SeqIO

input_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"

output_file = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\output.txt"

# Both files are managed by a single `with`: the input handle is now closed
# when the block ends (it used to be opened inline and never closed), and the
# output file is opened for writing (it used to be opened in the default read
# mode, which fails if the file does not exist and can never be written to).
# NOTE: opening with 'w' replaces any existing output.txt.
with open(input_file) as in_file, open(output_file, 'w') as out_file:
    fasta_sequences = SeqIO.parse(in_file, 'fasta')
    for fasta in fasta_sequences:
        name, sequence = fasta.id, str(fasta.seq)
        # Placeholder: transform `sequence` here before it is written out.
        # Each record is written as a header line followed by its sequence.
        out_file.write(f'>{name}\n{sequence}\n')

In [None]:
#ChatGPT

from Bio import SeqIO

def parse_fasta_with_skip(file_path):
    """Read a FASTA-like text file into (id, sequence) tuples.

    Lines are stripped before use and blank lines are skipped. A line that
    starts with '>' begins a new record, with the text after '>' as its id;
    every other non-blank line is appended to the current record's sequence.

    NOTE: this function used to keep a `valid_sequence` flag that was only
    cleared when a newline was found inside an already-stripped line. That
    condition can never be true, so the flag was always True and no record
    was ever skipped. The flag and the unreachable branch have been removed;
    the returned data is unchanged.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of (sequence_id, sequence) tuples in file order. A header
        with no sequence lines yields no entry, and lines that precede the
        first header are ignored.
    """
    sequences = []
    sequence_id = None
    sequence_lines = []

    def save_current():
        # Store the record being collected, if it has an id and some data.
        if sequence_id is not None and sequence_lines:
            sequences.append((sequence_id, ''.join(sequence_lines)))

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line:
                # Blank lines are ignored; they do not terminate a record.
                continue

            if line.startswith('>'):
                save_current()
                sequence_id = line[1:]
                sequence_lines = []
            else:
                sequence_lines.append(line)

    # Flush the final record, which has no header after it.
    save_current()

    return sequences

# Example usage: parse the text file below, then echo each kept record as a
# FASTA header line, the joined sequence, and the sequence length.
file_path = r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt"
parsed_sequences = parse_fasta_with_skip(file_path)
for seq_id, sequence in parsed_sequences:
    print(f'>{seq_id}', sequence, len(sequence), sep='\n')


In [None]:
>>> from Bio import SeqIO
>>> for seq_record in SeqIO.parse(r"C:\Users\JoelTencer\OneDrive - Attovia\ATTO-005\TL1A Nanobody Selected for Jonathan text.txt", "fasta"):
...     print(seq_record.id)
...     print(repr(seq_record.seq))
...     print(len(seq_record))