In [69]:
import csv

def _feature_record(locus_tag, start, end, qualifiers):
    """Build one CSV row from the current locus tag, coordinates and qualifiers."""
    return {'Locus Tag': locus_tag, 'Start': start, 'End': end, **qualifiers}


def parse_txt_to_csv(txt_file, csv_file):
    """Convert a feature-table text file into a CSV with one row per feature.

    The input is read line by line (surrounding whitespace is stripped and
    blank lines are skipped):

    * A line whose first character is a digit starts a new feature; its first
      two whitespace-separated fields become ``Start`` and ``End``.
    * Any other line is treated as a tab-separated ``key<TAB>value`` qualifier.
      The keys ``locus_tag``, ``gene``, ``product``, ``EC_number`` and
      ``protein_id`` are recorded; every other key is ignored.

    NOTE(review): this looks like an NCBI-style 5-column feature table --
    confirm against the tool that produced the input.

    The locus tag is deliberately *not* reset when a new feature starts, so a
    feature without its own ``locus_tag`` inherits the previous one. Rows that
    share (Locus Tag, Start, End) are then merged, with later non-empty values
    overriding earlier ones.

    Args:
        txt_file: Path of the feature-table text file to read.
        csv_file: Path of the CSV file to write (overwritten if it exists).

    Returns:
        None. The result is written to ``csv_file``.
    """
    columns = ['Locus Tag', 'Start', 'End', 'Gene', 'Product', 'Protein ID', 'EC Number']
    qualifier_columns = ('Gene', 'Product', 'EC Number', 'Protein ID')
    # Qualifier key in the text file -> CSV column it fills verbatim.
    simple_qualifiers = {'gene': 'Gene', 'product': 'Product', 'EC_number': 'EC Number'}

    records = []
    locus_tag = None
    start = end = None
    qualifiers = dict.fromkeys(qualifier_columns)

    with open(txt_file, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            if line[0].isdigit():
                # A new feature begins: store the one collected so far.
                if locus_tag is not None:
                    records.append(_feature_record(locus_tag, start, end, qualifiers))

                parts = line.split()
                if len(parts) >= 3:
                    start, end = parts[0], parts[1]
                else:
                    # Coordinates are left as they were; the merge step below
                    # folds any following qualifiers into the previous row.
                    print(f"Warning: Line with unexpected format: {line}")

                qualifiers = dict.fromkeys(qualifier_columns)
                continue

            fields = line.split('\t')
            # Match the key exactly so that e.g. 'gene_synonym' cannot
            # overwrite the value recorded for 'gene'.
            key = fields[0].strip()
            value = fields[1] if len(fields) > 1 else None

            if key == 'locus_tag':
                locus_tag = value
            elif key in simple_qualifiers:
                qualifiers[simple_qualifiers[key]] = value
            elif key == 'protein_id' and value:
                # Keep the second '|'-delimited field (e.g. 'gb|ACC.1|' ->
                # 'ACC.1'); an id without any '|' is kept whole.
                id_fields = value.split('|')
                qualifiers['Protein ID'] = id_fields[1] if len(id_fields) > 1 else value

    # Store the feature that was still being collected at end of file.
    if locus_tag is not None:
        records.append(_feature_record(locus_tag, start, end, qualifiers))

    # Merge rows with the same Locus Tag, Start and End into a single row.
    merged = {}
    for row in records:
        key = (row['Locus Tag'], row['Start'], row['End'])
        if key not in merged:
            merged[key] = row
            continue
        target = merged[key]
        for column in qualifier_columns:
            if row[column]:
                target[column] = row[column]

    with open(csv_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(merged.values())

# Example usage: convert the strain 630 feature table to CSV.
# Replace both names below with your own input TXT / output CSV file names.
txt_file, csv_file = '630_Feature Table.txt', '630_Feature Table.csv'

parse_txt_to_csv(txt_file=txt_file, csv_file=csv_file)
print(f"Data has been written to '{csv_file}'")

Data has been written to '630_Feature Table.csv'


In [72]:
import pandas as pd

def _read_fasta_sequence(fasta_file):
    """Return all non-header lines of ``fasta_file`` joined into one string.

    Lines starting with '>' are dropped; every other line is stripped of
    surrounding whitespace and concatenated, so a file with several records
    yields a single combined sequence.
    """
    with open(fasta_file, 'r') as file:
        return ''.join(line.strip() for line in file if not line.startswith('>'))


def _slice_sequence(sequence, start, end):
    """Return ``sequence[start:end]`` after checking the range is in bounds."""
    if start < 0 or end > len(sequence):
        raise ValueError("Range is out of bounds of the sequence length.")
    return sequence[start:end]


def extract_sequence_from_fasta(fasta_file, start, end):
    """Extract the half-open, zero-based range [start, end) from a FASTA file.

    Args:
        fasta_file: Path of the FASTA file to read.
        start: Zero-based index of the first character to return.
        end: Index one past the last character to return.

    Returns:
        The requested substring of the concatenated sequence.

    Raises:
        ValueError: If ``start`` is negative or ``end`` exceeds the sequence
            length. (ValueError is a subclass of Exception, so callers that
            catch Exception keep working.)
        OSError: If the file cannot be opened.
    """
    return _slice_sequence(_read_fasta_sequence(fasta_file), start, end)


def process_csv_and_fasta(csv_file, fasta_file, output_csv_full, output_csv_nt_seq):
    """Add an 'NT Seq' column to a feature CSV and write two output CSVs.

    For every row the 1-based, inclusive 'Start'/'End' coordinates are used to
    slice the sequence read from ``fasta_file``. If Start is greater than End
    the two are swapped first.

    NOTE(review): for swapped coordinates the slice is returned as stored in
    the FASTA file; no reverse complement is taken -- confirm that is intended.

    Rows that cannot be processed (non-numeric or out-of-range coordinates,
    unreadable FASTA file) are reported on stdout and get an empty 'NT Seq'
    value; they do not abort the run.

    Args:
        csv_file: Input CSV with 'Locus Tag', 'Start' and 'End' columns.
        fasta_file: FASTA file holding the sequence to slice.
        output_csv_full: Output path for all input columns plus 'NT Seq'.
        output_csv_nt_seq: Output path for only 'Locus Tag' and 'NT Seq'.

    Returns:
        None. Results are written to the two output paths.
    """
    df = pd.read_csv(csv_file)

    # Read the sequence once instead of re-reading the file for every row.
    try:
        sequence = _read_fasta_sequence(fasta_file)
        load_error = None
    except (OSError, ValueError) as e:  # ValueError covers decoding errors
        sequence, load_error = None, e

    nt_sequences = []
    for locus_tag, raw_start, raw_end in zip(df['Locus Tag'], df['Start'], df['End']):
        if load_error is not None:
            # Keep the per-row best-effort reporting when the FASTA is unusable.
            print(f"Error processing {locus_tag}: {load_error}")
            nt_sequences.append(None)
            continue

        try:
            # Order the coordinates so the lower one is always the start.
            start, end = sorted((int(raw_start), int(raw_end)))
            # Subtract 1 to turn the 1-based start into a zero-based index.
            nt_sequences.append(_slice_sequence(sequence, start - 1, end))
        except (TypeError, ValueError, OverflowError) as e:
            print(f"Error processing {locus_tag}: {e}")
            nt_sequences.append(None)

    df['NT Seq'] = nt_sequences

    # Full table: every original column plus the extracted sequence.
    df.to_csv(output_csv_full, index=False)

    # Compact table: only the locus tag and its sequence.
    df[['Locus Tag', 'NT Seq']].to_csv(output_csv_nt_seq, index=False)

# Example usage: attach nucleotide sequences to the S0253 feature table.
# Inputs
fasta_file = 'S0253_complete genome.fasta'
csv_file = 'S0253_Feature Table.csv'
# Outputs
output_csv_nt_seq = 'S0253_NT_Sequences.csv'
output_csv_full = 'S0253_Feature Table_with_NT_Sequences.csv'

process_csv_and_fasta(
    csv_file=csv_file,
    fasta_file=fasta_file,
    output_csv_full=output_csv_full,
    output_csv_nt_seq=output_csv_nt_seq,
)
print(f"Extracted sequences saved to '{output_csv_full}' and '{output_csv_nt_seq}'")

Extracted sequences saved to 'S0253_Feature Table_with_NT_Sequences.csv' and 'S0253_NT_Sequences.csv'
