In [3]:
import csv
import os
import re

def subtract_length_from_numbers(genbank_map, length, threshold=678):
    """Shift every number at or above ``threshold`` down by ``length``.

    Args:
        genbank_map: Text of the genbank map to rewrite.
        length: Integer amount subtracted from each qualifying number.
        threshold: Smallest number that gets shifted. Defaults to 678, the
            cut-off that was previously hard-coded.

    Returns:
        The text with every standalone integer >= ``threshold`` replaced by
        its value minus ``length``. Smaller numbers, numbers with a leading
        zero, and digits glued to letters/underscores are left untouched.
    """
    # Match whole integers (no leading zero) and compare numerically.
    # The earlier hand-written alternation 678|679|[7-9]\d{2,}|[1-9]\d{3,}
    # silently skipped 680-699, leaving those numbers unshifted.
    pattern = re.compile(r'\b[1-9]\d*\b')

    def shift(match):
        value = int(match.group())
        if value < threshold:
            return match.group()
        return str(value - length)

    return pattern.sub(shift, genbank_map)

def replace_name_in_genbank(genbank_map, name):
    """Swap the placeholder "MERS" for ``name`` while keeping column widths.

    Every run of "MERS" plus its trailing whitespace is replaced by ``name``
    padded with spaces to the same total width, so the text that follows
    stays in the same column.

    Args:
        genbank_map: Text of the genbank map to rewrite.
        name: Replacement sequence name, inserted literally.

    Returns:
        The rewritten text. If "MERS" does not occur, the input is returned
        unchanged.
    """
    pattern = re.compile(r'(MERS\s+)')

    def substitute(match):
        # Pad to the width of *this* occurrence (not just the first one), and
        # always keep one separator so a long name never fuses with the token
        # that follows it.
        padding = max(1, len(match.group()) - len(name))
        return name + ' ' * padding

    # A callable replacement inserts ``name`` verbatim (backslashes are not
    # parsed as regex escapes/group references), and a map without "MERS"
    # simply passes through instead of raising IndexError.
    # NOTE(review): \s+ also matches newlines, so a "MERS" at the end of a
    # line has its line break folded into the padding — confirm intended.
    return pattern.sub(substitute, genbank_map)

# Create a subfolder for output files; exist_ok makes this a single atomic
# call instead of a check-then-create pair
output_folder = 'gb_outputs'
os.makedirs(output_folder, exist_ok=True)

# Open the CSV file and process each line.
# Each non-empty row must hold exactly three fields: name, length, genbank map.
with open('processed_seqs.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # skip them rather than failing on the unpack below
            continue

        name, length, genbank_map = row
        length = int(length)

        # Step 1: Subtract length from numbers >= 678 in the genbank map
        genbank_map_step1 = subtract_length_from_numbers(genbank_map, length)

        # Step 2: Replace "MERS" with the value in the "name" column and adjust spaces
        genbank_map_step2 = replace_name_in_genbank(genbank_map_step1, name)

        # Write the processed genbank map to <output_folder>/<name>.gb
        output_file_path = os.path.join(output_folder, name + '.gb')
        with open(output_file_path, 'w') as outfile:
            outfile.write(genbank_map_step2)


Manually open each gb file in ApE and resave it without modification. This reindexes the numbering in the 'sequence' portion of the genbank file. However, it also resets the spacing in the top (LOCUS) line, which causes errors in the genbank parser we use to input these files into the PacBio processing script. We therefore need to reload each gb file, update the spacing again, and then concatenate the files into our final output file.

In [19]:
import os
import re

def _rewrite_locus_line(line, sequence_name):
    """Return ``line`` with its LOCUS name replaced and the gap before "<n> bp" rebuilt."""
    # Layout kept from the original: name left-justified to 16 columns, then
    # (27 - len(name)) spaces before the length. At least one space is kept so
    # a name of 27+ characters cannot run straight into the length digits.
    gap = max(1, 27 - len(sequence_name))
    prefix = f'LOCUS       {sequence_name.ljust(16)}{" " * gap}'
    # Callable replacement: the name is inserted verbatim, never parsed as a
    # regex replacement template.
    return re.sub(r'LOCUS\s+\S+\s+(\s*\d+\s+bp)', lambda m: prefix + m.group(1), line)


def adjust_genbank_files(folder_path, output_path='final_genbanks.gb'):
    """Fix the LOCUS line of every .gb file in a folder and concatenate them.

    For each non-hidden ``*.gb`` file in ``folder_path`` the first line that
    starts with "LOCUS" has its sequence name replaced by the file name
    (without the ".gb" suffix) and its spacing before "<n> bp" rebuilt. All
    files are then written, in sorted file-name order, to ``output_path``.

    Args:
        folder_path: Folder containing the genbank files.
        output_path: Destination of the concatenated file. Defaults to
            'final_genbanks.gb' in the current working directory.
    """
    final_genbank_content = []
    output_abspath = os.path.abspath(output_path)

    # os.listdir returns entries in arbitrary order; sort so the concatenated
    # output is reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        # Skip hidden files and anything that is not a genbank file
        if file_name.startswith('.') or not file_name.endswith('.gb'):
            continue

        file_path = os.path.join(folder_path, file_name)

        # Never feed a previous run's output back into itself
        if os.path.abspath(file_path) == output_abspath:
            continue

        # The sequence name is the file name minus the ".gb" suffix
        sequence_name = os.path.splitext(file_name)[0]

        with open(file_path, 'r', encoding='latin-1') as file:
            genbank_content = file.readlines()

        # Only the first LOCUS line of each file is rewritten
        for index, line in enumerate(genbank_content):
            if line.startswith('LOCUS'):
                genbank_content[index] = _rewrite_locus_line(line, sequence_name)
                break

        # Make sure the record ends with a newline so the next file's LOCUS
        # line does not get glued onto this file's last line.
        if genbank_content and not genbank_content[-1].endswith('\n'):
            genbank_content[-1] += '\n'

        final_genbank_content.extend(genbank_content)

    # Write the concatenated genbank content to the output file
    with open(output_path, 'w', encoding='latin-1') as file:
        file.writelines(final_genbank_content)

# Folder holding the .gb files to adjust (the same 'gb_outputs' folder the first cell writes to)
folder_path = 'gb_outputs'

# Rewrite each file's LOCUS line and concatenate everything into 'final_genbanks.gb'
adjust_genbank_files(folder_path)
