In [2]:
import pandas as pd
import os
import glob
from Bio import SeqIO
import subprocess
# Example input paths. Neither name is referenced by any function in the
# cells shown in this notebook section.
fastaFile = 'Genomes/DOUBLETESTER.fasta'
Mito = 'Mitochondria_NC_012920_1.fasta'



#######################################################################
GENOMES = 'Genomes/' # Directory searched recursively for FASTA files (see assassinate(GENOMES))
#######################################################################
# Handed to the G4Hunter script as its '-o' argument in runG4().
RESULTSFILEPATH = 'Results'
#######################################################################


# Global numeric settings forwarded to G4Hunter by runG4():
# WINDOWVAL is sent as '-w' and THRESHOLDVAL as '-s'. Both are also copied
# into the per-file statistics under the WINDOW / THRESHOLD keys.
WINDOWVAL = 25
THRESHOLDVAL = 2.0


# Global strings
# Path of the script that runG4() launches with python3.
G4HUNTER = 'G4HunterEDITED.py'


# Output stats globals: dictionary keys of the per-file statistics, which
# become the column headers of the CSV written by assassinate().
WINDOW = 'Window'
THRESHOLD = 'Threshold'
NPQS = 'Number of PQS'
BP = 'Base Pairs'
NGC = 'Number of GCs'
PGC = 'Percentage of GCs'
FRQ = 'Frequency of PQS'
# SPECIES is only used by a commented-out line in assassinate().
SPECIES = 'Species'

# G4 Result .txt file to Database Globals: field names for one parsed result
# row. START is also used by process_txt_file() to recognise header lines.
SEQ_ID = 'Sequence_ID'
START = 'Start'
END = 'End'
SEQUENCE = 'Sequence'
LENGTH = 'Length'
SCORE = 'Score'

In [3]:
# Functions
def process_txt_file(file_path):
    """Sum the PQS counts listed in a G4Hunter result text file.

    The file is scanned line by line:
      * blank lines are ignored;
      * lines starting with '>' mark a new sequence section and carry no count;
      * lines starting with the ``START`` column name are table headers;
      * a data row with exactly six whitespace-separated columns contributes
        its sixth column to the total.
    Rows with any other number of columns (including five-column rows, which
    have no count column) add nothing.

    Args:
        file_path: Path of the result ``.txt`` file to read.

    Returns:
        int: The sum of the sixth column over all six-column data rows,
        or 0 when the file contains none.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the sixth column of a six-column row is not an integer.
    """
    pqs_total = 0

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            # Nothing to count on blank lines, '>' section markers or headers.
            if not line or line.startswith('>') or line.startswith(START):
                continue

            parts = line.split()
            # Only the trailing count is needed. Per-row records are no longer
            # collected because nothing consumed them, and keeping one dict per
            # hit made memory grow with the size of the result file.
            if len(parts) == 6:
                pqs_total += int(parts[5])

    return pqs_total

def runG4(inputFile):
    """Run the G4Hunter script on one FASTA file and return what it printed.

    The script named by ``G4HUNTER`` is started with ``python3`` and receives
    the input file ('-i'), ``RESULTSFILEPATH`` ('-o'), ``WINDOWVAL`` ('-w')
    and ``THRESHOLDVAL`` ('-s').

    Args:
        inputFile: Path of the FASTA file to analyse.

    Returns:
        str: The script's standard output with surrounding whitespace removed.
        ``assassinate`` uses this value as the path of the generated result
        file and treats an empty string as a failed run.
    """
    print('Running G4Hunter:', inputFile)
    command = ['python3', G4HUNTER, '-i', inputFile, '-o', RESULTSFILEPATH,
               '-w', str(WINDOWVAL), '-s', str(THRESHOLDVAL)]
    result = subprocess.run(command, capture_output=True, text=True)
    output = result.stdout.strip()

    # Output is captured, so without this a crashing script would only show
    # up as an unexplained empty return value. Show what it wrote to stderr.
    if result.returncode != 0 or not output:
        errors = result.stderr.strip()
        if errors:
            print('G4Hunter exited with code', result.returncode, 'for', inputFile)
            print(errors)

    return output


def overallStats(filePath):
    """Build the initial statistics dictionary for one FASTA file.

    Every record in the file is upper-cased; its length is added to the base
    total and its 'G' and 'C' occurrences to the GC total.

    Args:
        filePath: Path of the FASTA file to parse.

    Returns:
        dict: Keys NPQS, BP, NGC, PGC, FRQ, WINDOW and THRESHOLD (in that
        order). BP and NGC hold the totals computed here, WINDOW/THRESHOLD
        hold ``WINDOWVAL``/``THRESHOLDVAL``, and NPQS, PGC and FRQ are left
        at zero for the caller to fill in.
    """
    total_bases = 0
    total_gc = 0

    # Walk through every record of the FASTA file
    for entry in SeqIO.parse(filePath, "fasta"):
        bases = entry.seq.upper()  # count case-insensitively
        total_bases += len(bases)
        total_gc += bases.count("G") + bases.count("C")

    return {
        NPQS: 0,
        BP: total_bases,
        NGC: total_gc,
        PGC: 0.0,
        FRQ: 0.0,
        WINDOW: WINDOWVAL,
        THRESHOLD: THRESHOLDVAL,
    }

def findFastaFiles(directory):
    """Collect the FASTA files found in ``directory`` and its subdirectories.

    Args:
        directory: Root folder of the search.

    Returns:
        list: Paths of all files ending in .fasta, .fa, .fna, .ffn, .faa or
        .frn, grouped by extension in that order. Empty when nothing matches.
    """
    # Filename patterns that count as FASTA files
    patterns = ('*.fasta', '*.fa', '*.fna', '*.ffn', '*.faa', '*.frn')

    # '**' together with recursive=True makes glob descend into subfolders
    return [
        match
        for pattern in patterns
        for match in glob.glob(os.path.join(directory, '**', pattern), recursive=True)
    ]

def fullStop(filepath):
    """Rename a file so that only its extension still contains a dot.

    Every '.' in the filename before the final extension is replaced by '_'
    and the file is renamed on disk within the same directory.

    Args:
        filepath: Path of the existing file.

    Returns:
        str: The path the file was renamed to.
    """
    folder = os.path.dirname(filepath)

    # The last dot-suffix is the extension and is kept untouched
    stem, suffix = os.path.splitext(os.path.basename(filepath))

    renamed_path = os.path.join(folder, stem.replace(".", "_") + suffix)
    os.rename(filepath, renamed_path)

    return renamed_path
    
def extractSpecies(filepath):
    """Derive an abbreviated species label from a '/'-separated file path.

    The second path component supplies the genus (its text before the first
    underscore) and the third component is used verbatim as the species,
    e.g. 'Genomes/Phytomonas_x/serpens 9T/file.fa' gives 'P.serpens 9T'.

    Args:
        filepath: Path with at least three '/'-separated components.

    Returns:
        str: First letter of the genus, a dot, then the species.

    Raises:
        IndexError: If the path has fewer than three components or the
            genus part is empty.
    """
    segments = filepath.split('/')

    # Genus: second component, cut at the first underscore
    genus = segments[1].partition('_')[0]

    # Species: third component, kept as is
    species = segments[2]

    return genus[0] + "." + species



def main(filePath):
    """Process one FASTA file: run G4Hunter first, then gather its base statistics.

    Args:
        filePath: Path of the FASTA file.

    Returns:
        tuple: ``(runG4(filePath), overallStats(filePath))``, i.e. the text
        printed by G4Hunter and the statistics dictionary.
    """
    g4_output = runG4(filePath)
    file_stats = overallStats(filePath)
    return g4_output, file_stats


###############Main Function##############

def assassinate(filePath):
    """Run the G4Hunter pipeline on every FASTA file below ``filePath``.

    For each file found by ``findFastaFiles`` the function
      1. runs ``main`` to get the G4Hunter output path and base statistics;
      2. if no output path came back, renames the file with ``fullStop``
         (dots in the name become underscores) and tries once more, skipping
         the file when the second attempt also yields nothing;
      3. fills in the PQS count, GC ratio and PQS frequency;
      4. writes the statistics as a one-row CSV next to the G4Hunter result
         file, using that file's base name with a ``.csv`` extension.

    Args:
        filePath: Directory that is searched recursively for FASTA files.

    Returns:
        None. Progress is printed and results are written to disk.
    """
    for file in findFastaFiles(filePath):
        print('Processing:', file)
        # Named `stats` so the local does not shadow the overallStats function.
        generatedFilepath, stats = main(file)
        if generatedFilepath == '':
            print(file, 'had issues. Renaming file')

            generatedFilepath, stats = main(fullStop(file))
            if generatedFilepath == '':
                print('COMPLETE FAILURE OF:', file)
                continue

        stats[NPQS] = process_txt_file(generatedFilepath)
        if stats[BP] > 0:
            # NOTE(review): PGC is labelled 'Percentage of GCs' but holds a
            # fraction between 0 and 1 - confirm which one consumers expect.
            stats[PGC] = stats[NGC] / stats[BP]
            stats[FRQ] = stats[NPQS] / stats[BP]
        else:
            # A FASTA file without any bases would otherwise abort the whole
            # batch with ZeroDivisionError.
            print('WARNING:', file, 'contains no bases; ratios set to 0.0')
            stats[PGC] = 0.0
            stats[FRQ] = 0.0
        #stats[SPECIES] = extractSpecies(file)

        # Save output data
        directory = os.path.dirname(generatedFilepath)
        base_name = os.path.splitext(os.path.basename(generatedFilepath))[0]
        csv_file_path = os.path.join(directory, f"{base_name}.csv")
        pd.DataFrame([stats]).to_csv(csv_file_path, index=False)
        print('Saved to', csv_file_path)
    print('###########################')
    print('#####Analysis Complete#####')
    print('###########################')
    return

In [4]:
# Run the pipeline on every FASTA file found under GENOMES.
# NOTE(review): the saved output below lists paths under 'TEST/' and
# 'TODAYSTESTRESULTS/', which do not match the GENOMES / RESULTSFILEPATH values
# set in the configuration cell - it appears to come from a run with different
# settings; restart the kernel and re-run to refresh it.
assassinate(GENOMES)

Processing: TEST/Leishmania/mexicana/GCA_000234665_4_ASM23466v4_genomic.fna
Running G4Hunter: TEST/Leishmania/mexicana/GCA_000234665_4_ASM23466v4_genomic.fna
Saved to TODAYSTESTRESULTS/Results_GCA_000234665_4_ASM23466v4_genomic/GCA_000234665_4_ASM23466v4_genomic-Merged.csv
###########################
#####Analysis Complete#####
###########################
