In [None]:
import pandas as pd

# Default minimum percentage a nucleotide needs to be called at a position
THRESHOLD = 60


def process_cell(cell, threshold=None):
    """Return the consensus symbol for one cell of nucleotide percentages.

    A cell is expected to look like ``"A 70%;C 30%"``: ``;``-separated pairs,
    each pair being a nucleotide and a percentage separated by whitespace.

    Args:
        cell: The cell text. ``None`` and NaN are treated as an empty cell
            (pandas yields NaN for empty CSV fields by default).
        threshold: Minimum percentage (inclusive) for a nucleotide to be
            returned. Defaults to the module-level ``THRESHOLD``.

    Returns:
        ``'*'`` for an empty/missing/whitespace-only cell, the first
        nucleotide whose percentage is >= ``threshold``, or ``'N'`` when no
        nucleotide reaches the threshold.

    Raises:
        ValueError: If a non-empty pair is not "<nucleotide> <percentage>"
            or the percentage is not numeric.
    """
    if threshold is None:
        threshold = THRESHOLD

    # Missing values: None, or a float NaN (NaN is the only float != itself)
    if cell is None or (isinstance(cell, float) and cell != cell):
        return '*'

    # Empty cells or cells with only whitespace
    if cell.strip() == '':
        return '*'

    for pair in cell.split(';'):
        pair = pair.strip()
        # Skip empty fragments, e.g. from a trailing ';' or '; ;'
        if not pair:
            continue
        nucleotide, percentage = pair.split()
        if float(percentage.rstrip('%')) >= threshold:
            return nucleotide

    # No nucleotide reached the threshold
    return 'N'

# Read the per-position nucleotide table from disk
df = pd.read_csv('./output_fasta/refined_primer256/common_lib/DOG_output.csv')

# Skip the first column (it holds the "Consensus" label); the rest are data
data_columns = df.columns[1:]

# Call one symbol per data column from the first row and concatenate them
final_sequence = ''.join(
    process_cell(df[col].iloc[0]) for col in data_columns
)

# Show the sequence together with its length minus one
# NOTE(review): the "-1" presumably discounts a trailing '*' column - confirm
print(final_sequence, len(final_sequence) - 1)


In [24]:
'''Find the most probable sequence'''
# IUPAC ambiguity codes for a two-way tie, keyed by the unordered nucleotide pair
_IUPAC_PAIR_CODES = {
    frozenset(['A', 'G']): 'R',
    frozenset(['C', 'T']): 'Y',
    frozenset(['G', 'C']): 'S',
    frozenset(['A', 'T']): 'W',
    frozenset(['G', 'T']): 'K',
    frozenset(['A', 'C']): 'M',
}


def process_cell_iupac(cell):
    """Return the most probable symbol for one cell of nucleotide percentages.

    A cell is expected to look like ``"A 50%;G 50%"``: ``;``-separated pairs,
    each pair being a nucleotide and a percentage separated by whitespace.

    Args:
        cell: The cell text. ``None`` and NaN are treated as an empty cell
            (pandas yields NaN for empty CSV fields by default).

    Returns:
        ``'*'`` for an empty/missing/whitespace-only cell; the nucleotide
        with the single highest percentage; the two-letter IUPAC ambiguity
        code when exactly two of A/C/G/T tie for the highest percentage;
        ``'N'`` when more than two tie, when a two-way tie involves a symbol
        outside A/C/G/T, or when the cell contains no pairs at all.

    Raises:
        ValueError: If a non-empty pair is not "<nucleotide> <percentage>"
            or the percentage is not numeric.
    """
    # Missing values: None, or a float NaN (NaN is the only float != itself)
    if cell is None or (isinstance(cell, float) and cell != cell):
        return '*'

    # Empty cells or cells with only whitespace
    if cell.strip() == '':
        return '*'

    # Map each nucleotide to its percentage
    percentages = {}
    for pair in cell.split(';'):
        pair = pair.strip()
        # Skip empty fragments, e.g. from a trailing ';' or '; ;'
        if not pair:
            continue
        nucleotide, percentage = pair.split()
        percentages[nucleotide] = float(percentage.rstrip('%'))

    # Only separators were present (e.g. ";"): nothing can be called
    if not percentages:
        return 'N'

    max_percentage = max(percentages.values())
    max_nucleotides = [
        nucleotide
        for nucleotide, percentage in percentages.items()
        if percentage == max_percentage
    ]

    if len(max_nucleotides) == 1:
        return max_nucleotides[0]
    if len(max_nucleotides) == 2:
        # Unknown pairs (e.g. a gap symbol) have no two-letter code
        return _IUPAC_PAIR_CODES.get(frozenset(max_nucleotides), 'N')
    # Three or more nucleotides share the maximum
    return 'N'

# Build the most probable sequence from the first row of each selected column.
# Only the first-row cell is parsed: applying the function to the whole column
# and then taking .iloc[0] gives the same value but parses every other row too
# (and fails if any of those rows is malformed).
most_probable_sequence = ''.join(
    process_cell_iupac(df[col].iloc[0]) for col in data_columns
)

# Show the sequence together with its length minus one
# NOTE(review): the "-1" presumably discounts a trailing '*' column - confirm
print(most_probable_sequence, len(most_probable_sequence) - 1)


GGGACGACGTGTTTTTTCCAAAGACGGAGTKSGTCNTCSCCGTCCC* 46
