In [1]:
import pandas as pd
from Bio.Seq import Seq

# Define the translation function
def translate_nucleotides_to_protein(nucleotide_sequence):
    """Translate a nucleotide sequence into a protein sequence string.

    Parameters
    ----------
    nucleotide_sequence : str
        Nucleotide sequence to translate. ``None`` and float values (pandas
        represents a missing CSV cell as float NaN, even with ``dtype=str``)
        are treated as "no sequence".

    Returns
    -------
    str or None
        The result of ``Seq(nucleotide_sequence).translate()`` as a plain
        string, or ``None`` when the input is missing or translation fails.
    """
    # A missing cell is not a translation failure: skip it quietly instead of
    # relying on Seq() raising and logging one error line per empty cell.
    if nucleotide_sequence is None or isinstance(nucleotide_sequence, float):
        return None

    try:
        return str(Seq(nucleotide_sequence).translate())
    except Exception as e:
        # Best effort: one bad sequence must not abort the whole batch.
        print(f"Error translating sequence: {nucleotide_sequence}, Error: {e}")
        return None

# Load the input CSV file; every column is read as text
input_csv_file = "df_predictions.csv"
df = pd.read_csv(input_csv_file, dtype=str)

# Translate both nucleotide columns. Iterating the two columns directly avoids
# building a Series per row (as iterrows() does). Within each pair the label is
# translated before the prediction, so any error messages stay in row order.
translated_pairs = [
    (
        translate_nucleotides_to_protein(label_nucleotide),
        translate_nucleotides_to_protein(prediction_nucleotide),
    )
    for label_nucleotide, prediction_nucleotide in zip(df['Labels'], df['Predictions'])
]

# Split the pairs into one list per output column (both empty for an empty frame)
labels_prot = [label_protein for label_protein, _ in translated_pairs]
predictions_prot = [prediction_protein for _, prediction_protein in translated_pairs]

# Add the new columns to the DataFrame
df['Labels_prot'] = labels_prot
df['Predictions_prot'] = predictions_prot

# Define the output CSV file
output_csv_file = "df_predictions_wt_prot.csv"

# Write the DataFrame to the new CSV file (the row index is not written)
df.to_csv(output_csv_file, index=False)

print(f"Output CSV file saved as {output_csv_file}")


Output CSV file saved as df_predictions_wt_prot.csv


In [2]:
import os
from multiprocessing import Pool

import pandas as pd
from Bio import Align

# Load the DataFrame from the CSV file.
# Only empty fields are treated as missing. pandas' default NA tokens include
# "NA", which is also a valid two-residue protein sequence (Asn-Ala) and would
# otherwise be silently replaced by NaN before alignment.
df = pd.read_csv(
    "df_predictions_wt_prot.csv",
    keep_default_na=False,
    na_values=[""],
)

# Create a PairwiseAligner with the desired parameters
aligner = Align.PairwiseAligner()
# Global alignment. NOTE(review): 'local' is the other documented mode;
# 'semiglobal' does not appear to be a valid mode value - confirm against the
# installed Biopython version.
aligner.mode = 'global'
aligner.open_gap_score = -5  # Score for opening a gap; adjust as needed
aligner.extend_gap_score = -1  # Score for extending a gap; adjust as needed

# Function to perform alignment and calculate scores
def align_and_calculate_score(args):
    """Align one label/prediction protein pair and summarise the result.

    Parameters
    ----------
    args : tuple
        ``(index, row)``. ``index`` is only used in diagnostic messages;
        ``row`` must support ``row["Labels_prot"]`` and
        ``row["Predictions_prot"]``.

    Returns
    -------
    tuple
        ``(alignment_score, identity)`` for the first alignment returned by
        the module-level ``aligner``. ``identity`` is
        ``identities / (gaps + identities + mismatches)``: a fraction, not a
        value scaled to 100, despite the "percentage" naming used by callers.
        ``(None, None)`` is returned when a sequence is missing, when no
        alignment is produced, or when the aligner raises.
    """
    index, row = args
    seq1 = row["Labels_prot"]
    seq2 = row["Predictions_prot"]

    # Missing CSV cells arrive as None or float NaN; report them explicitly
    # rather than letting the aligner fail with an unrelated-looking error.
    if any(seq is None or isinstance(seq, float) for seq in (seq1, seq2)):
        print(f"Error processing sequences at index {index}: missing sequence")
        return None, None

    try:
        # Perform pairwise alignment
        alignments = aligner.align(seq1, seq2)

        # Do not test `if alignments:`. Truthiness falls back to len(), which
        # counts every optimal alignment and can raise OverflowError when that
        # number is huge. Indexing only materialises the first alignment.
        try:
            alignment = alignments[0]
        except IndexError:
            # No alignment found for this pair of sequences
            return None, None

        alignment_score = alignment.score

        # Identity as a fraction of all aligned columns (gaps included)
        c = alignment.counts()
        aligned_columns = c.gaps + c.identities + c.mismatches
        if aligned_columns > 0:
            identity_percentage = c.identities / aligned_columns
        else:
            identity_percentage = 0

        return alignment_score, identity_percentage
    except Exception as e:
        # Best effort: one failing pair must not abort the whole batch.
        print(f"Error processing sequences at index {index}: {e}")
        return None, None

# Create a list of arguments for the function.
# Each task carries only the two sequences the worker reads, not a full row
# Series, so far less data is pickled and sent to the worker processes.
args_list = [
    (index, {"Labels_prot": label_prot, "Predictions_prot": prediction_prot})
    for index, label_prot, prediction_prot in zip(
        df.index, df["Labels_prot"], df["Predictions_prot"]
    )
]

# Create a multiprocessing pool.
# num_processes is an upper bound: never start more workers than there are
# CPUs or tasks.
# NOTE(review): the workers must be able to resolve
# `align_and_calculate_score` and `aligner`; confirm this notebook runs with a
# start method where notebook-level definitions are visible to child processes.
num_processes = 62  # Adjust as needed
worker_count = min(num_processes, os.cpu_count() or 1, len(args_list))

if worker_count > 0:
    with Pool(processes=worker_count) as pool:
        results = pool.map(align_and_calculate_score, args_list)
else:
    # Empty input: Pool(processes=0) would raise, and there is nothing to align.
    results = []

# Extract alignment scores and identity values from the results.
# List comprehensions also handle an empty result list, where
# `a, b = zip(*results)` would fail to unpack.
alignment_scores = [score for score, _ in results]
identity_percentages = [identity for _, identity in results]

# Add new columns for alignment scores and identity values to the DataFrame
df["Alignment_Score"] = alignment_scores
df["Identity_Percentage"] = identity_percentages

# Save the updated DataFrame to a new CSV file
df.to_csv("df_predictions_wt_prot_alignscore.csv", index=False)
