In [27]:
from Bio.Seq import Seq
from Bio import SeqIO
import pandas as pd
import os

In [28]:
"""
Input for the code:
    $ A FASTA file containing the DNA sequence that you want to edit with gRNAs (e.g. pXW487, in FASTA format)
    $ A text file containing:
         gRNA sequences
         PAM sequences

Output:
    $ Oligos that make up the donor DNA used to replace each gRNA+PAM site with a common placeholder
"""

# File names, all resolved relative to the current user's Desktop folder.
SEQ_FILENAME = "pXW487.fasta"                  # FASTA with the sequence to edit
gRNA_FILENAME = "gRNA_PAM_pXW487.txt"          # table with gRNA and PAM sequences
Oligo_FILENAME = "Donor_oligos_2023_08_10.txt" # output table of donor oligos

desktop = os.path.expanduser("~/Desktop")

# Build the three full paths in one pass; "/" is used as the separator for all of them.
seq_file_path, gRNA_file_path, oligo_file_path = (
    f"{desktop}/{file_name}"
    for file_name in (SEQ_FILENAME, gRNA_FILENAME, Oligo_FILENAME)
)

In [29]:
# Load the fasta sequence from the file
def load_fasta_sequence(file_path):
    """Return the upper-cased sequence of the first record in a FASTA file.

    Only the first record is used; any further records in the file are ignored.

    Args:
        file_path: Path to the FASTA file.

    Returns:
        The first record's sequence, upper-cased via ``record.seq.upper()``.

    Raises:
        ValueError: If the file contains no FASTA records. Previously the
            function fell off the end and returned ``None``, which only failed
            later when the result was searched.
    """
    with open(file_path, "r") as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
        if first_record is None:
            raise ValueError(f"No FASTA records found in {file_path!r}")
        return first_record.seq.upper()

In [34]:

# Sequence that the guides are searched against (upper-cased by the loader).
my_sequence = load_fasta_sequence(seq_file_path)

# Tab-separated guide table; every gRNA entry is upper-cased and wrapped in a Seq.
gRNA_df = pd.read_csv(gRNA_file_path, sep='\t')
guide_strings = gRNA_df["gRNA"]
gRNA_df["gRNA"] = guide_strings.apply(lambda guide: Seq(guide.upper()))

# Display the loaded table as the cell output.
gRNA_df

Unnamed: 0,SNP,gRNA,PAM,PAM_pos,strand
0,rs10774035,"(C, C, C, C, T, T, G, C, A, A, C, A, T, C, T, ...",CGG,82431,-
1,rs10774036,"(C, A, A, T, T, T, T, T, G, G, T, G, A, G, C, ...",TGG,100699,-
2,rs10744560,"(C, T, C, A, G, C, A, C, G, A, T, G, A, G, T, ...",TGG,100850,-
3,rs12311439,"(A, C, T, C, T, G, C, T, G, T, G, C, C, G, C, ...",TGG,108536,+
4,rs1024582,"(G, T, A, T, T, A, T, T, T, G, G, T, T, G, T, ...",GGG,116001,+
5,rs4298967,"(A, G, G, C, T, G, A, A, C, G, G, A, G, C, T, ...",GGG,121942,+


In [35]:
# Common 23-nt placeholder that is written into every donor oligo in place of
# the gRNA+PAM site, plus its reverse complement for the reverse oligo.
placeholder_text = "CCTCTTAGACGAAGGAAGTGTGG"
placeholder = Seq(placeholder_text)
placeholder_rc = placeholder.reverse_complement()

In [36]:
# Concatenate each guide with its PAM, row by row, into a new "gRNA+PAM" column.
guide_column = gRNA_df["gRNA"]
pam_column = gRNA_df["PAM"]
gRNA_df["gRNA+PAM"] = guide_column + pam_column

In [39]:
# Length (nt) of the flanking sequence taken on each side of the gRNA+PAM site.
FLANK_LENGTH = 37


def _build_donor_oligos(sequence, site_start, site_length, flank_length=FLANK_LENGTH):
    """Build the forward and reverse donor oligos for one gRNA+PAM site.

    The forward oligo is the upstream flank followed by ``placeholder``; the
    reverse oligo is the reverse complement of the downstream flank followed by
    ``placeholder_rc``. Both oligos therefore share the placeholder region.

    Args:
        sequence: Sequence the site was found in (sliceable; slices are wrapped in Seq).
        site_start: 0-based index in ``sequence`` where the matched site begins.
        site_length: Length of the matched gRNA+PAM site.
        flank_length: Number of bases to take on each side of the site.

    Returns:
        A ``(forward_oligo, reverse_oligo)`` tuple of plain strings. A flank is
        shorter than ``flank_length`` when the site lies close to either end of
        ``sequence``.
    """
    # Clamp at 0: a negative slice start would wrap around to the end of the
    # sequence and silently produce an empty or wrong upstream flank.
    upstream_start = max(0, site_start - flank_length)
    upstream_flank = Seq(sequence[upstream_start:site_start])

    # Slicing past the end of the sequence simply truncates the flank.
    downstream_start = site_start + site_length
    downstream_flank = Seq(sequence[downstream_start:downstream_start + flank_length])

    forward_oligo = upstream_flank + placeholder
    reverse_oligo = downstream_flank.reverse_complement() + placeholder_rc
    return str(forward_oligo), str(reverse_oligo)


# Lists to store the two DNA sequences for each SNP
Fwd_oligos = []
Rev_oligos = []

for index, row in gRNA_df.iterrows():
    # Strip spaces and upper-case the whole site (gRNA and PAM) so that it is
    # comparable with the upper-cased target sequence.
    gRNA_PAM = Seq(str(row["gRNA"] + row["PAM"]).replace(" ", "").upper())

    # Find the site in my_sequence: try it as given first, and fall back to its
    # reverse complement only when that fails. Only the first occurrence is used.
    site_pos = my_sequence.find(gRNA_PAM)
    if site_pos == -1:
        site_pos = my_sequence.find(gRNA_PAM.reverse_complement())

    if site_pos == -1:
        Fwd_oligos.append("Not found")
        Rev_oligos.append("Not found")
        continue

    # The site and its reverse complement have the same length, so the same
    # flank arithmetic applies to a match in either orientation.
    fwd_oligo, rev_oligo = _build_donor_oligos(my_sequence, site_pos, len(gRNA_PAM))
    Fwd_oligos.append(fwd_oligo)
    Rev_oligos.append(rev_oligo)

# Add the sequences as new columns in the DataFrame
gRNA_df["Fwd_primers"] = Fwd_oligos
gRNA_df["Rev_primers"] = Rev_oligos

# Primer names are derived from the SNP identifiers; skip them explicitly when
# the table has no "SNP" column instead of relying on an exception handler.
if "SNP" in gRNA_df.columns:
    gRNA_df["Fwd_primer_names"] = gRNA_df["SNP"] + "_F"
    gRNA_df["Rev_primer_names"] = gRNA_df["SNP"] + "_R"
else:
    print("No 'SNP' column found; primer names were not added")

# Save the updated DataFrame as a tab-separated file
gRNA_df.to_csv(oligo_file_path, sep="\t")
print("Data saved to", oligo_file_path)

Data saved to C:\Users\wxh02/Desktop/Donor_oligos_2023_08_10.txt
