In [1]:
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
import pandas as pd
import os
import math
import datetime

In [2]:
"""
Input: a txt file containing gRNA sequences. 
    The file can be either one column of gRNA sequences only, or the output file from 1_SNP_gRNA

Output: a txt file containing primers that assemble gRNAs into the CREEPY vector

The code allows to check for imperfect ligations and avoid those sticky end pairs
The boolean for high fidelity filter is High_Fidelity

"""

'\nInput: a txt file containing gRNA sequences. \n    The file can be either one column of gRNA sequences only, or the output file from 1_SNP_gRNA\n\nOutput: a txt file containing primers that assemble gRNAs into the CREEPY vector\n\nThe code allows to check for imperfect ligations and avoid those sticky end pairs\nThe boolean for high fidelity filter is High_Fidelity\n\n'

In [40]:
#Input filename
gRNA_FILENAME = "pXW470_gRNA_FL.txt"

#Output filename (partial)
creepy_primer_file = "pXW470_creepy_primer_" 

desktop = os.path.expanduser("~/Desktop")
gRNA_file_path = desktop + '/' + gRNA_FILENAME

# if high, filter out sticky ends that ligate with each other at >1% frequency. 
# Medium: filter out those >10%. Low: filter out those >20%
# False: turn off filter
High_Fidelity = 'ultrahigh' 

In [41]:
def read_data(file_path):
    try:
        df = pd.read_csv(file_path, sep=None)
        if "SNP" in df.columns and "gRNA" in df.columns:
            df["SNP"] = df["SNP"].astype(str)
            df["gRNA"] = df["gRNA"].str.replace(" ", "")
            df["gRNA"] = df["gRNA"].apply(lambda seq_str: Seq(seq_str.upper()))
            return df
    except pd.errors.EmptyDataError:
        pass

    try:
        df = pd.read_csv(file_path, header=None, names=["gRNA"])
        df['SNP'] = range(1, df.shape[0]+1)
        df['SNP'] = df['SNP'].astype(str)
        df["gRNA"] = df["gRNA"].str.replace(" ", "")
        df["gRNA"] = df["gRNA"].apply(lambda seq_str: Seq(seq_str.upper()))
        return df
    except FileNotFoundError:
        pass

    return None

In [42]:
gRNA_df = read_data(gRNA_file_path)
gRNA_df

  df = pd.read_csv(file_path, sep=None)


Unnamed: 0,SNP,gRNA
0,rs2239017,"(A, G, A, A, A, G, T, G, A, G, T, C, A, T, T, ..."
1,rs758171,"(A, C, C, A, T, G, C, T, G, T, G, G, C, C, A, ..."
2,rs71441679,"(A, A, G, G, C, T, G, A, G, C, A, T, C, T, G, ..."
3,rs35848523,"(T, A, T, T, C, T, G, A, T, A, G, A, T, A, A, ..."
4,rs2238049,"(T, G, A, A, G, C, C, T, C, T, A, G, G, G, C, ..."
5,rs2238050,"(G, A, A, A, A, A, T, T, A, C, T, A, G, C, C, ..."
6,rs2238051,"(C, C, G, G, T, A, G, T, A, A, A, A, A, T, T, ..."
7,rs1860056,"(G, C, C, A, C, C, A, C, T, A, A, C, A, G, C, ..."
8,rs12423277,"(T, G, G, C, T, T, C, C, T, T, C, T, A, G, A, ..."
9,rs1016388,"(T, A, C, A, C, A, G, A, G, G, A, A, A, G, A, ..."


In [43]:
def creepy_primer(df, hf="Medium", gc_check=True):
    """
    Design CREEPY primers based on a list of gRNAs
    For one guide RNA, return two oligos that generate sticky ends after annealing.
    For two guide RNAs, return two primers, with each primer incorperating one gRNA.
    For three and more guide RNAs, return a set of primers:
        The first and last primers introduce the first and last gRNAs, respectively.
        The rest of the primers introduce the middle gRNAs that are each split into two halves,
        with the first half in the reverse primer and the second half in the fwd primer.

    :param 
    df: dataframe containing the SNPs and gRNAs
    hf: High-fidelity boolean. default is medium.
        False: do not check ligation fidelity dictionary
        low: low fidelity, only avoid highly frequent misligations (freq>20%)
        medium: medium fidelity, avoid misligation freq>10%
        high: high fidelity, avoid misligation freq>1%
        ultrahigh: ultrahigh fidelity, avoid misligation freq>0.1%
    gc_check: if True, check GC content to make sure it's not 0 or 100%

    :return: 
    a dictionary of primers, and a dictionary of sticky end sequences
    """
    esp3i_adaptor = Seq('aCGTCTCc')   #universal adaptor for Esp3I, will generate a sticky end that matches the 4 following nt
    sticky_end_left = Seq('GACT')    #left sticky end of pXW467/468/472
    sticky_end_right = Seq('AAAC')   #right sticky end of pXW67/468/472
    template_fwd = Seq('GTTTTAGAGCTAGAAATAGCAAGTTA') #Fwd primer to amplify scaffold RNA
    template_rev = Seq('TGCGCAAGCCCGGAATCGAACCGGG')  #Rev primer to amplify tRNA_Gly
    adaptor_left = esp3i_adaptor + sticky_end_left + "tt" #left adaptor for Esp3I to generate a GACT sticky end followed by tt
    adaptor_right = esp3i_adaptor + sticky_end_right  #Right adaptor for Esp3I to generate a AAAC sticky end
    primer_dict = {}
    overhang_dict = {"vect_left": str(sticky_end_left) + "\tNA", "vect_right": str(sticky_end_right) + "\tNA"}
    overhang_fidelity_dict = {}

    if df.shape[0] == 1:
        # For one gRNA only
        primer1_name, primer1 = df['SNP'].iloc[0] + '_F', sticky_end_left + 'tt' + df['gRNA'][0]
        primer2_name, primer2 = df['SNP'].iloc[0] + '_R', sticky_end_right + df['gRNA'][0].reverse_complement() + 'aa'
        primer_dict = {primer1_name: primer1, primer2_name: primer2}
        overhang_fidelity_dict["vect_left"], overhang_fidelity_dict["vect_right"] = "NA", "NA"

    elif df.shape[0] == 2:
        # For two gRNAs
        primer1_name, primer1 = df['SNP'].iloc[0] + '_F', adaptor_left + df['gRNA'][0] + template_fwd
        primer2_name, primer2 = df['SNP'].iloc[1] + '_R', adaptor_right + df['gRNA'][1].reverse_complement() + template_rev
        primer_dict = {primer1_name: primer1, primer2_name: primer2}
        overhang_fidelity_dict["vect_left"], overhang_fidelity_dict["vect_right"] = "NA", "NA"

    else:
        # For three and more gRNAs (1st and last are introduced in one primer, all others are split into two primers and joined by golden gate
        overhang_start_list = []
        primer1_name = df['SNP'].iloc[0] + '_F'
        primer_dict[primer1_name] = adaptor_left + df['gRNA'][0] + template_fwd
        overhang_list = [str(sticky_end_left), str(sticky_end_right)]

        if hf:
            ligation_fidelity_dict = "Ligation_fidelity_dictionary.json"
            hf_combined = pd.read_json(ligation_fidelity_dict)
            hf_df = hf_combined[hf.lower()]

            try: overhang_fidelity_dict = {"vect_left": hf_df[sticky_end_left], "vect_right": hf_df[sticky_end_right]}
            except KeyError:
                pass

            try: overhang_list.extend(hf_df[sticky_end_left])
            except KeyError:
                pass

            try: overhang_list.extend(hf_df[sticky_end_right])
            except KeyError:
                pass            

            for n in range(1,len(df['gRNA'])-1):
                gRNA_seq = df['gRNA'][n]
                overhang_start = 8
                overhang = gRNA_seq[overhang_start:overhang_start + 4]
                gc_too_hi_or_lo = gc_fraction(Seq(overhang)) == 100 or gc_fraction(Seq(overhang)) == 0
                
                i=1

                while (overhang.reverse_complement() == overhang\
                       or overhang in overhang_list\
                       or overhang.reverse_complement() in overhang_list\
                       or (gc_check and gc_too_hi_or_lo)): #check if it's palindromic, or exists in list, GC content is high or low
                    overhang_start = 8 + math.ceil(i/2) * (-1) ** i
                    overhang = gRNA_seq[overhang_start:overhang_start+4]
                    gc_too_hi_or_lo = gc_fraction(Seq(overhang)) == 100 or gc_fraction(Seq(overhang)) == 0
                    
                    i = i + 1

                try: overhang_misligate = hf_df[overhang]
                except KeyError:
                    pass   
                
                try: overhang_rc_misligate = hf_df[overhang.reverse_complement()]
                except KeyError:
                    pass
            
                overhang_list.append(str(overhang))
                overhang_list.extend(overhang_misligate)
                overhang_list.extend(overhang_rc_misligate)
                
                overhang_dict[df['SNP'][n]] = f"{str(overhang)}\t{overhang_start+1}"
                p_r_name = df['SNP'].iloc[n] + "_aR"
                p_f_name = df['SNP'].iloc[n] + "_bF"
                primer_dict[p_r_name] = esp3i_adaptor + gRNA_seq[:overhang_start+4].reverse_complement() + template_rev
                primer_dict[p_f_name] = esp3i_adaptor + gRNA_seq[overhang_start:] + template_fwd

                overhang_fidelity_dict[df['SNP'][n]] = overhang_misligate

        else:
            for n in range(1,len(df['gRNA'])-1):
                gRNA_seq = df['gRNA'][n]
                overhang_start = 8
                overhang = gRNA_seq[overhang_start:overhang_start + 4]
                gc_too_hi_or_lo = gc_fraction(Seq(overhang)) == 100 or gc_fraction(Seq(overhang)) == 0
                i=1

                while overhang.reverse_complement() == overhang \
                        or overhang in overhang_list \
                        or overhang.reverse_complement() in overhang_list\
                        or (gc_check and gc_too_hi_or_lo):  #check if it's palindromic, or exists in list, GC content is high or low
                    overhang_start = 8 + math.ceil(i/2) * (-1) ** i
                    overhang = gRNA_seq[overhang_start:overhang_start+4]
                    gc_too_hi_or_lo = gc_fraction(Seq(overhang)) == 100 or gc_fraction(Seq(overhang)) == 0                  
                    i = i + 1
                overhang_list.append(str(overhang))

                overhang_dict[df['SNP'][n]] = f"{str(overhang)}\t{overhang_start+1}"
                p_r_name = df['SNP'].iloc[n] + "_aR"
                p_f_name = df['SNP'].iloc[n] + "_bF"
                primer_dict[p_r_name] = esp3i_adaptor + gRNA_seq[:overhang_start+4].reverse_complement() + template_rev
                primer_dict[p_f_name] = esp3i_adaptor + gRNA_seq[overhang_start:] + template_fwd
                overhang_fidelity_dict[df['SNP'][n]] = "NA"
            overhang_fidelity_dict["vect_left"], overhang_fidelity_dict["vect_right"] = "NA", "NA"


        # overhang_fidelity_dict["vect_left"], overhang_fidelity_dict["vect_right"] = "NA", "NA"

        primer_last_name = df['SNP'].iloc[-1] + "_R"
        primer_dict[primer_last_name] = adaptor_right + df['gRNA'].iloc[-1].reverse_complement() + template_rev
        print(overhang_list)

    return  primer_dict, overhang_dict, overhang_fidelity_dict


In [44]:
primer_set, sticky_end_dict, fidelity_dict = creepy_primer(gRNA_df,hf=High_Fidelity, gc_check=True)

current_datetime = datetime.datetime.now()
date_time_str = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")

creepy_primer_path = desktop + '/' + creepy_primer_file + date_time_str + ".txt"

with open(creepy_primer_path, "w") as file:
    file.write('Primer\t\tSequence\n')
    for key, value in primer_set.items():
        file.write(f"{key}\t{value}\n")
    file.write('\nSNP\t\tStk_end\tPos\tIncompatible\n')
    for key, value in sticky_end_dict.items():
        file.write(f"{key}\t{value}\t{fidelity_dict[key]}\n")


['GACT', 'AAAC', 'GGTT', 'AGTT', 'AGCT', 'AGTG', 'ATTC', 'TGTC', 'GGTC', 'CGTC', 'GATC', 'AATC', 'TGGC', 'AGGC', 'AGCC', 'AGAC', 'AGTA', 'GGTT', 'GTGT', 'GTCT', 'GTAT', 'GTTG', 'GTTC', 'GTTA', 'GTGG', 'CCTT', 'CCGT', 'TCCT', 'CCCT', 'TCAT', 'CCAT', 'ACAT', 'CCGG', 'CCAG', 'CCTC', 'CTGC', 'TCGC', 'CCGC', 'TCCC', 'CCCC', 'TTAC', 'CTAC', 'CGAC', 'TCAC', 'GCAC', 'ACAC', 'CAAC', 'CCGA', 'CCAA', 'GTGT', 'GTTG', 'TTGG', 'ATGG', 'GGGG', 'GCGG', 'GAGG', 'GTCG', 'GTAG', 'GTGC', 'GTGA', 'GCAT', 'GTGT', 'ATGT', 'ATGG', 'ATTC', 'TTGC', 'GTGC', 'CTGC', 'TGGC', 'AGGC', 'GCGC', 'ACGC', 'TAGC', 'AAGC', 'ATAC', 'GTGA', 'ATGA', 'GCTT', 'GCGT', 'TGCT', 'GGCT', 'AGCT', 'GTAT', 'GGAT', 'TCAT', 'ACAT', 'GAAT', 'GCTG', 'GCAG', 'GCAC', 'GCTA', 'GCAA', 'TAGA', 'TCTT', 'TCTG', 'TCTC', 'GCTA', 'TAGT', 'TAGG', 'TTGA', 'ATGA', 'TGGA', 'GAGA', 'CTCT', 'AGAT', 'TGTG', 'GGTG', 'AGTG', 'GGGG', 'AGGG', 'TGCG', 'AGCG', 'ATAG', 'TGAG', 'GGAG', 'CGAG', 'AAAG', 'CTTT', 'CTGT', 'TTCT', 'GTCT', 'ATCT', 'CGCT', 'CACT', 'CTAT',

In [45]:
overhang_list = pd.read_csv(desktop + '/overhang_list.txt',header=None)
overhang_list

Unnamed: 0,0
0,GACT
1,AAAC
2,GTGG
3,GCAT
4,TAGA
5,TAGG
6,ACTA
7,GTAA
8,TAAC
9,TTCT
