In [311]:
from Bio.Seq import Seq
import pandas as pd
import os
import math

In [79]:
"""
Input: a txt file containing gRNA sequences. The file can be either a single column of gRNA sequences (no header row),
or the tab-separated output file from 1_SNP_gRNA (with 'SNP' and 'gRNA' columns).

Output: a txt file containing the primers that assemble the gRNAs into the CREEPY vector.
"""

'\nInput: a txt file containing gRNA sequences. The file can either be gRNA sequences only, or the output file from 1_SNP_gRNA\n\nOutput: a txt file containing primers that assemble gRNAs into the CREEPY vector '

In [389]:
# Input configuration: the gRNA file is looked up in the current user's Desktop folder.
gRNA_FILENAME = 'gRNA_PAM_pXW487_3.txt'
desktop = os.path.expanduser("~/Desktop")
# Full path of the input file (Desktop folder + '/' + file name).
gRNA_file_path = f"{desktop}/{gRNA_FILENAME}"

In [399]:
# gRNA_df = pd.read_csv(gRNA_file_path, sep='\t', header=None)
# gRNA_df
def _clean_grna_table(df):
    """Normalise the ``SNP`` and ``gRNA`` columns of ``df`` in place and return it.

    ``SNP`` is converted to ``str``; ``gRNA`` has spaces removed, is upper-cased
    and each entry is wrapped in a ``Seq`` object.
    """
    df["SNP"] = df["SNP"].astype(str)
    df["gRNA"] = df["gRNA"].str.replace(" ", "")
    df["gRNA"] = df["gRNA"].apply(lambda seq_str: Seq(seq_str.upper()))
    return df


def read_data(file_path):
    """Read gRNA sequences from ``file_path`` into a DataFrame.

    Two layouts are accepted:

    * a tab-separated table with a header row that contains both an ``SNP``
      and a ``gRNA`` column (all columns of the table are kept);
    * otherwise, a headerless file read as a single ``gRNA`` column, in which
      case ``SNP`` is filled with the running numbers ``"1"``, ``"2"``, ...

    :param file_path: path of the text file to read
    :return: DataFrame with a ``str`` column ``SNP`` and a ``gRNA`` column of
        ``Seq`` objects, or ``None`` when the file does not exist or contains
        no data to parse
    """
    # Attempt 1: tab-separated table with a header row.
    try:
        df = pd.read_csv(file_path, sep="\t")
    except FileNotFoundError:
        # A missing file cannot be rescued by the fallback read below.
        return None
    except pd.errors.EmptyDataError:
        # Nothing usable as a header; still give the headerless read a chance.
        df = None

    if df is not None and "SNP" in df.columns and "gRNA" in df.columns:
        return _clean_grna_table(df)

    # Attempt 2: headerless file with one gRNA per line.
    try:
        df = pd.read_csv(file_path, header=None, names=["gRNA"])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return None

    # No SNP names available: number the gRNAs starting from 1.
    df["SNP"] = range(1, df.shape[0] + 1)
    return _clean_grna_table(df)

In [400]:
# Load the gRNA table from the configured input path; the bare name on the
# last line displays the resulting table as the cell output.
gRNA_df = read_data(gRNA_file_path)
gRNA_df

Unnamed: 0,gRNA,SNP
0,"(C, C, C, C, T, T, G, C, A, A, C, A, T, C, T, ...",1
1,"(C, A, A, T, T, T, T, T, G, G, T, G, A, G, C, ...",2
2,"(C, T, C, A, G, C, A, C, G, G, T, G, A, G, T, ...",3
3,"(A, C, T, C, T, G, C, T, G, T, G, C, C, G, C, ...",4
4,"(G, T, A, T, T, A, T, T, T, G, G, T, T, G, T, ...",5


In [401]:
def _choose_overhang_start(gRNA_seq, used_overhangs, preferred_start=8, overhang_len=4):
    """Pick the position at which a middle gRNA is split into two primers.

    Candidate start positions are tried outwards from ``preferred_start``
    (8, 7, 9, 6, 10, ... for the default). A candidate window is rejected when
    the overhang equals its own reverse complement, or when the overhang or
    its reverse complement is already present in ``used_overhangs``. Only
    windows lying completely inside ``gRNA_seq`` are considered, so the search
    always terminates and never yields a truncated overhang.

    :param gRNA_seq: the gRNA (a ``Seq``) to split
    :param used_overhangs: overhangs (``Seq`` objects) already taken
    :param preferred_start: 0-based start position tried first
    :param overhang_len: length of the sticky end in nt
    :return: tuple ``(start, overhang)`` with the 0-based start position
    :raises ValueError: if no window of the gRNA gives an acceptable overhang
    """
    last_start = len(gRNA_seq) - overhang_len
    widest_offset = max(preferred_start, last_start - preferred_start)

    candidates = [preferred_start]
    for offset in range(1, widest_offset + 1):
        # Same order as before: one step towards the 5' end, then towards the 3' end.
        candidates.extend((preferred_start - offset, preferred_start + offset))

    for start in candidates:
        if not 0 <= start <= last_start:
            continue
        overhang = gRNA_seq[start:start + overhang_len]
        overhang_rc = overhang.reverse_complement()
        if overhang == overhang_rc:
            continue  # the overhang is its own reverse complement
        if overhang in used_overhangs or overhang_rc in used_overhangs:
            continue  # would clash with a sticky end that is already in use
        return start, overhang

    raise ValueError(
        f"No usable {overhang_len}-nt overhang found in gRNA '{gRNA_seq}'"
    )


def creepy_primer(df):
    """
    Design CREEPY primers based on a list of gRNAs.
    For one guide RNA, return two oligos that generate sticky ends after annealing.
    For two guide RNAs, return two primers, with each primer incorporating one gRNA.
    For three and more guide RNAs, return a set of primers:
        The first and last primers introduce the first and last gRNAs, respectively.
        The rest of the primers introduce the middle gRNAs that are each split into two halves,
        with the first half in the reverse primer and the second half in the fwd primer.
    Overhang comparisons are case-sensitive, so gRNAs should be given in upper case.
    :param df: DataFrame with one row per gRNA, a 'SNP' column (used as primer name prefix)
               and a 'gRNA' column holding Seq objects
    :return: a dictionary of primers, and a dictionary of sticky end sequences
    :raises ValueError: if df is None or empty, or if a middle gRNA offers no usable overhang
    """
    if df is None or df.shape[0] == 0:
        raise ValueError("creepy_primer needs a table with at least one gRNA")

    sticky_end_left = Seq('GACT')    #left sticky end of pXW467/468/472
    sticky_end_right = Seq('AAAC')   #right sticky end of pXW467/468/472
    template_fwd = Seq('GTTTTAGAGCTAGAAATAGCAAGTTA') #Fwd primer to amplify scaffold RNA
    template_rev = Seq('TGCGCAAGCCCGGAATCGAACCGGG')  #Rev primer to amplify tRNA_Gly
    adaptor_left = Seq('aCGTCTCagacttt') #left adaptor for Esp3I to generate a GACT sticky end followed by tt
    adaptor_right = Seq('aCGTCTCcaaac')  #Right adaptor for Esp3I to generate a AAAC sticky end
    adaptor3 = Seq('aCGTCTCc')   #universal adaptor for Esp3I, will generate a sticky end that matches the 4 following nt
    overhang_set = {"left": sticky_end_left, "right": sticky_end_right}

    # Positional access throughout, so the result does not depend on the index labels of df.
    snp_names = df['SNP']
    guides = df['gRNA']
    n_guides = df.shape[0]
    first_snp, first_guide = snp_names.iloc[0], guides.iloc[0]
    last_snp, last_guide = snp_names.iloc[-1], guides.iloc[-1]

    if n_guides == 1:
        # For one gRNA only: two oligos that anneal into an insert with both sticky ends
        p_set = {
            f"{first_snp}_F": sticky_end_left + 'tt' + first_guide,
            f"{first_snp}_R": sticky_end_right + first_guide.reverse_complement() + 'aa',
        }

    elif n_guides == 2:
        # For two gRNAs: one primer per gRNA
        p_set = {
            f"{first_snp}_F": adaptor_left + first_guide + template_fwd,
            f"{last_snp}_R": adaptor_right + last_guide.reverse_complement() + template_rev,
        }

    else:
        # For three and more gRNAs (1st and last are introduced in one primer,
        # all others are split into two primers and joined by golden gate)
        p_set = {f"{first_snp}_F": adaptor_left + first_guide + template_fwd}
        overhang_list = list(overhang_set.values())

        for n in range(1, n_guides - 1):
            gRNA_seq = guides.iloc[n]
            snp = snp_names.iloc[n]
            overhang_start, overhang = _choose_overhang_start(gRNA_seq, overhang_list)

            # start_pos is reported 1-based
            overhang_set[snp] = f"{str(overhang)} start_pos: ({overhang_start+1})"
            overhang_list.append(overhang)
            # Both halves contain the 4-nt overhang so the fragments can be ligated at it.
            p_set[f"{snp}_aR"] = adaptor3 + gRNA_seq[:overhang_start+4].reverse_complement() + template_rev
            p_set[f"{snp}_bF"] = adaptor3 + gRNA_seq[overhang_start:] + template_fwd

        p_set[f"{last_snp}_R"] = adaptor_right + last_guide.reverse_complement() + template_rev
        # Kept from the original: echoes all sticky ends in use as cell output.
        print(overhang_list)
    return p_set, overhang_set


In [402]:
primer_set, sticky_end_set = creepy_primer(gRNA_df)

# Build the output name from the input *file name* only, so that a 'gRNA' in a
# directory name is left alone.
gRNA_dir, gRNA_basename = os.path.split(gRNA_file_path)
creepy_primer_basename = gRNA_basename.replace('gRNA', 'creepy_primers_new')
if creepy_primer_basename == gRNA_basename:
    # The input name contains no 'gRNA': add a prefix instead, otherwise the
    # output path would equal the input path and overwrite the gRNA file.
    creepy_primer_basename = 'creepy_primers_new_' + gRNA_basename
creepy_primer_file = os.path.join(gRNA_dir, creepy_primer_basename)

# One "name:<TAB>sequence" line per primer, followed by the sticky ends in the same layout.
with open(creepy_primer_file, "w") as file:
    for key, value in primer_set.items():
        file.write(f"{key}:\t{value}\n")
    file.write('\n\nSticky ends:\n\n')
    for key, value in sticky_end_set.items():
        file.write(f"{key}:\t{value}\n")


[Seq('GACT'), Seq('AAAC'), Seq('GGTG'), Seq('CGGT'), Seq('GTGC')]


In [343]:
# Scratch check: math.ceil rounds 1/2 (0.5) up to 1.
math.ceil(1/2)

1