In [1]:
import itertools
import pandas as pd
from jellyfish import hamming_distance


# Parameters
# All 16 dinucleotides and all 256 tetranucleotides, in A, C, G, T product order
dinucList = [''.join(p) for p in itertools.product(['A', 'C', 'G', 'T'], repeat=2)]
tetranucList = [''.join(p) for p in itertools.product(['A', 'C', 'G', 'T'], repeat=4)]
# Primer that is appended to the 3' end of every probe sequence
primer = "CGCACACATACACATACACACATACACATACGCAC"
# Check that primer input is 35bp
if len(primer) != 35:
    raise ValueError("Primer is not 35bp long!")

    
def dinucMutation(seq, r, description, dinucList = None, cons = False):
    """
    Given a sequence, seq, generate all possible dinucleotide mutations.
    For every start position i in range(r[0], r[1]) the two bases seq[i:i+2]
    are replaced by each dinucleotide. Mutations identical to the original
    sequence are dropped and the original sequence is appended as the last row.
    1. seq = sequence to mutate
    2. r = range of start positions to mutate, given as a list or tuple
     ex/ r = [0, 5] or (4,9)
     Note: the window starting at r[1] - 1 also replaces the base at r[1].
    3. description = label placed in every description string
    4. dinucList = a list of replacement dinucleotides.
     Defaults to all 16 dinucleotides of A, C, G, T.
    5. cons = consolidate. If True, mutations producing the same sequence are
     merged into one row (sorted by sequence) with their descriptions joined by '-'.
    Returns a DataFrame with "Sequence" and "Description" columns.
    """
    if dinucList is None:
        dinucList = [''.join(p) for p in itertools.product('ACGT', repeat=2)]
    # Generate all sequences with mutations at given ranges,
    # skipping mutations that match the original
    mutations = []
    for i in range(r[0], r[1]):
        for mut in dinucList:
            mutation = seq[:i] + mut + seq[i + 2:]
            if mutation != seq:
                mutations.append((mutation, f"DN_{description}_pos{i}->{mut}"))
    # Consolidate equivalent mutations, if cons argument is True
    if cons:
        descBySeq = {}
        for mutation, mutDesc in mutations:
            descBySeq.setdefault(mutation, []).append(mutDesc)
        mutations = [(mutSeq, '-'.join(descBySeq[mutSeq])) for mutSeq in sorted(descBySeq)]
    # Append the original sequence
    mutations.append((seq, f"DN_{description}_posW->WT"))
    return pd.DataFrame(mutations, columns = ["Sequence", "Description"])

def diFlankMutation(seq, coreRange, description, tetranucList = None):
    """
    Given a sequence, seq, generate all possible 2bp flanks around a core.
    The 2 bases directly left and right of the core are replaced by the first
    and second half of each tetranucleotide, respectively.
    1. seq = sequence to mutate
    2. coreRange = range of the core in the sequence, given as a list or tuple
     ex/ coreRange = [9, 17] or (9,17)
    3. description = label placed in every description string
    4. tetranucList = a list of tetranucleotides to use as flanks.
     Defaults to all 256 tetranucleotides of A, C, G, T.
    Returns a DataFrame with "Sequence" and "Description" columns.
    Raises a ValueError if the core does not have 2bp on both sides.
    """
    if tetranucList is None:
        tetranucList = [''.join(p) for p in itertools.product('ACGT', repeat=4)]
    coreStart, coreEnd = coreRange[0], coreRange[1]
    # Without this check a negative slice index silently yields wrong sequences
    if coreStart < 2 or coreEnd + 2 > len(seq):
        raise ValueError(f"Core range {coreStart}-{coreEnd} does not leave 2bp flanks "
                         f"on both sides of a sequence of length {len(seq)}")
    cFlankLeft = seq[:coreStart - 2]
    cFlankRight = seq[coreEnd + 2:]
    core = seq[coreStart:coreEnd]
    mutList, mutDescList = [], []
    for tetramer in tetranucList:
        mutList.append(cFlankLeft + tetramer[:2] + core + tetramer[2:] + cFlankRight)
        mutDescList.append(f"Flank_{description}_{tetramer}")
    return pd.DataFrame({"Sequence":mutList, "Description":mutDescList})


def addReps(df, nReps):
    """
    Repeats every row of a dataframe of sequences nReps times
    df = Pandas DataFrame with a "Sequence" and "Description" column
    nReps = number of replicates, given as an integer
    Each copy gets "_r<replicate number>" (starting at 0) added to its description.
    Returns a new DataFrame with only the "Sequence" and "Description" columns.
    """
    pairs = list(zip(df["Sequence"], df["Description"]))
    repSeqs = [seq for seq, _ in pairs for _rep in range(nReps)]
    repDescs = [f"{des}_r{rep}" for _, des in pairs for rep in range(nReps)]
    return pd.DataFrame({"Sequence":repSeqs, "Description":repDescs})

def sumScore(seq, k, kScoreDict):
    """
    Sum the scores of every overlapping k-mer in a sequence
    seq = sequence to score
    k = length of the k-mers
    kScoreDict = dictionary of k-mer to score, must contain every k-mer in seq
    Returns 0 if seq is shorter than k.
    """
    total = 0
    # Added one window at a time, left to right
    for start in range(len(seq) - k + 1):
        kmer = seq[start:start + k]
        total += kScoreDict[kmer]
    return total

def scoreClusters(df, kDict):
    """
    Score each sequence by its summed 6-mer scores and keep one row per cluster
    df = Pandas DataFrame with a "Sequence" and "Cluster" column.
     A "Score" column is added to df itself.
    kDict = dictionary of 6-mer to score, passed to sumScore
    Returns the rows sorted by descending score with only the first
    (highest scoring) row of each cluster kept.
    """
    def sixmerScore(sequence):
        return sumScore(sequence, 6, kDict)

    df["Score"] = df["Sequence"].apply(sixmerScore)
    ranked = df.sort_values(by = "Score", ascending = False)
    return ranked.drop_duplicates(subset="Cluster")

def closestSubHamming(string, query):
    """
    Lowest hamming distance between query and any substring of string
    that has the same length as query.
    string = sequence to search
    query = sequence to compare each substring against
    Returns len(query) if string is shorter than query.
    """
    width = len(query)
    distances = [hamming_distance(query, string[start:start + width])
                 for start in range(len(string) - width + 1)]
    # len(query) is the starting value, so it caps the result
    return min([width] + distances)

def revComp(sequence):
    """
    Input: String of DNA sequences in any case
    Output: Reverse Complement in upper case
    Raises a ValueError if a letter other than A, C, G, T or N is found
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    upperSeq = sequence.upper()
    # Validate every letter before building the output
    for base in upperSeq:
        if base not in complement:
            raise ValueError('Error: Non-[A,C,G,T,N] found in string')
    return ''.join(complement[base] for base in reversed(upperSeq))

def revCompDF(df):
    """
    Reverse complement every sequence in a dataframe
    df = Pandas DataFrame with a "Sequence" and "Description" column
    Returns a deep copy of df where each sequence is replaced by its
    reverse complement and "_O2" is added to each description.
    """
    dfOut = df.copy(deep = True)
    dfOut["Sequence"] = [revComp(seq) for seq in df["Sequence"]]
    dfOut["Description"] = [f"{desc}_O2" for desc in df["Description"]]
    return dfOut

def removeRepString(string):
    """
    Remove the replicate suffix ("_r" followed by a number) from a description
    Input: Description string, ex/ "DN_TBP_TATATATA_pos5->AA_r0"
    Output: Description without the suffix, ex/ "DN_TBP_TATATATA_pos5->AA"
    Descriptions that do not end in a replicate suffix are returned unchanged,
    so names without replicates (ex/ "All_9mer_14bp_19069") keep their last field.
    """
    head, sep, tail = string.rpartition("_")
    if sep and tail.startswith("r") and tail[1:].isdigit():
        return head
    return string

### Dinucleotide Mutations

From an input csv file, all possible dinucleotide mutations are generated at each position across a defined mutation substring of each sequence. 

In [2]:
# Read file and ensure sequences are upper case
designDiFile = "/Users/zmielko/Desktop/UV_TBP_MITF_EGR1_CREB1_Array/Sequence_Designs/Dinucleotide_Mutations.csv"
designDiDF = pd.read_csv(designDiFile)
designDiDF["Sequence"] = designDiDF["Sequence"].str.upper()

# Generate Dataframes for each sequence, concatenate them
diMutDFList = []
for desc, seq, mutSubStr in zip(designDiDF["Description"],designDiDF["Sequence"],
                                designDiDF["Mutation_Substring"]):
    # The sequence was upper-cased above, so the substring must be as well
    mutSubStr = mutSubStr.upper()
    rStart = seq.find(mutSubStr)
    # str.find returns -1 when there is no match, which would give a wrong range
    if rStart == -1:
        raise ValueError(f"Mutation substring {mutSubStr} not found in {seq} ({desc})")
    rEnd = rStart + len(mutSubStr)
    diMutDFList.append(dinucMutation(seq, [rStart, rEnd],desc, cons=True))
diMutDF = pd.concat(diMutDFList)
# Add Primer to the sequences
diMutDF["Sequence"] = diMutDF["Sequence"].apply(lambda x: x + primer)
# Make 20 replicates for each sequence
diMutDF = addReps(diMutDF, 20)
diMutDF

Unnamed: 0,Sequence,Description
0,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r0
1,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r1
2,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r2
3,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r3
4,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r4
5,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r5
6,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r6
7,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r7
8,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r8
9,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r9


### Comprehensive 2bp flanks

All 4-mers are generated and split in half. Given a core sequence which is a substring of the full 25mer, the 2bp flanks are replaced with each half of the 4-mer. 

In [3]:
# Read the flank design file and make every sequence upper case
designFlankFile = "/Users/zmielko/Desktop/UV_TBP_MITF_EGR1_CREB1_Array/Sequence_Designs/Flanks.csv"
designFlankDF = pd.read_csv(designFlankFile)
designFlankDF["Sequence"] = [flankSeq.upper() for flankSeq in designFlankDF["Sequence"]]
designFlankDF

Unnamed: 0,Description,Sequence,Core_Sequence
0,TBP_TATATATA,GTACGTACGTATATATACGTACGTA,TATATATA
1,MITF_CACGTG,GTATGTACGCACGTGCGTACATACG,CACGTG
2,EGR1_CGCACACGC,ACGTATGCACGCACACGCGTATGTA,CGCACACGC
3,EGR1_GCGTGTGCG,ACATACGCGTGTGCGTGCATACGTA,GCGTGTGCG


In [4]:
# Generate Dataframes for each sequence, concatenate them
flankDFList = []
for desc, seq, coreSubStr in zip(designFlankDF["Description"],designFlankDF["Sequence"],
                                        designFlankDF["Core_Sequence"]):
    # The sequences are upper case, so the core must be as well to be found
    coreSubStr = coreSubStr.upper()
    rStart = seq.find(coreSubStr)
    # str.find returns -1 when there is no match, which would give a wrong range
    if rStart == -1:
        raise ValueError(f"Core sequence {coreSubStr} not found in {seq} ({desc})")
    rEnd = rStart + len(coreSubStr)
    flankDFList.append(diFlankMutation(seq, [rStart, rEnd], desc))
flankDF = pd.concat(flankDFList)
# Add Primer to the sequences
flankDF["Sequence"] = flankDF["Sequence"].apply(lambda x: x + primer)
# Make 20 replicates for each sequence
flankDF = addReps(flankDF, 20)
flankDF

Unnamed: 0,Sequence,Description
0,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r0
1,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r1
2,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r2
3,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r3
4,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r4
5,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r5
6,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r6
7,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r7
8,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r8
9,GTACGTAAATATATATAAATACGTACGCACACATACACATACACAC...,Flank_TBP_TATATATA_AAAA_r9


### de Bruijn Sequence

Given the array design for the 8x60k UV Crystal array, the de Bruijn sequences for all 9-mers and all 8-mers are extracted and added to the array. 

In [5]:
# Read in a previous array design with the de Bruijn sequences and same primer,
# dropping the rows without a sequence
arrayDesignFile = "../Sequence_Designs/8x60k_UV_Array_Design.txt"
arrayDesignDF = pd.read_csv(arrayDesignFile, sep = '\t').dropna(subset=["SEQUENCE"])
# The de Bruijn sequences are the rows with a name that starts with "All"
isDeBruijn = arrayDesignDF["NAME"].str.startswith("All")
deBruDF = (arrayDesignDF.loc[isDeBruijn, ["SEQUENCE", "NAME"]]
           .rename(columns={"SEQUENCE":"Sequence", "NAME":"Description"}))
deBruDF

Unnamed: 0,Sequence,Description
4,GTATGCACACGTCCAGCTTCGCGCACGCACACATACACATACACAC...,All_9mer_14bp_19069
5,TATGCGGCTTGCCAATGAGTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_15243
6,TATGCAAAGTATACCCTCGTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_14947
7,GTATGCAGCGTTCAGACTGTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_6572
8,TATGCACACCAACATCGCTTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_8992
9,GTATGCACGTGTTACGACATGCGCACGCACACATACACATACACAC...,All_8mer_13bp_8296
10,ATGCACTTGGGCCCTCTAATGCGCACGCACACATACACATACACAC...,All_9mer_14bp_18910
11,TATGCGGGGTCCTCGTCCGTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_36064
13,TATGCAACGATGTGAGTGGTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_21123
15,GTATGCAACTAATTCTTGGTGCGCACGCACACATACACATACACAC...,All_9mer_14bp_11017


### kD Calibration Sequence

Given a csv file of sequences with previous kD measurements from literature, the sequences are added to a DF with replicates. 

In [6]:
# Read the calibration sequences, add the primer, then make 20 replicates of each
kDFile = "../Sequence_Designs/kD.csv"
kDDF = pd.read_csv(kDFile)
kDDF["Sequence"] = [kDSeq + primer for kDSeq in kDDF["Sequence"]]
kDDF = addReps(kDDF, 20)
kDDF

Unnamed: 0,Sequence,Description
0,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r0
1,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r1
2,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r2
3,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r3
4,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r4
5,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r5
6,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r6
7,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r7
8,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r8
9,TATATAGCGTGGGCGTATATATCCGCGCACACATACACATACACAC...,Zif268_consensus_r9


### Clustered Differential K-mer Probes

Takes generated differential probes that have been clustered together and keeps the top probe from each cluster, ranked by the sum of differences between UV and Watson-Crick conditions of the 6-mers in the probe. 

Sequences are generated using the differentialKmer.ipynb script. Clusters are made using the affinityProp.py script which has a wrapper for parallel processing using affinityPropParallel.sh. 



In [7]:
# (name, k-mer file, 25-mer cluster file, extended short cluster file) for each protein
clustTuple = [("EGR1", "../Data/EGR1_WLS/EGR_kmers.txt",
               "../Data/diffUVProbes/EGR1_25mers_cluster.tsv", "../Data/diffUVProbes/EGR1_extShort_cluster.tsv"),
             ("CREB1","../Data/CREB1_WLS/CREB_kmers.txt",
              "../Data/diffUVProbes/CREB1_25mers_cluster.tsv", "../Data/diffUVProbes/CREB1_extShort_cluster.tsv"),
             ("TBP", "../Data/TBP_WLS/TBP_kmers.txt",
              "../Data/diffUVProbes/TBP_25mers_cluster.tsv", "../Data/diffUVProbes/TBP_extShort_cluster.tsv")]

clusterDFList = []
for tfName, kmerFile, clust25File, clustShortFile in clustTuple:
    kmerDF = pd.read_csv(kmerFile, sep = '\t')
    clust25DF = pd.read_csv(clust25File, sep = '\t')
    clustShortDF = pd.read_csv(clustShortFile, sep = '\t')
    # Difference in E-score for each k-mer, for the forward and reverse complement k-mer
    kmerDF["Diff"] = kmerDF["Escore_6_9_UV"] - kmerDF["Escore_6_8"]
    kDiffDict = {**dict(zip(kmerDF["kmerFwd"], kmerDF["Diff"])),
                 **dict(zip(kmerDF["kmerRC"], kmerDF["Diff"]))}
    # Top probe of each cluster, 25-mers first and then the extended short probes
    for clustDF, tag in ((clust25DF, "TD25"), (clustShortDF, "TDEXT")):
        topSeqPerCluster = scoreClusters(clustDF, kDiffDict)
        topSeqPerCluster["Description"] = [f"{tfName}_C{clusterID}_{tag}"
                                           for clusterID in topSeqPerCluster["Cluster"]]
        clusterDFList.append(topSeqPerCluster)

clusterProbeDF = pd.concat(clusterDFList).reset_index(drop = True)
# The reverse complements are made before the primer is added
clusterProbeRCDF = revCompDF(clusterProbeDF)
clusterProbeDF["Sequence"] = [probe + primer for probe in clusterProbeDF["Sequence"]]
clusterProbeRCDF["Sequence"] = [probe + primer for probe in clusterProbeRCDF["Sequence"]]
clusterProbeDF = addReps(clusterProbeDF, 6)
clusterProbeRCDF = addReps(clusterProbeRCDF, 6)
clusterProbeDF

Unnamed: 0,Sequence,Description
0,GATTAATTTGAATTTAAATTCAATTCGCACACATACACATACACAC...,EGR1_C422_TD25_r0
1,GATTAATTTGAATTTAAATTCAATTCGCACACATACACATACACAC...,EGR1_C422_TD25_r1
2,GATTAATTTGAATTTAAATTCAATTCGCACACATACACATACACAC...,EGR1_C422_TD25_r2
3,GATTAATTTGAATTTAAATTCAATTCGCACACATACACATACACAC...,EGR1_C422_TD25_r3
4,GATTAATTTGAATTTAAATTCAATTCGCACACATACACATACACAC...,EGR1_C422_TD25_r4
5,GATTAATTTGAATTTAAATTCAATTCGCACACATACACATACACAC...,EGR1_C422_TD25_r5
6,GATGATTAATTTCAATTTAAATTCACGCACACATACACATACACAC...,EGR1_C405_TD25_r0
7,GATGATTAATTTCAATTTAAATTCACGCACACATACACATACACAC...,EGR1_C405_TD25_r1
8,GATGATTAATTTCAATTTAAATTCACGCACACATACACATACACAC...,EGR1_C405_TD25_r2
9,GATGATTAATTTCAATTTAAATTCACGCACACATACACATACACAC...,EGR1_C405_TD25_r3


### Hamming Distance from RelA

Generated differential probes are scored based on the lowest hamming distance between the RelA site: GGAAATTCCC and each 10-mer in the probe. The 50 probes with the closest substring were added to the array. 

* The 50 choice is arbitrary as there could be many sequences with the same distance as the 50th closest. The dataframe was simply sorted by distance and the sort order from the pandas method .sort_values(by="hDist") was used. 

In [8]:
# (name, k-mer file, 25-mer cluster file, extended short cluster file) for each protein
# The k-mer files are not needed for the hamming distance and are not read here
clustTuple = [("EGR1", "../Data/EGR1_WLS/EGR_kmers.txt",
               "../Data/diffUVProbes/EGR1_25mers_cluster.tsv", "../Data/diffUVProbes/EGR1_extShort_cluster.tsv"),
             ("CREB1","../Data/CREB1_WLS/CREB_kmers.txt",
              "../Data/diffUVProbes/CREB1_25mers_cluster.tsv", "../Data/diffUVProbes/CREB1_extShort_cluster.tsv"),
             ("TBP", "../Data/TBP_WLS/TBP_kmers.txt",
              "../Data/diffUVProbes/TBP_25mers_cluster.tsv", "../Data/diffUVProbes/TBP_extShort_cluster.tsv")]
relaSite = "GGAAATTCCC"
nClosest = 50

closeRELAList = []
for tfName, _kmerFile, clust25File, clustShortFile in clustTuple:
    # 25-mers first and then the extended short probes
    for clustFile, tag in ((clust25File, "TD25"), (clustShortFile, "TDEXT")):
        clustDF = pd.read_csv(clustFile, sep = '\t')
        clustDF["hDist"] = [closestSubHamming(probe, relaSite) for probe in clustDF["Sequence"]]
        # Keep the probes with the closest substring, ties are kept in the sort order
        clustDF = clustDF.sort_values(by = "hDist").head(n = nClosest).reset_index(drop=True)
        clustDF["Description"] = [f"{tfName}_H{dist}_{tag}_S{rank}"
                                  for rank, dist in enumerate(clustDF["hDist"])]
        closeRELAList.append(clustDF[["Sequence", "Description"]])

closeRelaProbeDF = pd.concat(closeRELAList)
closeRelaProbeDF = closeRelaProbeDF.reset_index(drop = True)
# The reverse complements are made before the primer is added
closeRelaProbeRCDF = revCompDF(closeRelaProbeDF)
closeRelaProbeDF["Sequence"] = closeRelaProbeDF["Sequence"].apply(lambda x: x + primer)
closeRelaProbeRCDF["Sequence"] = closeRelaProbeRCDF["Sequence"].apply(lambda x: x + primer)
closeRelaProbeDF = addReps(closeRelaProbeDF, 6)
closeRelaProbeRCDF = addReps(closeRelaProbeRCDF, 6)
closeRelaProbeDF

Unnamed: 0,Sequence,Description
0,GGAAATCCCTTAAACCAAATCATTACGCACACATACACATACACAC...,EGR1_H2_TD25_S0_r0
1,GGAAATCCCTTAAACCAAATCATTACGCACACATACACATACACAC...,EGR1_H2_TD25_S0_r1
2,GGAAATCCCTTAAACCAAATCATTACGCACACATACACATACACAC...,EGR1_H2_TD25_S0_r2
3,GGAAATCCCTTAAACCAAATCATTACGCACACATACACATACACAC...,EGR1_H2_TD25_S0_r3
4,GGAAATCCCTTAAACCAAATCATTACGCACACATACACATACACAC...,EGR1_H2_TD25_S0_r4
5,GGAAATCCCTTAAACCAAATCATTACGCACACATACACATACACAC...,EGR1_H2_TD25_S0_r5
6,GGAAATTCAAATCCCTTAAACCAAACGCACACATACACATACACAC...,EGR1_H2_TD25_S1_r0
7,GGAAATTCAAATCCCTTAAACCAAACGCACACATACACATACACAC...,EGR1_H2_TD25_S1_r1
8,GGAAATTCAAATCCCTTAAACCAAACGCACACATACACATACACAC...,EGR1_H2_TD25_S1_r2
9,GGAAATTCAAATCCCTTAAACCAAACGCACACATACACATACACAC...,EGR1_H2_TD25_S1_r3


### Binding sites across the probe length

Probes that test the effect of the binding site at different positions in the probe. 

In [9]:

# (description, binding site) pairs, each is passed to positionalProbes below
posSeqList = [("TBP", "GTATATATAC"), 
              ("MITF", "GCACGTGC"), 
              ("EGR1_O1","ACGCACACGCGT"),
              ("EGR1_O2","ACGCGTGTGCGT"),
              ("CREB1", "GTGACGTCAC")]
def positionalProbes(desc, insert, baseSeq):
    """
    Place an insert at every other position of a base sequence
    desc = label used at the start of each description
    insert = sequence that overwrites the base sequence
    baseSeq = sequence that the insert is placed in
    The insert overwrites baseSeq at the 0-based starts 0, 2, 4, ... that fit,
    described as "<desc>_Pos<start + 1>". The last row is a shortened probe,
    the insert without its first base followed by baseSeq after the insert
    length, described as "<desc>_Pos0_Short".
    Returns a DataFrame with "Sequence" and "Description" columns.
    """
    lastStart = len(baseSeq) - len(insert)
    rows = [(baseSeq[:start] + insert + baseSeq[start + len(insert):], f"{desc}_Pos{start + 1}")
            for start in range(0, lastStart + 1, 2)]
    # Append shortened probe with 0 position to list
    rows.append((insert[1:] + baseSeq[len(insert):], f"{desc}_Pos0_Short"))
    return pd.DataFrame({"Sequence":[probe for probe, _ in rows],
                         "Description":[label for _, label in rows]})
# Positional probes of every binding site, all placed in the same base sequence
positionalProbeDFList = [positionalProbes(siteDesc, site, "GTATGTACGCACGTGCGTACATACG")
                         for siteDesc, site in posSeqList]
posProbeDF = pd.concat(positionalProbeDFList).reset_index(drop=True)
# Make 20 replicates for each sequence, then add the primer
posProbeDF = addReps(posProbeDF, 20)
posProbeDF["Sequence"] = [probe + primer for probe in posProbeDF["Sequence"]]
posProbeDF

Unnamed: 0,Sequence,Description
0,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r0
1,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r1
2,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r2
3,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r3
4,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r4
5,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r5
6,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r6
7,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r7
8,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r8
9,GTATATATACACGTGCGTACATACGCGCACACATACACATACACAC...,TBP_Pos1_r9


### Combine DFs into UV Array Design

Concatenate DFs of UV array probes. Format the DF into a file that can be submitted to Agilent. 

In [13]:
# ignore_index gives each probe a unique row label in the combined array
uvArray = pd.concat([diMutDF,flankDF,deBruDF,kDDF,
                     clusterProbeDF,clusterProbeRCDF,closeRelaProbeDF,closeRelaProbeRCDF,
                     posProbeDF], ignore_index = True)
# Check that all sequences are the right length and have the primer
for seq, desc in zip(uvArray["Sequence"], uvArray["Description"]):
    # Shortened probes are 1bp shorter than the rest of the array
    expectedLen = 59 if "Short" in desc else 60
    if len(seq) != expectedLen:
        raise ValueError(f"Array Length is not correct, len = {len(seq)}\n{seq}")
    if primer not in seq:
        raise ValueError(f"Primer not found in sequence: {seq}")
    if "GGGGGGGGGG" in seq:
        raise ValueError(f"G repeat in sequence: {seq}")
# Generate a copy of the array without replicates
uvArrayUnique = uvArray.copy(deep = True)
uvArrayUnique["Description"] = uvArrayUnique["Description"].apply(removeRepString)
uvArrayUnique = uvArrayUnique.drop_duplicates()
uvArrayUnique.to_csv('../Array_Probes.txt', sep = '\t', header = None, index = False)
# Generate UV Array file formatted for ordering from Agilent
uvArray['ID'] = [f"Ctrl_UVT_{str(idx).zfill(6)}" for idx in range(len(uvArray))]
# Copy the column selection so the new columns are not set on a slice
uvArray = uvArray[['ID', 'Sequence', 'Description']].copy()
uvArray['NA'] = 'NA|NA'
uvArray['NA2'] = 'NA'
uvArray['Desc2'] = uvArray['Description']
uvArray['chr'] = 'chr1:0-0'
uvArray.to_csv("../Array_Design_Full.txt", sep = '\t', header = None, index = False)
uvArray

Unnamed: 0,ID,Sequence,Description,NA,NA2,Desc2,chr
0,Ctrl_UVT_000000,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r0,NA|NA,,DN_TBP_TATATATA_pos5->AA_r0,chr1:0-0
1,Ctrl_UVT_000001,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r1,NA|NA,,DN_TBP_TATATATA_pos5->AA_r1,chr1:0-0
2,Ctrl_UVT_000002,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r2,NA|NA,,DN_TBP_TATATATA_pos5->AA_r2,chr1:0-0
3,Ctrl_UVT_000003,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r3,NA|NA,,DN_TBP_TATATATA_pos5->AA_r3,chr1:0-0
4,Ctrl_UVT_000004,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r4,NA|NA,,DN_TBP_TATATATA_pos5->AA_r4,chr1:0-0
5,Ctrl_UVT_000005,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r5,NA|NA,,DN_TBP_TATATATA_pos5->AA_r5,chr1:0-0
6,Ctrl_UVT_000006,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r6,NA|NA,,DN_TBP_TATATATA_pos5->AA_r6,chr1:0-0
7,Ctrl_UVT_000007,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r7,NA|NA,,DN_TBP_TATATATA_pos5->AA_r7,chr1:0-0
8,Ctrl_UVT_000008,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r8,NA|NA,,DN_TBP_TATATATA_pos5->AA_r8,chr1:0-0
9,Ctrl_UVT_000009,GTACGAACGTATATATACGTACGTACGCACACATACACATACACAC...,DN_TBP_TATATATA_pos5->AA_r9,NA|NA,,DN_TBP_TATATATA_pos5->AA_r9,chr1:0-0


In [12]:
def removeRepString(string):
    """
    Remove the replicate suffix ("_r" followed by a number) from a description
    Input: Description string, ex/ "DN_TBP_TATATATA_pos5->AA_r0"
    Output: Description without the suffix, ex/ "DN_TBP_TATATATA_pos5->AA"
    Descriptions that do not end in a replicate suffix are returned unchanged,
    so names without replicates (ex/ "All_9mer_14bp_19069") keep their last field.
    """
    head, sep, tail = string.rpartition("_")
    if sep and tail.startswith("r") and tail[1:].isdigit():
        return head
    return string
removeRepString("DN_TBP_TATATATA_pos5->AA_r0")

'DN_TBP_TATATATA_pos5->AA'