In [9]:
%matplotlib inline
# Import Modules
import pandas as pd
import numpy as np
from scipy import stats as ss
from statsmodels.stats.multitest import multipletests
from bokeh.plotting import figure, output_file, show, ColumnDataSource, output_file, save
from bokeh.models import HoverTool, Range1d, LabelSet, Label



#TBP Settings

folderWT = "/Users/zmielko/Desktop/PhD/PhD_Paper1/UV_Project_PBMdata/aTBP/TBPar"
folderUV = "/Users/zmielko/Desktop/PhD/PhD_Paper1/UV_Project_PBMdata/aTBP/UV_TBPar"

wtKmers = pd.read_csv(f"{folderWT}/TBPar_7mers_1111111.txt", 
                     sep = '\t')
uvKmers = pd.read_csv(f"{folderUV}/UV_TBPar_7mers_1111111.txt", 
                     sep = '\t')
# WLSR data
wls = pd.read_csv("/Users/zmielko/Desktop/WLSR_Code/Escore_6_8_vs_Escore_6_9_UV_CI_0.99.txt", sep = '\t')
kwls = pd.read_csv("/Users/zmielko/Desktop/WLSR_Code/EGR_kmers.txt", sep = '\t')
wls["kmer"] = kwls["kmerFwd"]
wls["kmerR"] = kwls["kmerRC"]
highBind = wls.query("pref < 0")
kDictWL = dict(zip(highBind["kmer"], highBind["pref"])) 
kDictWL.update(dict(zip(highBind["kmerR"], highBind["pref"])))

sortedUV = list(highBind.sort_values(by = "Escore_6_9_UV", ascending = False)["kmer"])
# Dict for all kmers
k = 7
kDictWT = dict(zip(wtKmers["7-mer"], wtKmers["E-score"])) 
kDictWT.update(dict(zip(wtKmers["7-mer.1"], wtKmers["E-score"])))
kDictUV = dict(zip(uvKmers["7-mer"], uvKmers["E-score"])) 
kDictUV.update(dict(zip(uvKmers["7-mer.1"], uvKmers["E-score"])))


def scorePos(seq, kDict, k = k):
    posList, scoreList = [], []
    for i in range(len(seq) - k + 1):
        scoreList.append(kDict[seq[i:i + k]])
        posList.append(i)
    scoreDF = pd.DataFrame({"Position":posList, "Score":scoreList})
    return(scoreDF)
def plotScores(qSeq, kDictWT = kDictWT, kDictUV = kDictUV):
    scoreDFWT = scorePos(qSeq, kDictWT)
    scoreDFUV = scorePos(qSeq, kDictUV)
    # Plotting
    barHeight = 7
    alphaVal = .8
    p = figure(plot_width=800, plot_height=600, y_range = (-0.55, 0.55))
    p.xaxis.major_label_text_font_size = "15pt"
    p.yaxis.major_label_text_font_size = "15pt"
    p.yaxis.axis_label_text_font_size = "25pt"
    p.xaxis.axis_label_text_font_size = "25pt"
    p.yaxis.axis_label = f"E-score"
    p.xaxis.axis_label = f"K-mer Position"
    sourceWT = ColumnDataSource(scoreDFWT)
    sourceUV = ColumnDataSource(scoreDFUV)
    p.line(x="Position", y="Score",color="#0B0055",line_width=6,
                source = sourceWT,alpha = alphaVal)
    p.line(x="Position", y="Score",color="#F86302",line_width=6,
                source = sourceUV,alpha = alphaVal)
    show(p)
    
def extRight(seq, k, prefSet, seenSet):
    queryStart = len(seq) - k + 1
    querySeq = seq[queryStart:]
    canExt = 0
    for i in ['A', 'C', 'G', 'T']:
        if querySeq + i in prefSet and querySeq + i not in seenSet:
            canExt = 1
            seenSet.add(querySeq + i)
            extRight(seq + i, k, prefSet, seenSet)
    if canExt == 0:
        genSeqList.append(seq)
def extLeft(seq, k, prefSet, seenSet):
    querySeq = seq[:k-1]
    canExt = 0
    for i in ['A', 'C', 'G', 'T']:
        if i + querySeq in prefSet and i + querySeq not in seenSet:
            canExt = 1
            seenSet.add(i + querySeq)
            extLeft(i + seq, k, prefSet, seenSet)
    if canExt == 0:
        genSeqLeftList.append(seq)

prefSet = set(list(highBind["kmer"]) + list(highBind["kmerR"]))
totalProbes = set()
for i in sortedUV:
    genSeqList = []
    ss = set()
    extRight(i, 6, prefSet = prefSet, seenSet = ss)
    genSeqLeftList = []
    for j in genSeqList:
        extLeft(i, 6, prefSet = prefSet, seenSet = ss)
    totalProbes.update(genSeqLeftList)

totalProbesList = list(totalProbes)
len(totalProbes)
def uniqueKmer(seq, k):
    kSet = set()
    for i in range(len(seq) - k + 1):
        kSet.add(seq[i:i+k])
    return(kSet)
allSet = set()
shortProbes = []
noextension = []
for i in totalProbesList:
    iLen = len(i)
    if iLen >= 25:
        allSet.update(uniqueKmer(i, 25))
    elif iLen == 6:
        noextension.append(i)
    else:
        shortProbes.append(i)
len(allSet)

# Gen Scores

prefScoreDF = wls.query("pref < 0").copy(deep = True)
prefScoreDF["Diff"] = prefScoreDF["Escore_6_9_UV"] - prefScoreDF["Escore_6_8"]
prefScoreDF["pref"] = prefScoreDF["pref"] * -1
kDictScorePref = dict(zip(prefScoreDF["kmer"], prefScoreDF["pref"])) 
kDictScorePref.update(dict(zip(prefScoreDF["kmerR"], prefScoreDF["pref"])))
kDictScoreDiff = dict(zip(prefScoreDF["kmer"], prefScoreDF["Diff"])) 
kDictScoreDiff.update(dict(zip(prefScoreDF["kmerR"], prefScoreDF["Diff"])))
def sumScore(seq, k, kScoreDict):
    score = 0
    for i in range(len(seq) - k + 1):
        score = score + kScoreDict[seq[i:i+k]]
    return(score)

allList = list(allSet)
sList = []
for i in allList:
    sList.append(sumScore(i, 6, kDictScoreDiff))
scoredProbes = pd.DataFrame({"probe":allList, "score":sList})
list(scoredProbes.sort_values(by = "score", ascending = False).reset_index(drop=True)["probe"][:100])

['AGGGTATTGGGGGATATCCCCCATA',
 'TGGGTATTGGGGGATATCCCCCATA',
 'GGGGTATTGGGGGATATCCCCCATA',
 'CCGAATGGGGGATATCCCCCATATG',
 'GGGTATTGGGGGATATCCCCCATAT',
 'ATGGGTATTGGGGGATATCCCCCAT',
 'TAGGGTATTGGGGGATATCCCCCAT',
 'TATGGGGGATATCCCCCATATGAGG',
 'TTAGGGTATTGGGGGATATCCCCCA',
 'GTTAGGGTATTGGGGGATATCCCCC',
 'TATCCCCCAATACCCATATGGGTTA',
 'GATATGGGGGATATCCCCCATATGA',
 'TGATATGGGGGATATCCCCCATATG',
 'TGGGGTATTGGGGGATATCCCCCAT',
 'TCGAATGGGGGATATCCCCCATATG',
 'ACGAATGGGGGATATCCCCCATATG',
 'CTGATATGGGGGATATCCCCCATAT',
 'TTGGGGGATATCCCCCATATGAGGT',
 'CGAATGGGGGATATCCCCCATATGA',
 'TTTGGGGGATATCCCCCATATGAGG',
 'ATTGGGGGATATCCCCCATATGAGG',
 'AATGGGGGATATCCCCCATATGAGG',
 'ATATGGGGGATATCCCCCATATGAG',
 'ATGGGGGATATCCCCCATATGAGGT',
 'CTGGTATTGGGGGATATCCCCCATA',
 'AATCCCCCAATACCCATATGGGTTA',
 'TGGTATTGGGGGATATCCCCCATAT',
 'TGGGGGATATCCCCCATATGAGGTA',
 'GAATGGGGGATATCCCCCATATGAG',
 'GGGGGATATCCCCCATATGAGGTAA',
 'GGTATTGGGGGATATCCCCCATATG',
 'GTATTGGGGGATATCCCCCATATGA',
 'ATGGGATATCCCCCATATGAGGTAA',
 'GTTACCCC

In [15]:
plotScores("GTAACATCAGGTATACCTGATGTTA")

In [10]:
import numpy as np
from sklearn.cluster import AffinityPropagation
import distance

inputList = list(scoredProbes.sort_values(by = "score", ascending = False).reset_index(drop=True)["probe"][:100])
words = np.asarray(inputList) #So that indexing with a list will work
lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)


AffinityPropagation(affinity='precomputed', convergence_iter=15, copy=True,
                    damping=0.5, max_iter=200, preference=None, verbose=False)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [12]:
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

 - *TGGGTATTGGGGGATATCCCCCATA:* AGGGTATTGGGGGATATCCCCCATA, ATGGGTATTGGGGGATATCCCCCAT, CTGGTATTGGGGGATATCCCCCATA, GGGGTATTGGGGGATATCCCCCATA, GGGTATTGGGGGATATCCCCCATAT, GGTATTGGGGGATATCCCCCATATG, GTTAGGGTATTGGGGGATATCCCCC, TAGGGTATTGGGGGATATCCCCCAT, TGGGGTATTGGGGGATATCCCCCAT, TGGGTATTGGGGGATATCCCCCATA, TGGTATTGGGGGATATCCCCCATAT, TTAGGGTATTGGGGGATATCCCCCA
 - *CGAATGGGGGATATCCCCCATATGA:* ACGAATGGGGGATATCCCCCATATG, ATATGGGGGATATCCCCCATATGAG, CCGAATGGGGGATATCCCCCATATG, CGAATGGGGGATATCCCCCATATGA, CTGATATGGGGGATATCCCCCATAT, GAATGGGGGATATCCCCCATATGAG, GATATGGGGGATATCCCCCATATGA, GTATTGGGGGATATCCCCCATATGA, TATTGGGGGATATCCCCCATATGAG, TCGAATGGGGGATATCCCCCATATG, TGATATGGGGGATATCCCCCATATG
 - *ATGGGGGATATCCCCCATATGAGGT:* AATGGGGGATATCCCCCATATGAGG, AGACCTCATATCCCCCATATGAGGT, ATGGGGGATATCCCCCATATGAGGT, ATTGGGGGATATCCCCCATATGAGG, TATGGGGGATATCCCCCATATGAGG, TGGGGGATATCCCCCATATGAGGTA, TTGGGGGATATCCCCCATATGAGGT, TTTGGGGGATATCCCCCATATGAGG
 - *GGGGATATCCCCCATATGAGGTAAC:* ACCGATATCCCCCATATGAGGTAAC, ATGGATATCCC