In [1]:
import random 
import math
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import Seq

In [2]:
def sampleSeq(length):
    DNA = ""
    for index in range(length):
        DNA += random.choice("ATGC")
    return DNA

def countACTG(seq):
    aCount = 0
    cCount = 0
    tCount = 0
    gCount = 0

    for bp in seq:
        if bp == "A":
            aCount = aCount + 1
        elif bp == "C":
            cCount = cCount + 1
        elif bp == "T":
            tCount = tCount + 1
        elif bp == "G":
            gCount = gCount + 1

    sequenceLength = len(seq)
    
    if (gCount+cCount)/sequenceLength >= 0.5 and (gCount+cCount)/sequenceLength <= 0.55 and cCount/sequenceLength >= 0.2 and cCount/sequenceLength <= 0.3 and aCount/sequenceLength <= 0.3:
        return True
    else:
        return False

def longruns(newSeq):
    if ("AAAA" in newSeq) or ("CCCC" in newSeq) or ("TTTTT" in newSeq) or ("GGGGG" in newSeq):
        return False
    else: 
        return True 

def slideComparison(seq1, seq2):
    maxOverlap = 0 
    #print(seq1)
    #print(seq2)
    for x in range(2*len(seq1)-1):
        overlap = 0
        #comparing from the start of seq1 and the end of seq2
        if x < len(seq1):  
            temp1 = seq1[0:x+1]
            temp2 = seq2[(-1-x):]
        #now sliding seq2 forward against seq1 after they're lined up
        else:
            temp1 = seq1[x-(len(seq1)-1):]
            temp2 = seq2[0:(len(seq2)-(x-(len(seq2)-1)))]
        #each alignment stored in temps to check whether overlap is greeatre than the current max
        for y in range(len(temp1)):
            if temp1[y] == temp2[y]:
                overlap += 1
        if overlap > maxOverlap:
            maxOverlap = overlap
    return maxOverlap

def slideComparison_debug(seq1, seq2):
    """
    """  
    maxOverlap = 0 
    maxSlide = 0 
    #print(seq1)
    #print(seq2)
    for x in range(2*len(seq1)-1):
        overlap = 0
        #comparing from the start of seq1 and the end of seq2
        if x < len(seq1):  
            temp1 = seq1[0:x+1]
            temp2 = seq2[(-1-x):]
        #now sliding seq2 forward against seq1 after they're lined up
        else:
            temp1 = seq1[x-(len(seq1)-1):]
            temp2 = seq2[0:(len(seq2)-(x-(len(seq2)-1)))]
        #each alignment stored in temps to check whether overlap is greeatre than the current max
        for y in range(len(temp1)):
            if temp1[y] == temp2[y]:
                overlap += 1
        if overlap > maxOverlap:
            maxOverlap = overlap
            maxSlide = x
        
    return maxOverlap, maxSlide

def allPossComp(seq1, seq2):
    maxComp = slideComparison(seq1, seq2)
    
    seq1rev = str((Seq(seq1)).reverse_complement())
    seq2rev = str((Seq(seq2)).reverse_complement())

    f1r1 = slideComparison(seq1, seq1rev)
    r1f2 = slideComparison(seq1rev, seq2)
    f1r2 = slideComparison(seq1, seq2rev)
    
    if f1r1 > maxComp:
        maxComp = r1f2
    if r1f2> maxComp:
        maxComp = r1f2
    if f1r2 > maxComp:
        maxComp = f1r2
        
    return maxComp

def allPossComp_debug(seq1, seq2):
    maxComp = slideComparison(seq1, seq2)
    seqtypes = "f1f2"
    
    seq1rev = str((Seq(seq1)).reverse_complement())
    seq2rev = str((Seq(seq2)).reverse_complement())

    # f1r1 = slideComparison(seq1, seq1rev)
    r1f2 = slideComparison(seq1rev, seq2)
    f1r2 = slideComparison(seq1, seq2rev)
    r1r2 = slideComparison(seq1rev, seq2rev)
    
    # if f1r1 > maxComp:
    #     maxComp = f1r1
    #     seqtypes = "f1r1"
    if r1f2 > maxComp:
        maxComp = r1f2
        seqtypes = "r1f2"
    if f1r2 > maxComp:
        maxComp = f1r2
        seqtypes = "f1r2"
    if r1r2 > maxComp:
        maxComp = r1r2
        seqtypes = "r1r2"
        
    return maxComp, seqtypes

def makeMatrix(seqList):
    matrix = np.zeros((len(seqList), len(seqList)))
    for curSeq in range(len(seqList)):
        for seq in range(len(seqList)):
            if seq > curSeq:
                matrix[curSeq][seq] = allPossComp(seqList[curSeq], seqList[seq])
                
    return(matrix)

In [3]:
def elimination(numSeq, length=20, length_overlap=10):
    initial = sampleSeq(length)
    while ((countACTG(initial) and longruns(initial))) == False:
        initial = sampleSeq(length)
    fullSeqList = [initial]
    
    while len(fullSeqList) < numSeq:
        # print(len(fullSeqList))
        
        # generate random sequences
        curSeq = sampleSeq(length)
        # make sure they pass the ACTG and longruns thresholds 
        while ((countACTG(curSeq) and longruns(curSeq))) == False:
            curSeq = sampleSeq(length)
            
        # make sure they don't cross hybridize
        qualified = True # qualified
        overlaps = []
        for indivSeq in range(len(fullSeqList)):
            overlap = allPossComp(curSeq, fullSeqList[indivSeq])
            overlap_shadow = allPossComp(fullSeqList[indivSeq], curSeq)
            
            if overlap != overlap_shadow:
                print(overlap, overlap_shadow)
                print(fullSeqList[indivSeq], curSeq)
                raise ValueError("something wrong")
                # break
            
            overlaps.append(overlap)
            
            if overlap >= length_overlap: 
                qualified = False  # not 
                # continue
            
        if qualified: 
            print(overlaps)
            fullSeqList.append(curSeq)
            
    return fullSeqList

In [4]:
def plot_mat(mat1):
    fig, ax = plt.subplots()
    minnum = np.min(mat1[mat1!=0])
    g = ax.imshow(mat1, cmap='summer', vmin=minnum)
    fig.colorbar(g)
    plt.show()

In [5]:


# 
num = 10
length = 20
length_overlap = 8


elimList = elimination(num, length=length, length_overlap=length_overlap)
mat2 = makeMatrix(elimList)
plot_mat(mat2)

print(np.max(mat2))
# print(elimList)

[7]
7 8
GACGATTGGCACTCAGACCT ACAGTAGCACGGTGGTCATA


ValueError: something wrong

In [6]:
seq1 = "GTCCTTAGGTCCGTCCTGTT"
seq2 = "CTACAGACACTGTCGTGTTC"
allPossComp(seq1, seq2)

8

In [7]:
allPossComp(seq2, seq1)

6