These are common functions used in bioinformatics. Most of these functions minimize the use of external libraries.

These are NOT the most efficient implementations; the goal here is simplicity.

In [121]:
# Sliding-window approach to counting how many times a substring occurs in the primary string
# Example inputs: s1 is the primary string to scan and t1 is the substring to count.
s1 = "CGCGATACGTTACATACATGATAGACCGCGCGCGATCATATCGCGATTATC"
t1 = "CGCG"
def sliding_window(s: str, t: str):
    """Count the occurrences of ``t`` in ``s``, overlapping ones included.

    Every index of ``s`` is tried as a window start, and the slice of
    ``len(t)`` characters beginning there is compared with ``t``. Slices
    that run past the end of ``s`` are shorter than ``t`` and therefore
    never match a non-empty ``t``.

    Returns the number of matching start positions as an int.
    """
    width = len(t)
    return sum(1 for start in range(len(s)) if s[start:start + width] == t)
# Count the occurrences of t1 in s1 (the recorded cell output below is 5).
sliding_window(s1, t1)

5

In [67]:
# Finds k-mers and returns a dict showing how often each k-mer occurs
def frequent_words(s: str, k: int):
    """Tally every length-``k`` substring (k-mer) of ``s``.

    Each index of ``s`` is used as a window start; windows whose slice is
    not exactly ``k`` characters long (those running off the end of ``s``)
    are skipped.

    Returns a dict mapping each k-mer to its number of occurrences, with
    keys in order of first appearance.
    """
    counts = {}
    for start in range(len(s)):
        kmer = s[start:start + k]
        if len(kmer) != k:
            # Incomplete window: not a k-mer, so it is not counted.
            continue
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts
# Finds the most frequent k-mers and prints them on one line
def most_common(patterns: dict):
    """Print every key of ``patterns`` whose count equals the highest count.

    ``patterns`` maps k-mer strings to their counts (the shape produced by
    ``frequent_words``). The winning keys are printed in dict order on a
    single line, each followed by one space. Nothing is returned.
    """
    # The running maximum starts from 0, so 0 is kept as a floor here.
    peak = max([0, *patterns.values()])
    winners = [kmer for kmer, count in patterns.items() if count == peak]
    print("".join(kmer + " " for kmer in winners))

# Example: tally the 3-mers of s1 and print the most frequent one(s).
s1 = "TAAACGTGAGAGAAACGTGCTGATTACACTTGTTCGTGTGGTAT"
k1 = 3
most_common(frequent_words(s1, k1))
        

GTG 


In [68]:
# Finds the reverse complement of a sequence containing only ACTG
def reverse_compliment(s: str):
    """Return the reverse complement of the DNA string ``s``.

    Each base is swapped for its partner (A<->T, C<->G) and the result is
    then reversed. Only the upper-case letters A, C, G and T are known;
    any other character raises ``KeyError``.
    """
    pairing = {"A": "T", "C": "G", "T": "A", "G": "C"}
    complement = "".join(pairing[base] for base in s)
    return complement[::-1]
# Example: reverse complement of a short sequence.
s1 = "GCTAGCT"
reverse_compliment(s1)

'AGCTAGC'

In [124]:
# This is a simple but very slow solution
def match_pattern(s: str, p: str):
    """Return the start indices of every occurrence of ``p`` in ``s``.

    Each index of ``s`` is tried in turn and the slice of ``len(p)``
    characters starting there is compared with ``p``, so overlapping
    occurrences are all reported. Indices come back in ascending order.
    """
    width = len(p)
    return [start for start in range(len(s)) if s[start:start + width] == p]
# Example inputs: text s1 and the pattern p1 to locate in it.
s1 = "ATGACTTCGCTGTTACGCGC"
p1="CGC"

print(match_pattern(s1, p1))
# A better way is to use regex, which is much faster; however, re.finditer only yields non-overlapping matches, so it can miss some spots
import re
t = [m.start() for m in re.finditer(p1, s1)]
print(t)
# The best way is to use regex with a lookahead
# Wrapping the pattern in a zero-width lookahead "(?=...)" consumes no characters, which allows us to find overlapping occurrences too
# NOTE: p1 is inserted into the regex unescaped, and from here on p1 holds the lookahead form rather than the plain pattern
p1 = "(?="+p1+")"


tt = [m.start() for m in re.finditer(p1, s1)]
print(tt)


# Credits https://stackoverflow.com/questions/4664850/find-all-occurrences-of-a-substring-in-python

[7, 15, 17]
[7, 15]
[7, 15, 17]


In [125]:
from collections import defaultdict

def search(s: str, k: int, L: int, t: int):
    """Find k-mers that occur ``t`` times within a span of ``L`` characters.

    The string is scanned left to right. For each k-mer a list of its
    recent start positions is kept; positions whose distance to the end of
    the current k-mer exceeds ``L`` are dropped before the current start is
    recorded. A k-mer is reported as soon as its list holds exactly ``t``
    positions.

    Returns the set of qualifying k-mers.
    """
    recent_starts = defaultdict(list)
    clumps = set()

    for start in range(len(s) - k + 1):
        kmer = s[start:start + k]
        hits = recent_starts[kmer]
        window_end = start + k

        # Forget earlier occurrences that no longer fit, together with
        # the current one, inside a span of L characters.
        while hits and window_end - hits[0] > L:
            del hits[0]

        hits.append(start)
        if len(hits) == t:
            clumps.add(kmer)
    return clumps
# Example: report the 2-mers (k) of s1 that occur 2 times (t) with all
# occurrences fitting inside a span of 4 characters (L).
s1 = "AAAACGTCGAAAAA"
k = 2
L = 4
t = 2
print(search(s1, k, L, t))


{'AA'}


In [66]:
# Converts a sequence stored as an int back into its string form; k is the length of the desired sequence
def number_to_pattern(n: int, k: int):
    """Decode the integer ``n`` into a DNA string of length ``k``.

    ``n`` is read as a base-4 number with the digits 0, 1, 2, 3 standing
    for A, C, G, T. Exactly ``k`` digits are extracted, least significant
    first, and the result is returned most significant digit first.
    """
    symbols = {"0": "A", "1": "C", "2": "G", "3": "T"}
    bases = []
    for _ in range(k):
        n, digit = divmod(n, 4)
        bases.append(symbols[str(digit)])
    # Digits were collected from the low end, so flip them.
    bases.reverse()
    return "".join(bases)

# Example: decode the integer 5537 as a sequence of length 8.
print(number_to_pattern(5537, 8))

ACCCGGAC


In [62]:
# Converts a sequence to an int
def pattern_to_number(s: str):
    """Encode the DNA string ``s`` as an integer.

    The string is read as a base-4 number with A, C, G, T standing for the
    digits 0, 1, 2, 3 and the first character being the most significant
    digit. An empty string encodes to 0; any other character raises
    ``KeyError``.
    """
    value_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    total = 0
    for base in s:
        # Shift the digits seen so far one base-4 place and add this one.
        total = total * 4 + value_of[base]
    return total
# Example: encode a 17-base sequence as an integer.
pattern_to_number("GGATCTAAGTTAGTTTG")

10968298238

In [116]:
# Builds the frequency array of all k-mers in a sequence. The array has 4**k entries,
# so this is only practical for small k.
def compute_freq(s: str, k: int):
    """Return the k-mer frequency array of ``s`` as a space-separated string.

    The array has ``4**k`` slots. Slot ``pattern_to_number(kmer)`` holds
    the number of times ``kmer`` occurs in ``s``, overlaps included.

    Parameters:
        s: sequence made up of the characters that ``pattern_to_number``
           accepts.
        k: length of the k-mers to count.

    Returns:
        The counts in index order, each followed by a single space (so the
        string ends with a trailing space).
    """
    freq = [0] * (4 ** k)
    # Only positions 0 .. len(s) - k start a complete k-mer. Going further
    # would slice truncated windows and count them as shorter patterns.
    for i in range(len(s) - k + 1):
        freq[pattern_to_number(s[i:i + k])] += 1
    return "".join(str(count) + " " for count in freq)


In [3]:
def skew(seq: str):
    """Return the running G-minus-C skew of ``seq``.

    The result starts with 0 and gains one entry per character: the
    previous value plus 1 for G, minus 1 for C, unchanged for A or T.
    The returned list therefore has ``len(seq) + 1`` entries. Any other
    character raises ``KeyError``.
    """
    delta = {"C": -1, "A": 0, "T": 0, "G": 1}
    running = [0]
    for base in seq:
        running.append(running[-1] + delta[base])
    return running
# Example: running skew of a short sequence.
skew("CATGGGCATCGGCCATACGCC")


[0, -1, -1, -1, 0, 1, 2, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0, -1, 0, -1, -2]

In [4]:
def minimum_skew(seq: str):
    """Return every index at which the running skew of ``seq`` is lowest.

    The indices refer to the list returned by ``skew(seq)``, so index 0 is
    the position before the first character.
    """
    values = skew(seq)
    lowest = min(values)
    return [pos for pos, value in enumerate(values) if value == lowest]

[74249, 74249]