These are common functions used in bioinformatics. Most of these functions minimize the use of external libraries.

I have included links to solutions that were developed by other individuals. Hopefully this can be of use. 

In [None]:
# Sliding Window Approach to counting how many times a substring occurs in the primary string
s1 = "CGCGATACGTTACATACATGATAGACCGCGCGCGATCATATCGCGATTATC"
t1 = "CGCG"
def sliding_window(s: str, t: str):
    """Count how many times ``t`` occurs in ``s``, overlaps included.

    Every index of ``s`` is tried as a possible start of ``t``, so
    overlapping occurrences are each counted.
    """
    return sum(1 for start in range(len(s)) if s.startswith(t, start))
sliding_window(s1, t1)

In [None]:
# Finds k-mers and returns a dict mapping each k-mer to how often it occurs
def frequent_words(s: str, k: int):
    """Tally every k-mer of ``s``.

    Returns a dict mapping each length-``k`` substring to the number of
    (possibly overlapping) times it occurs. Keys appear in order of first
    occurrence.
    """
    tally = {}
    for start in range(len(s)):
        kmer = s[start:start + k]
        if len(kmer) != k:
            # Window ran past the end of the sequence.
            continue
        tally[kmer] = tally.get(kmer, 0) + 1
    return tally
# Finds the most frequent kmers and returns them
def most_common(patterns: dict):
    """Return the keys of ``patterns`` that share the highest count.

    ``patterns`` maps k-mers to counts (as produced by ``frequent_words``).
    The winners are printed space-separated and returned as a list, in the
    dict's iteration order. An empty dict gives an empty list.

    The list no longer ends with a spurious empty string, which the
    previous ``t.split(" ")`` on a trailing-space string produced.
    """
    top = max(patterns.values(), default=0)
    highest = [kmer for kmer, n in patterns.items() if n == top]
    print(" ".join(highest))
    return highest

s1 = "TAAACGTGAGAGAAACGTGCTGATTACACTTGTTCGTGTGGTAT"
k1 = 3
most_common(frequent_words(s1, k1))
        

In [None]:
# Finds the reverse complement of a sequence containing only ACTG
def reverse_compliment(s: str):
    """Return the reverse complement of the DNA string ``s``.

    Each base is swapped for its partner (A<->T, C<->G) and the result is
    reversed. Any character other than A, C, G or T raises ``KeyError``.
    """
    pairing = {"A":"T", "C":"G", "T":"A", "G":"C"}
    complemented = "".join(pairing[base] for base in s)
    return complemented[::-1]
s1 = "GCTAGCT"
reverse_compliment(s1)

In [None]:
# This is a simple but very slow solution
def match_pattern(s: str, p: str):
    """Return every start index at which ``p`` occurs in ``s``.

    All indices of ``s`` are tested, so overlapping occurrences are
    reported. Indices are returned in ascending order.
    """
    return [start for start in range(len(s)) if s.startswith(p, start)]
s1 = "ATGACTTCGCTGTTACGCGC"
p1="CGC"

print(match_pattern(s1, p1))
# A better way is to use regex, which is much faster; however, plain finditer
# only reports non-overlapping matches, so it can miss some start positions.
import re
t = [m.start() for m in re.finditer(p1, s1)]
print(t)
# The best way is regex with a lookahead.
# Wrapping the pattern in (?=...) makes every match zero-width, which allows us
# to report overlapping occurrences as well.
p1 = "(?="+p1+")"


tt = [m.start() for m in re.finditer(p1, s1)]
print(tt)


# Credits https://stackoverflow.com/questions/4664850/find-all-occurrences-of-a-substring-in-python

In [None]:
from collections import defaultdict

def search(s: str, k: int, L: int, t: int):
    """Find k-mers that occur ``t`` times inside some window of length ``L``.

    Scans ``s`` left to right, remembering for each k-mer the start
    positions still close enough to the current one. Returns the set of
    k-mers whose remembered-position list reaches length ``t``.
    """
    starts_by_kmer = defaultdict(list)
    clumped = set()

    last_start = len(s) - k
    for start in range(last_start + 1):
        kmer = s[start:start + k]
        starts = starts_by_kmer[kmer]
        window_end = start + k

        # Forget earlier starts whose span up to the end of the current
        # occurrence is longer than L.
        while starts and window_end - starts[0] > L:
            del starts[0]

        starts.append(start)
        if len(starts) == t:
            clumped.add(kmer)
    return clumped
s1 = "AAAACGTCGAAAAA"
k = 2
L = 4
t = 2
print(search(s1, k, L, t))


In [None]:
# Converts a sequence stored as an int back into its nucleotide string; k is the length of the desired sequence
def number_to_pattern(n: int, k: int):
    """Decode the integer ``n`` into a length-``k`` nucleotide string.

    ``n`` is read as a base-4 number with digits 0, 1, 2, 3 standing for
    A, C, G, T. Exactly ``k`` digits are produced, most significant first.
    """
    nuc = {"0":"A", "1":"C", "2":"G", "3":"T"}
    symbols = []
    for _ in range(k):
        # Peel off the least significant base-4 digit.
        n, digit = divmod(n, 4)
        symbols.append(nuc[str(digit)])
    symbols.reverse()
    return "".join(symbols)

print(number_to_pattern(5537, 8))

In [None]:
# Converts sequence to int
def pattern_to_number(s: str):
    """Encode the nucleotide string ``s`` as a base-4 integer.

    A, C, G, T are the digits 0, 1, 2, 3; the first character is the most
    significant digit. An empty string encodes to 0.
    """
    digit_of = {"A":0, "C":1, "G":2, "T":3}
    value = 0
    for base in s:
        # Shift the digits read so far one place left, then add the new one.
        value = value * 4 + digit_of[base]
    return value
pattern_to_number("GGATCTAAGTTAGTTTG")

In [None]:
# Find the frequency of every possible k-mer in a seq
def compute_freq(s: str, k: int):
    """Return the k-mer frequency array of ``s`` as a space-separated string.

    Slot ``pattern_to_number(kmer)`` of a ``4**k``-long array holds the
    number of times that k-mer occurs in ``s``. The counts are returned as
    one string, each count followed by a single space.
    """
    freq = [0] * (4**k)
    # There are exactly len(s) - k + 1 full-length windows. The previous
    # bound of len(s) - 1 was only right for k == 2; for other k it skipped
    # windows or counted truncated ones under the wrong index.
    for i in range(len(s) - k + 1):
        freq[pattern_to_number(s[i:i+k])] += 1
    return "".join(str(n) + " " for n in freq)


In [None]:
def skew(seq: str):
    """Return the running G-minus-C skew along ``seq``.

    The result starts with 0 and has one more entry than ``seq``: each G
    adds 1, each C subtracts 1, and A/T leave the running value unchanged.
    """
    delta = {"C":-1, "A": 0, "T": 0, "G": 1}
    running = 0
    prefix = [running]
    for base in seq:
        running += delta[base]
        prefix.append(running)
    return prefix
skew("CATGGGCATCGGCCATACGCC")


In [None]:
def minimum_skew(seq: str):
    """Print and return the positions where the skew of ``seq`` is lowest.

    Positions index the list produced by ``skew`` (so 0 means "before the
    first base"). The list is returned as well as printed, matching
    ``max_skew``; previously the result was only printed.
    """
    skews = skew(seq)
    mins = min(skews)
    t = [i for i, value in enumerate(skews) if value == mins]
    print(t)
    return t
minimum_skew("GATACACTTCCCGAGTAGGTACTG")

def max_skew(seq: str):
    """Return the positions where the skew of ``seq`` is highest.

    Positions index the list produced by ``skew``.
    """
    values = skew(seq)
    peak = max(values)
    return [pos for pos, value in enumerate(values) if value == peak]
#max_skew("CATTCCAGTACTTCATGATGGCGTGAAGA")

In [None]:
def hamming_distance(seq1: str, seq2: str):
    """Count the positions at which ``seq1`` and ``seq2`` differ.

    Returns -1 when the two strings have different lengths.
    """
    #https://pythonadventures.wordpress.com/2010/10/19/hamming-distance/
    if len(seq1) != len(seq2):
        return -1
    mismatches = 0
    for ch1, ch2 in zip(seq1, seq2):
        if ch1 != ch2:
            mismatches += 1
    return mismatches
        
s1 = "TGACCCGTTATGCTCGAGTTCGGTCAGAGCGTCATTGCGAGTAGTCGTTTGCTTTCTCAAACTCC"
s2 = "GAGCGATTAAGCGTGACAGCCCCAGGGAACCCACAAAACGTGATCGCAGTCCATCCGATCATACA"
hamming_distance(s1, s2)
    

In [None]:
def pattern_occurences(seq: str, pattern: str, hamming: int):
    """Return start indices where ``pattern`` matches ``seq`` approximately.

    A start index is kept when ``hamming_distance`` between the window
    beginning there and ``pattern`` is at most ``hamming``. Windows for
    which ``hamming_distance`` reports -1 are skipped.
    """
    width = len(pattern)
    hits = []
    for start in range(len(seq)):
        dist = hamming_distance(seq[start:start + width], pattern)
        if dist == -1 or dist > hamming:
            continue
        hits.append(start)
    return hits
    
s = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT"
p = "ATTCTGGA"
h = 3
pattern_occurences(s,p,h)

In [None]:
import itertools
def mutations(word, hamming_distance, charset='ATCG'):
    """Yield variants of ``word`` with ``hamming_distance`` positions rewritten.

    For every choice of that many positions, each chosen position is set to
    every letter of ``charset`` in turn. A letter may be replaced by
    itself, so the output contains duplicates and words that differ from
    ``word`` in fewer positions.
    """
    # https://stackoverflow.com/questions/19941079/inverse-of-hamming-distance
    positions = range(len(word))
    for chosen in itertools.combinations(positions, hamming_distance):
        for letters in itertools.product(charset, repeat=hamming_distance):
            swaps = dict(zip(chosen, letters))
            yield "".join(swaps.get(pos, ch) for pos, ch in enumerate(word))

def count(seq: str, pattern: str, h: int):
    """Count occurrences in ``seq`` of ``pattern`` and its ``mutations`` variants.

    Each distinct string produced by ``mutations(pattern, h)`` is searched
    for in ``seq`` and the number of hits is summed.

    The search uses a zero-width lookahead so overlapping hits (for example
    "AA" twice in "AAA") are each counted. Previously the lookahead was
    built on ``pattern`` and never used, so the search was non-overlapping.
    """
    total = 0
    for neighbor in set(mutations(pattern, h)):
        lookahead = "(?=" + re.escape(neighbor) + ")"
        total += sum(1 for _ in re.finditer(lookahead, seq))
    return total

s = "TACGCATTACAAAGCACA"
p = "AA"
h = 1
count(s, p, h)


In [None]:
def neighbors(s, h):
    """Print how many distinct strings ``mutations(s, h)`` produces.

    The distinct strings are also returned as a sorted list, so callers
    can use them; previously they were built and then discarded.
    """
    c = sorted(set(mutations(s, h)))
    print(len(c))
    return c
neighbors("TGCAT", 2)

In [None]:
from collections import OrderedDict
from operator import itemgetter


def kmers_finder_with_mismatches(seq: str, k: int, h: int, most_common=False):
    """Count k-mers of ``seq``, pooling k-mers within ``h`` mismatches.

    For every k-mer that occurs in ``seq``, the score is the total number of
    occurrences of all observed k-mers that differ from it in at most ``h``
    positions (itself included).

    Returns an ``OrderedDict`` of k-mer -> score, highest score first. With
    ``most_common=True`` only the k-mers tied for the highest score are
    returned.

    The function now reads its own ``seq``/``k``/``h`` parameters instead of
    the module globals ``sequence``/``motif_length``/``max_mismatches``, and
    no longer indexes a dict view (which fails on Python 3).
    """
    # https://gist.github.com/alec-djinn/9018370
    # Exact occurrence count of each k-mer, in order of first appearance.
    exact_counts = {}
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        exact_counts[kmer] = exact_counts.get(kmer, 0) + 1

    # Pool the counts of every observed k-mer within h mismatches.
    totals = {}
    for kmer in exact_counts:
        totals[kmer] = sum(
            n for other, n in exact_counts.items()
            if sum(a != b for a, b in zip(kmer, other)) <= h
        )

    result = OrderedDict(sorted(totals.items(), key=itemgetter(1), reverse=True))
    if most_common and result:
        # result is sorted descending, so the first value is the maximum.
        best = next(iter(result.values()))
        return OrderedDict((kmer, n) for kmer, n in result.items() if n == best)
    return result

sequence = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'

motif_length = 4
max_mismatches = 1
a = kmers_finder_with_mismatches(sequence, motif_length, max_mismatches, most_common=False)
print(a)


In [None]:
# https://stackoverflow.com/questions/45802748/dna-motif-enumeration-with-try-except-and-loops-python3
# This stack overflow posts shows some really powerfull ways to solve basic problems using some python optimizations
def combination(k):
    """Return a lazy iterable over every length-``k`` string on 'ATCG'."""
    letter_tuples = itertools.product('ATCG', repeat=k)
    return (''.join(letters) for letters in letter_tuples)

def hamming_distance(pattern, seq):
    """Count mismatching positions between ``pattern`` and ``seq``.

    Characters are compared pairwise; if the lengths differ, the extra
    trailing characters of the longer string are ignored.
    """
    differing = 0
    for c1, c2 in zip(pattern, seq):
        differing += c1 != c2
    return differing

def window(s, k):
    """Yield each length-``k`` slice of ``s``, moving left to right."""
    last_start = len(s) - k
    yield from (s[start:start + k] for start in range(last_start + 1))

def motif_enumeration(seq: str, k: int, d: int):
    """Return every k-mer found, with at most ``d`` mismatches, in all strings.

    ``seq`` is iterated as a collection of strings. A candidate from
    ``combination(k)`` is kept when every string has at least one
    ``window`` whose ``hamming_distance`` to the candidate is ``<= d``.
    """
    def occurs_in(candidate, string):
        # True as soon as one window of `string` is close enough.
        for pat in window(string, k):
            if hamming_distance(candidate, pat) <= d:
                return True
        return False

    found = set()
    for combo in combination(k):
        appears_everywhere = True
        for string in seq:
            if not occurs_in(combo, string):
                appears_everywhere = False
                break
        if appears_everywhere:
            found.add(combo)
    return found
            
        
s =["AAGAAGCTTAGCCATTCGAAACACC", "GAGCGGTTGCGGCATGAAATTTTCA", "CCTAAGCCATCATCCAGTTCAATGA", "AGGTTGAACGGGATTGCCATATGCT", "TGTCTTCCCTATTTTGCCGCGACAT", "GGAAAGCCTAGTCATGCTCAATCGA"]
k = 5
d = 1
motif_enumeration(s, k, d)

In [None]:
# https://github.com/jarecot/Rosalind/blob/master/Textbook_03B.py
from itertools import product
def median_string(k: int, seqs: list):
    """Return the k-mer with the smallest summed ``motif_score`` over ``seqs``.

    Candidates are all length-``k`` strings on 'ACGT' in ``product`` order;
    on ties the earliest candidate wins.
    """
    def total_distance(candidate):
        return sum([motif_score(candidate, seq) for seq in seqs])

    candidates = (''.join(letters) for letters in product('ACGT', repeat=k))
    # min() keeps the first of several equally good candidates.
    return min(candidates, key=total_distance)

def motif_score(pattern, motif):
    """Return the smallest ``hamming_distance`` between ``pattern`` and any
    window of ``motif`` that has the same length as ``pattern``."""
    width = len(pattern)
    distances = []
    for start in range(len(motif) - width + 1):
        distances.append(hamming_distance(motif[start:start + width], pattern))
    return min(distances)
seqs = ["CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC", "GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC","GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG"]
k = 7
median_string(k, seqs)

In [None]:
from operator import mul
from functools import reduce


#https://github.com/minw2828/Coursera---Bioinformatics-Algorithms/blob/master/chapter3/C3_39/39_3/pmpkp.py
from functools import reduce
import operator

s = "ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT"
k = 5
p = [[.2, .2, .3, .2, .3], [.4, .3, .1, .5, .1], [.3, .3, .5, .2, .4], [.1, .2, .1, .1, .2]]
def find_kmers(seq, k):
    """Return the set of distinct length-``k`` substrings of ``seq``."""
    kmers = set()
    for start in range(len(seq) - k + 1):
        kmers.add(seq[start:start + k])
    return kmers

def calculate(kmer, profile, order={"A":0, "C":1, "G":2, "T":3}):
    """Return the product of the profile entries selected by ``kmer``.

    ``profile[row][col]`` is read with ``row = order[base]`` and ``col`` the
    position of ``base`` in ``kmer``. An empty ``kmer`` gives 1.
    """
    # Look every entry up first, then multiply them left to right.
    entries = [profile[order[base]][col] for col, base in enumerate(kmer)]
    likelihood = 1
    for entry in entries:
        likelihood = likelihood * entry
    return likelihood
    

def profile(seq, k, profile):
    """Return the k-mer of ``seq`` with the highest ``calculate`` score.

    Candidates come from ``find_kmers(seq, k)`` and each is scored against
    the ``profile`` matrix.
    """
    scored = [(kmer, calculate(kmer, profile)) for kmer in find_kmers(seq, k)]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    best_kmer, _ = scored[0]
    return best_kmer
profile(s, k, p)


In [None]:
from sys import maxsize
def distanceBetweenPatternAndStrings(p, seq):
    """Sum, over the strings in ``seq``, the closest distance to ``p``.

    For each string the smallest ``hamming_distance`` between ``p`` and any
    window of length ``len(p)`` is taken. A string with no such window
    contributes ``maxsize``.
    """
    k = len(p)
    total = 0
    for s in seq:
        closest = maxsize
        for start in range(len(s) - k + 1):
            d = hamming_distance(p, s[start:start + k])
            if d < closest:
                closest = d
        total += closest
    return total
p = "AAA"
s = [i for i in "TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT".split(" ")]
distanceBetweenPatternAndStrings(p, s)

In [None]:
#Find the mass of a protein using monoisotopic masses
# Mass table keyed by one-letter amino-acid code.
prot_mass = {
    "A":71.03711, "C":103.00919, "D":115.02694, "E":129.04259, "F":147.06841, "G":57.02146, "H":137.05891, 
    "I":113.08406, "K":128.09496, "L":113.08406, "M":131.04049, "N":114.04293, "P":97.05276, "Q":128.05858,
    "R":156.10111, "S": 87.03203, "T":101.04768, "V": 99.06841, "W":186.07931, "Y":163.06333
}
s = "SKADYEK"
# The protein's mass is the sum of the table entries for each of its letters.
mass = 0
for i in s:
    mass += prot_mass[i]
print(mass)


In [None]:
#Predict Dominant Allele is in two organisms randomly selected from pop - Rosalind Mendels First Law
# scipy.misc.comb is no longer available in SciPy; scipy.special.comb is the
# supported function with the same call signature.
from scipy.special import comb
hom = 27
het = 23
rec = 27
# Each unordered pair of parents is weighted by 4 possible offspring genotypes.
total = 4*comb(hom+het+rec, 2)
# Recessive offspring: rec x rec gives 4 of 4, rec x het 2 of 4, het x het 1 of 4.
total_rec = 4*comb(rec, 2) + 2*rec*het + comb(het,2)
x = 1- (total_rec/total)
print(x)

In [None]:
#Calculating Expected Dominant displaying Offspring from population
# 'Path' is a placeholder for the input file, whose first line holds the six
# couple counts. The with-block closes the file, which was previously left open.
with open('Path','r') as file:
    pop = [float(x) for x in file.readline().split()]
# Weight applied to each of the six couple counts.
multiplier = [1.0,1.0,1.0,0.75,0.5,0]

exp = 0
for x in range(6):
    # Every couple contributes 2 offspring, scaled by its weight.
    exp = exp + 2* pop[x]*multiplier[x]
print(exp)

In [None]:
#Rosalind find a shared motif
from Bio import SeqIO
def shared_motif(file):
    """Print and return a longest substring shared by every FASTA record.

    The records of ``file`` are read with ``SeqIO.parse`` and sorted by
    length. Substrings of the shortest sequence are tested against all the
    others. When several shared substrings have the maximal length, the one
    starting earliest in the shortest sequence is returned.
    """
    # The with-block closes the file; the handle was previously leaked.
    with open(file, "r") as f:
        seqs = sorted([record.seq for record in SeqIO.parse(f, "fasta")], key=len)
    short, the_rest = seqs[0], seqs[1:]
    motif = ""
    for i in range(len(short)):
        # Only candidates longer than the current best can replace it, so
        # start one character past that length.
        for j in range(i + len(motif), len(short)):
            temp_motif = short[i:j+1]
            if not all(temp_motif in long_seq for long_seq in the_rest):
                # Extending a substring that is already missing from some
                # sequence cannot make it shared, so stop extending here.
                break
            motif = temp_motif
    print(motif)
    return motif
                
            
            
shared_motif("Path")
    

In [None]:
#Rosalind Mortal Fib Rabbits, rosalind solution -- https://duphan.wordpress.com/2015/07/10/dynamic-programming-and-mortal-fibonacci-rabbits/
def rabbits(n, k=1):
    """Return the number of rabbit pairs alive in month ``n``.

    ``k`` is the length of the age list kept per month: each month the
    pairs in every age slot but the first produce one newborn pair each,
    all pairs move one slot along, and the last slot is dropped.
    """
    cohort = [1] + [0] * (k - 1)
    for _ in range(n - 1):
        newborns = sum(cohort[1:])
        survivors = cohort[:-1]
        cohort = [newborns] + survivors
    return sum(cohort)
rabbits(6, k=3)

In [None]:
#Rosalind Inferring mRNA from protein

def prot_to_mrna(seq):
    """Return how many mRNA strings encode ``seq``, modulo 1,000,000.

    The per-letter codon counts are multiplied together, along with a
    final factor of 3.
    """
    codons_for = {
        "F":2, "L":6, "I":3, "M":1, "V":4, "A":4, "T":4, "P":4, "S":6, "Y":2, "stop":3, "H":2, "Q":2, "N":2, "K":2, 
        "D":2, "E":2, "C":2, "W":1, "R":6, "G":4}
    modulus = 1000000
    ways = 1
    for residue in seq:
        # Reducing at every step gives the same remainder as reducing once.
        ways = (ways * codons_for[residue]) % modulus
    return (ways * 3) % modulus

x="MA"
prot_to_mrna(x)

In [None]:
#Rosalind Mendels Second Law
#Assume starting pop is AaBb, each organism has 2 children, each one mates with an AaBb pop member
#Find prob that at least n organisms in the k'th generation of the family will be AaBb
import math
def mendels_second_law(k, N):
    """Return the probability of at least ``N`` successes out of ``2**k``.

    Sums the binomial terms for ``N`` up to ``2**k`` successes, each trial
    succeeding with probability 0.25.
    """
    P = 2**k
    p_hit, p_miss = 0.25, 0.75
    probability = 0
    for i in range(N, P + 1):
        # Number of ways to pick which i of the P trials succeed.
        ways = math.factorial(P) / (math.factorial(i) * math.factorial(P - i))
        probability += ways * (p_hit**i) * (p_miss**(P - i))
    return probability
mendels_second_law(5, 8)

In [None]:
#Rosalind Consensus Profile
import operator
from Bio import SeqIO
def profile(file):
    """Build, print and return the per-position base counts of a FASTA file.

    Returns a dict with keys "A", "C", "G", "T"; each value is a list, as
    long as the longest record, whose entry ``i`` is the number of records
    having that base at position ``i``. One line per base is printed in the
    form ``A: 5 1 0``.
    """
    # The with-block closes the file; the handle was previously leaked.
    with open(file) as f:
        seqs = sorted([record.seq for record in SeqIO.parse(f, "fasta")], key=len)
    width = len(seqs[-1])  # sorted by length, so the last record is the longest
    counts = {base: [0] * width for base in "ACGT"}

    for s in seqs:
        for pos, base in enumerate(s):
            # Characters other than A/C/G/T are ignored.
            if base in counts:
                counts[base][pos] += 1

    for base in "ACGT":
        row = "".join(str(n) + " " for n in counts[base])
        print(base + ":", row)
    return counts

def consensus(prof: dict):
    """Return the consensus string of a profile.

    ``prof`` maps "A", "C", "G", "T" to equally long count lists. For each
    position the base with the highest count is chosen; ties go to the
    first of A, C, T, G.
    """
    width = len(prof["A"])
    # The candidate order "ACTG" decides ties.
    return "".join(
        max("ACTG", key=lambda base: prof[base][col]) for col in range(width)
    )

print(consensus(profile("file")))

In [None]:
#Rosalind Random Strings
import math
AT = 0
GC = 0
def random_string(file):
    """Print and return log10 probabilities of a DNA string per GC-content.

    ``file`` holds a DNA string and a line of whitespace-separated
    GC-content values. For each value ``x`` the result is the log10 of the
    probability that a random string with GC-content ``x`` equals the DNA
    string: each A/T has probability ``(1 - x) / 2`` and each G/C ``x / 2``.

    The values, formatted to three decimals, are printed space-separated
    and returned as a list of strings.

    Raises ``ValueError`` (from ``math.log10``) if a GC-content value gives
    a needed base a probability of zero.
    """
    AT = 0
    GC = 0
    GC_contents = []
    # The with-block closes the file; the handle was previously leaked.
    with open(file, "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            try:
                GC_contents = [float(x) for x in tokens]
            except ValueError:
                # Not a line of numbers, so treat it as sequence data.
                AT += line.count("A") + line.count("T")
                GC += line.count("G") + line.count("C")
    probs = []
    for gc in GC_contents:
        # Sum logs rather than taking the log of a product: the product of
        # many small factors underflows to 0.0 for long strings.
        log_prob = 0.0
        if AT:
            log_prob += AT * math.log10((1 - gc) / 2)
        if GC:
            log_prob += GC * math.log10(gc / 2)
        probs.append('%0.3f' % log_prob)
    print(*probs, sep=' ')
    return probs


In [None]:
def count_bases(seq):
    """Return a dict mapping each symbol of ``seq`` to its number of occurrences.

    Keys appear in order of first occurrence.
    """
    tally = {}
    for symbol in seq:
        tally[symbol] = tally.get(symbol, 0) + 1
    return tally
count_bases("AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC")

In [4]:
from Bio import ExPASy
from Bio import SwissProt
def prot_func(prot_id):
    """Print selected GO terms of the Swiss-Prot record ``prot_id``.

    The record is fetched with ``ExPASy.get_sprot_raw``. For every cross
    reference whose first field is 'GO' and whose third field starts with
    'P', the third field is printed without its first two characters.
    """
    raw = ExPASy.get_sprot_raw(prot_id)
    entry = SwissProt.read(raw)
    for xref in entry.cross_references:
        if xref[0] == 'GO' and xref[2][0] == 'P':
            print(xref[2][2:])
    
prot_func("Q0T7Q9") 

purine ribonucleoside salvage
XMP salvage


In [17]:
#Rosalind GenBank Intro
from Bio import Entrez
def gene_bank_intro(email="", genus="Bufo", dates=["2002/09/07","2011/07/18"]):
    """Print the number of nucleotide entries for a genus and date range.

    Runs ``Entrez.esearch`` on the "nucleotide" database for ``genus``
    restricted to publication dates from ``dates[0]`` to ``dates[1]``, and
    prints the "Count" field of the parsed result. ``email`` is assigned to
    ``Entrez.email`` first.
    """
    Entrez.email = email
    first_day, last_day = dates[0], dates[1]
    query = '%s[Organism] AND ("%s"[PDAT] : "%s"[PDAT])' % (genus, first_day, last_day)
    search_handle = Entrez.esearch(db="nucleotide", term=query)
    found = Entrez.read(search_handle)
    print(found["Count"])
gene_bank_intro()

535


In [25]:
#Simple Fib number calc, not optimal solution
def fib_num(a=0, b=1, c=24):
    """Advance the Fibonacci pair ``(a, b)`` by ``c`` steps; print and return ``a``.

    With the defaults this is the 24th Fibonacci number, 46368.

    The previous recursive version dropped the result of the recursive
    call, so it returned None whenever ``c`` was non-zero. This version
    iterates, which also avoids the recursion limit for large ``c``.
    """
    for _ in range(c):
        a, b = b, a + b
    print(a)
    return a
fib_num()

46368


In [49]:
#Rosalind Data Formats
from Bio import Entrez
from Bio import SeqIO
def gene_bank_intro(email="", file=""):
    """Fetch the GenBank entries listed in ``file`` and report the shortest.

    The first line of ``file`` holds whitespace-separated nucleotide IDs.
    The records are fetched as FASTA with ``Entrez.efetch`` and sorted by
    length. The shortest record is printed as a FASTA-style header plus
    sequence, and the sorted list is returned.
    """
    with open(file, "r") as f:
        # split() with no argument also strips the trailing newline, which
        # split(" ") left attached to the last ID.
        names = f.readline().split()
    Entrez.email = email
    handle = Entrez.efetch(db="nucleotide", id=names, rettype="fasta")
    try:
        records = list(SeqIO.parse(handle, "fasta"))
    finally:
        # Release the network handle even if parsing fails.
        handle.close()
    ordered = sorted(records, key=len)

    print(">",ordered[0].description)
    print(ordered[0].seq)
    return ordered
gene_bank_intro()

> JX462669.1 Belgica antarctica ribosomal protein 49 (rp49) mRNA, complete cds
ATGGCAGTTCGACCAGCATTCAAACCCAAAATCATCAAAAAGAGAACGAAGAAGTTCATCCGCCATCAGTCGGACCGATATGACAAAGTCAAGGAAGCTTGGCGCAAGCCCAAGGGTATTGACAACAGAGTCAGACGTCGCTTTAAGGGACAGTACCTGATGCCAAACATCGGCTACGGTTCCAACGCCAAGACCCGCCACATGCTCCCCAACGGCTTCAAGAAGTTCACCGTCAACAACGTCCGCGAGTTGGAGGTCTTGATGATGCAAAACCGCGTTTACTGCGCCGAGGTCGCTCACGCCGTCAGCGCCAAGAAGCGTAAGCTGATCTGCGAACGTGCTAAGCAGCTGGGAATCCGTGTGACCAACTTCCACGCAAGAATGCGATCACAGGAAAATGAGTAA


[SeqRecord(seq=Seq('ATGGCAGTTCGACCAGCATTCAAACCCAAAATCATCAAAAAGAGAACGAAGAAG...TAA', SingleLetterAlphabet()), id='JX462669.1', name='JX462669.1', description='JX462669.1 Belgica antarctica ribosomal protein 49 (rp49) mRNA, complete cds', dbxrefs=[]),
 SeqRecord(seq=Seq('CGGCGGCCTCAGACTCCTTGGGTATTTGGACCACTGCACCGAAGATACCATCTC...AAA', SingleLetterAlphabet()), id='NM_013179.2', name='NM_013179.2', description='NM_013179.2 Rattus norvegicus hypocretin neuropeptide precursor (Hcrt), mRNA', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGAAGTGCACCATTTTATTGAGTTTTTTCAGCCTGATTTGGTTTGCTGGTGGA...TGA', SingleLetterAlphabet()), id='BT149870.1', name='BT149870.1', description='BT149870.1 Drosophila melanogaster RT12737 full insert cDNA', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGCGGAACTTGTACAATCATCTCCCATTTCCGTGTCAAAAACAGAGGAACCA...TGA', SingleLetterAlphabet()), id='JX428803.1', name='JX428803.1', description='JX428803.1 Solanum tuberosum AP2 c4 mRNA, complete cds', dbxrefs=[]),
 SeqRecord(seq=Seq('AGGATTGAGAACAAGATCAAT

In [56]:
#Rosalind Fastaq-> fasta
from Bio import SeqIO
def fastaq_to_fasta(file="", out_put=""):
    """Convert the FASTQ file ``file`` to FASTA, written to ``out_put``.

    Returns the value of ``SeqIO.write``, which was previously computed and
    then discarded.
    """
    with open(file, "r") as handle:
        sequences = SeqIO.parse(handle, "fastq")
        # Write inside the with-block: the parser reads the handle lazily.
        count = SeqIO.write(sequences, out_put, "fasta")
    return count
    
fastaq_to_fasta()

In [123]:
#Rosalind SPLCfinding exons and translating them
from Bio.Seq import Seq
def find_exons_translate(file):
    """Remove the listed introns from a sequence and translate what is left.

    The first FASTA record of ``file`` is the full sequence; every later
    record is removed from it wherever it occurs. The remainder is
    translated up to the first stop codon, then printed and returned.
    """
    with open(file, "r") as f:
        records = [str(rec.seq) for rec in SeqIO.parse(f, 'fasta')]
    coding, introns = records[0], records[1:]
    for intron in introns:
        coding = coding.replace(intron, "")
    protein = Seq(coding).translate(to_stop=True)
    print(protein)
    return protein
        
find_exons_translate("/Users/mark/Downloads/ros.txt")

MVYIADKQHVASREAYGHMFKVCA


Seq('MVYIADKQHVASREAYGHMFKVCA', ExtendedIUPACProtein())

2
MVYIADKQHVASREAYGHMFKVCA


In [125]:
def answer(data, n):
    """Return the items of ``data`` that occur at most ``n`` times in it.

    The original order is kept, and an item that qualifies appears once
    for each of its occurrences.
    """
    kept = []
    for item in data:
        if data.count(item) > n:
            continue
        kept.append(item)
    return kept
            
    
answer([1,1,2,3], 1)

[2, 3]

In [173]:
1211 % 10
(2111 - 1112) % 10

9