# Task 1

In [0]:
'''
RANDOMIZEDMOTIFSEARCH(Dna, k, t)
    randomly select k-mers Motifs = (Motif1, …, Motift) in each string
        from Dna
    BestMotifs ← Motifs
    while forever
        Profile ← Profile(Motifs)
        Motifs ← Motifs(Profile, Dna)
        if Score(Motifs) < Score(BestMotifs)
            BestMotifs ← Motifs
        else
            return BestMotifs
'''

In [0]:
import random
import numpy as np

In [0]:
def RANDOMIZEDMOTIFSEARCH(Dna, k, t):
    """Run one randomized motif search and return the best motifs it reaches.

    Starts from one uniformly chosen k-mer per string, then repeatedly builds
    a pseudocount profile from the current motifs and re-selects the
    profile-most-probable k-mer in every string. The search stops at the
    first iteration that does not lower the score.

    Relies on the notebook-level ``Score(motifs, k)`` function.

    Args:
        Dna: sequence of DNA strings over the alphabet A/C/G/T.
        k: motif length.
        t: number of strings; kept for parity with the pseudocode, the
            function iterates over ``Dna`` directly and does not read it.

    Returns:
        A list holding one k-mer per string of ``Dna``.

    Raises:
        ValueError: if a string is shorter than ``k`` (no valid start index).
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    def Profile(motifs):
        """Column-wise base frequencies of ``motifs`` with Laplace pseudocounts."""
        counts = np.zeros((4, k))
        for motif in motifs:
            for col, base in enumerate(motif):
                counts[row_of[base], col] += 1
        # One pseudocount per base adds 4 to every column total, so dividing
        # by len(motifs) + 4 makes each column sum to 1.
        return (counts + 1) / (len(motifs) + 4)

    def Profile_most_Probable_k_mer(s, k, mat):
        """First k-mer of ``s`` with the highest probability under ``mat`` (4 x k)."""
        best_prob = -1.0  # below any real probability, so a k-mer is always picked
        best_kmer = ""
        for start in range(len(s) - k + 1):
            kmer = s[start:start + k]
            prob = 1.0
            for col, base in enumerate(kmer):
                prob *= mat[row_of[base], col]
            if prob > best_prob:  # strict: ties keep the leftmost k-mer
                best_prob, best_kmer = prob, kmer
        return best_kmer

    def Motifs(profile, Dna):
        """Most probable k-mer of every string under ``profile``."""
        return [Profile_most_Probable_k_mer(dna, k, profile) for dna in Dna]

    motifs = []
    for dna in Dna:
        # randrange excludes its upper bound, so the last valid start is
        # len(dna) - k; randint(0, len(dna) - k + 1) could overshoot by one
        # and produce a (k-1)-length "motif".
        start = random.randrange(len(dna) - k + 1)
        motifs.append(dna[start:start + k])

    bestMotifs = motifs
    bestScore = Score(bestMotifs, k)

    while True:
        motifs = Motifs(Profile(motifs), Dna)
        score = Score(motifs, k)
        if score < bestScore:
            bestMotifs, bestScore = motifs, score
        else:
            return bestMotifs

In [0]:
def Score(motifs,k):
      """Count the bases in `motifs` that disagree with the column consensus.

      The consensus takes the most frequent base of each of the `k` columns;
      when counts tie, the earliest base in A, C, G, T order is used.
      """
      alphabet = 'ACGT'
      row_of = {'A':0,'C':1,'G':2,'T':3}
      counts = np.zeros((4,k))
      for motif in motifs:
        for col,base in enumerate(motif):
          counts[row_of[base],col] += 1
      consensus = ''.join(alphabet[row] for row in counts.argmax(axis=0))
      return sum(
          1
          for motif in motifs
          for col,base in enumerate(motif)
          if base != consensus[col]
      )

In [0]:
# Task 1 sample input: k is the motif length, t the expected number of strings.
k=8 
t=5
# One DNA string per line; split into a list just below.
Dna='''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''
Dna=Dna.split('\n')
# Sanity check shown as the cell output: True when t matches the number of strings.
t==len(Dna)

True

In [0]:
# Repeat the randomized search from 1000 independent random starts and keep
# the lowest-scoring motif set in `best` (read by the next cell).
best=""
# Sentinel larger than any reachable score, so the first run always replaces it.
mx=100000000000
for i in range(1000):
  x = RANDOMIZEDMOTIFSEARCH(Dna, k, t)
  s = Score(x,k)
  if s<mx:
    mx=s
    best=x

In [0]:
# Print the best motif set found above, one motif per line.
for i in best:
  print(i)

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG


# Task 2

In [0]:
from random import choices

In [0]:
def Profile_random_k_mer(s,k,mat):
      """Draw one k-mer of `s` at random, weighted by its probability under `mat`.

      `mat` is indexed as mat[row, column] with rows ordered A, C, G, T.
      Each window of `s` is weighted by the product of its per-column
      entries, and a single window is sampled with `random.choices`.
      """
      row_of = {'A':0,'C':1,'G':2,'T':3}
      weights = []
      for start in range(len(s)-k+1):
        likelihood = 1.0
        for col,base in enumerate(s[start:start+k]):
          likelihood *= mat[row_of[base],col]
        weights.append(likelihood)
      # choices returns a one-element list; unpack the sampled start index.
      picked = choices(list(range(len(weights))),weights)[0]
      return s[picked:picked+k]

In [0]:
def Profile(motifs,k):
      """Build a 4 x k profile matrix from `motifs` using Laplace pseudocounts.

      Rows are ordered A, C, G, T; entry [row, col] is the pseudocounted
      frequency of that base at position `col`.

      Args:
          motifs: iterable of strings over A/C/G/T, each at most `k` long.
          k: number of profile columns.

      Returns:
          numpy array of shape (4, k). When every motif has length `k`, each
          column sums to 1. An empty `motifs` yields the uniform profile 0.25.
      """
      row_of = {'A':0,'C':1,'G':2,'T':3}
      counts = np.zeros((4,k))
      for motif in motifs:
        for col,base in enumerate(motif):
          counts[row_of[base],col] += 1
      # Adding 1 to each of the four bases raises every column total by 4, so
      # the normaliser is len(motifs) + 4. The former 2*len(motifs) only
      # matched for exactly four motifs and divided by zero for none.
      return (counts+1)/(len(motifs)+4)

In [0]:
def Score(motifs,k):
      """Return how many bases in `motifs` differ from the column consensus.

      For every column the most frequent base is the consensus, so the
      mismatches in that column are the bases present minus the largest
      count. Summed over columns, this is total bases minus the sum of the
      column maxima.
      """
      row_of = {'A':0,'C':1,'G':2,'T':3}
      counts = np.zeros((4,k))
      for motif in motifs:
        for col,base in enumerate(motif):
          counts[row_of[base],col] += 1
      # Counts are small whole numbers stored as floats, so the subtraction
      # is exact and int() only changes the type.
      return int(counts.sum() - counts.max(axis=0).sum())

In [0]:
def Gibbs(Dna, k, t, N):
  """Gibbs-sampling motif search; returns the best-scoring motif set seen.

  Starts from one uniformly chosen k-mer per string. In each of the N
  iterations one string is picked at random, a profile is built from the
  motifs of all the other strings, and the picked string's motif is
  re-sampled with probability proportional to that profile.

  Relies on the notebook-level ``Profile``, ``Profile_random_k_mer`` and
  ``Score`` functions.

  Args:
      Dna: sequence of DNA strings over the alphabet A/C/G/T.
      k: motif length.
      t: number of strings in ``Dna``; the string to re-sample is drawn
          from indices 0..t-1.
      N: number of sampling iterations.

  Returns:
      A list with one k-mer per string: the lowest-scoring motif set
      encountered, including the initial random one.

  Raises:
      ValueError: if a string is shorter than ``k`` (no valid start index).
  """
  motifs=[]
  for dna in Dna:
    # randrange excludes its upper bound, so the last valid start is
    # len(dna)-k; randint(0, len(dna)-k+1) could overshoot by one and
    # produce a (k-1)-length motif.
    start=random.randrange(len(dna)-k+1)
    motifs.append(dna[start:start+k])

  # Snapshot, not alias: `motifs` is mutated in place below, and an alias
  # would make "best" silently follow the current state.
  bestMotifs = list(motifs)
  bestScore = Score(bestMotifs,k)

  for _ in range(N):
    i=random.randrange(t)
    profile = Profile(motifs[:i]+motifs[i+1:],k)
    motifs[i]=Profile_random_k_mer(Dna[i],k,profile)
    score = Score(motifs,k)
    if score<bestScore:
      bestMotifs = list(motifs)
      bestScore = score
  return bestMotifs

In [0]:
# Task 2 sample input: motif length k, number of strings t, sampling iterations N.
k=8
t=5
N=100
# One DNA string per line, split into a list.
Dna='''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''.split('\n')
# Single Gibbs run; no seed is set, so the displayed result varies between runs.
Gibbs(Dna, k, t, N)

['GTAAACGG', 'TGTAAGTG', 'GTACCGAG', 'TTCAGGTG', 'TCCACGTG']

# Task 3

In [0]:
def kmerComposition(k,s):
  """Return all length-k substrings of `s` in lexicographic order, duplicates kept."""
  window_count = len(s)-k+1
  return sorted(s[start:start+k] for start in range(window_count))

In [0]:
# Print the sorted 5-mer composition of the sample string, one k-mer per line.
for i in kmerComposition(5,'CAATCCAAC'):
  print(i)

AATCC
ATCCA
CAATC
CCAAC
TCCAA


# Task 4

In [0]:
def genome(dnas):
  """Spell the string formed by a path of k-mers.

  The result is the first k-mer followed by the last character of every
  subsequent k-mer, in the order given.
  """
  first, rest = dnas[0], dnas[1:]
  return first + ''.join(kmer[-1] for kmer in rest)

In [0]:
# Task 4 sample input: k-mers listed in path order, one per line.
dnas = '''ACCGA
CCGAA
CGAAG
GAAGC
AAGCT'''.split('\n')
genome(dnas)

'ACCGAAGCT'

# Task 5

In [0]:
def overlap(dnas):
  """Print the overlap-graph edges of `dnas` as lines of the form "a -> b".

  An edge a -> b is printed when a without its first character equals b
  without its last character. Entries are compared by position, so an
  entry is never paired with itself.
  """
  for src_pos,src in enumerate(dnas):
    suffix = src[1:]
    for dst_pos,dst in enumerate(dnas):
      if src_pos != dst_pos and suffix == dst[:-1]:
        print(src,'->',dst)

In [0]:
# Task 5 sample input: one k-mer per line; overlap() prints the graph edges.
s = '''ATGCG
GCATG
CATGC
AGGCA
GGCAT'''.split('\n')
overlap(s)

GCATG -> CATGC
CATGC -> ATGCG
AGGCA -> GGCAT
GGCAT -> GCATG


# Task 6

In [0]:
from collections import defaultdict 

In [0]:
def De_Bruijn(k,dna):
  """Build the De Bruijn graph of the k-mers of `dna` as an adjacency mapping.

  Each length-k window contributes one edge from its first k-1 characters
  to its last k-1 characters. Returns a defaultdict(list) whose keys appear
  in order of first occurrence; repeated edges are kept.
  """
  graph = defaultdict(list)
  last_start = len(dna)-k
  for start in range(last_start+1):
    prefix = dna[start:start+k-1]
    suffix = dna[start+1:start+k]
    graph[prefix].append(suffix)
  return graph

In [0]:
# Task 6 sample input: k-mer length and the text to build the graph from.
k=4
dna='AAGATTCTCTAC'

In [0]:
# Print the De Bruijn adjacency list, one "node -> succ,succ" line per node.
# The loop variables are deliberately not named `k`: reusing `k` would
# overwrite the k-mer length with a node string and break a re-run of this cell.
for node,successors in De_Bruijn(k,dna).items():
  print(node,'->',','.join(successors))

AAG -> AGA
AGA -> GAT
GAT -> ATT
ATT -> TTC
TTC -> TCT
TCT -> CTC,CTA
CTC -> TCT
CTA -> TAC


# Task 7

In [0]:
def De_Bruijn_kmer(kmers):
  """Build a De Bruijn adjacency mapping from a collection of k-mers.

  Every k-mer adds an edge from itself minus its last character to itself
  minus its first character. Returns a defaultdict(list); duplicate k-mers
  produce duplicate edges.
  """
  graph = defaultdict(list)
  for kmer in kmers:
    prefix, suffix = kmer[:-1], kmer[1:]
    graph[prefix].append(suffix)
  return graph

In [0]:
# Task 7 sample input: one k-mer per line (CAGG appears twice on purpose or by
# construction of the sample; duplicates are kept by De_Bruijn_kmer).
dnas='''GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG'''.split('\n')

In [0]:
# Print the adjacency list built from the k-mers, one "node -> succ,succ" line
# per node. The loop variables avoid the name `k` so the notebook-level k-mer
# length is not overwritten with a node string.
for node,successors in De_Bruijn_kmer(dnas).items():
  print(node,'->',','.join(successors))

GAG -> AGG
CAG -> AGG,AGG
GGG -> GGG,GGA
AGG -> GGG
GGA -> GAG
