<a href="https://colab.research.google.com/github/zephyris/discoba_alphafold/blob/add-license-1/DiscobaMMSeqs2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Generate a3m alignment using MMSeqs2 and a diverse set of Discoba species
#@markdown Ideal for generating an alignment for AlphaFold2 for kinetoplastid species like _Leishmania_ and _Trypanosoma_.

#@markdown Enter the query name and sequence, then select "Runtime > Run all".
from google.colab import files
import re
import hashlib

query_name = 'Tb927.8.4970_PFR2' #@param {type:"string"}
query_sequence = 'MLGTVDAIDYDGDRLHKVVLRFPAVRSGESEIVKEVWPCERIGQGSFGTVYRAVSSDYPRLALKISTGKSTRLRQELDVLSRVCTKGRLLLPRFEFGALNKTADLIVIGMELCVPSTLHDLLLSTRITSEAEMLFMAHQAVQAVSYVHAEGCIHRDIKLQNFVFDLDGNLKLIDFGLACNSLKPPAGDVVAGTVSFMSPEMAHNALHKDRRVSVGVAADVWSLGIVLFSIFTQRNPYPAPETPAPAAGSTPGGAGAAGVTGRGDITHGAEGEKGNDLSQQHRMNERLLRRVAAGDWQWPVGVTVSQDLKQLVNSILVVNPEERPSVSTILENKLWNLRRRYPPAAVAAFLGVQDDFLLSHDEAHLMRAVEERSAGVAASLRNSRLHSPASASNEDNGETDAQHSSSGSARNGGLNTSGATTSPARSSLKVVQRCGEGGIDGAVTVQVYDVRASTRKRSKPIREISVVMAEETAKTRRSKSARRATGAVSAPSSRVVSRAASTEYSRRIAPPAGARLQSSAAHTCANSGDDEGEAEAEAVNRGTSTSQRHSRGMSPVRLQDALETAGSVAAGQRDSVGHPKALVEPSCATPPLLTSGKQQPLSEHMRPDIQRSGSVELLEDAEAPTEASAMPTSHKRAASSGKKRRDASLRQPSSLILKGSTRDLSADAPRSTTTAASTQASTSLLASRTTLPLSAVNPSPSSSRQASLRRQASASAAAVSSAQGCAGHRGSSPVMKRAQRVALELGLDVIWHDEADHRRALSAMLLIEHAWLLASFRLTIEEDQERYSITWLAEEQEKSAAHPHRFKEVMQVMSKKYQYGFVCDMCDYEFLPTGPGEKDLHFFHCPCGRDLCPDCYTAYQRQCTCSCCRAVHSNSCVLREHLLLTGGTQYYSGSRKTNAAARADAVRGSFQAAASLNEEAESGDEASAPPEPPRRRGRPPKQDKNRSAVKQKGSRAAKDSSRRRRGAQDTLDVSVDDAHEVEQINLPRISIAAMQQQEERSSNGSHRGGGTAAVGVAPRPQRPEDVEVKQRPVESVPEGPWRPFARFKKDRRDEVAQQPTPEERDALLNGEWIRHFYLFPQAEPERVAASGTWAEGEEEPYAFVYHAQPGRTGAIFLTSDFPMHSAVFSMLERQFFVVNQVDTVEGVDSTRATSLLKAKGHPELRIAFHALQDIVAYDTNMMKQQRTPGTVSVYQAPRSAYSCNGEPFLYVRWFRFNENRTLSAFLLSNGAVQVFVNNEYELRWFDESRKFLIRYNGVCELVDDGTFALAPGINHLLYDSFDA' #@param {type:"string"}

def add_hash(x,y):
  """Return x followed by "_" and the first five hex digits of the SHA-1 of y.

  x is the label to extend; y is the string whose SHA-1 digest supplies the
  suffix, so the same (x, y) pair always yields the same result.
  """
  digest_prefix = hashlib.sha1(y.encode()).hexdigest()[:5]
  return "_".join((x, digest_prefix))

# Sanitise the user-supplied name: drop whitespace, then every other non-word character
compact_name = "".join(query_name.split())
word_only_name = re.sub(r'\W+', '', compact_name)
# Suffix a short hash of the sequence so the name is tied to this exact sequence
query_name = add_hash(word_only_name, query_sequence)

# Write the query as a single-record FASTA file named after the sanitised query name
fasta_path = f"{query_name}.fasta"
with open(fasta_path, "w") as fasta_handle:
    fasta_handle.write(">1\n" + query_sequence)

#@markdown Searches take several minutes. The first run will set up the MMSeqs2 software and download a large sequence database so will take a little longer.

#@markdown Information about citing MMseqs2 is available on their [GitHub page](https://github.com/soedinglab/MMseqs2).

#@markdown All sequences and sequencing data used in this database are publicly available, but the raw data deserves proper citation and building this database is a significant time investment. Please watch this space for full details and how to credit this resource.

In [None]:
#@title Install MMseqs2, fetch and build Discoba database
#@markdown Downloads a database from wheelerlab.net built from various publicly available non-Uniprot sources by Richard Wheeler.

#@markdown The majority of these sequences are not in the UniRef100 database, which is commonly used by online database search/alignment tools. Many are derived from raw sequencing data and are not (to my knowledge) found in any searchable databases.

#@markdown The database is currently being periodically updated. You can delete the "DISCOBA_READY" file using the "Files" interface on the left and re-run this section to update the database.

%%bash -s
#Install MMseqs2, for search and alignment
#MMSEQSLOC_READY is a sentinel file: it is only written once both the download
#and the unpack succeed, so a failed attempt is retried on the next run
if [ ! -f MMSEQSLOC_READY ]; then
  #-f makes curl fail on HTTP errors instead of saving the error page as the tarball
  if curl -f -s -L -o mmseqs-linux-avx2.tar.gz https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz \
     && tar -xf mmseqs-linux-avx2.tar.gz; then
    touch MMSEQSLOC_READY
  else
    echo "ERROR: failed to download or unpack MMseqs2" >&2
    exit 1
  fi
fi

#Install CD-HIT, for reducing complexity of the database
#if [ ! -f CDHIT_READY ]; then
#  git clone https://github.com/weizhongli/cdhit
#  cd cdhit
#  make
#  cd ..
#  touch CDHIT_READY
#fi

#Download the custom Discoba database
#DISCOBA_READY is a sentinel file: delete it to force a fresh download. It is only
#written once download, unpack and indexing have all succeeded.
if [ ! -f DISCOBA_READY ]; then
  #Always start from an empty directory so stale files cannot be mixed in
  rm -rf discoba
  mkdir discoba
  cd discoba
    #Print the database statistics to the cell output (informational, so best-effort)
    curl -f -s -L http://wheelerlab.net/discobaStats.txt || echo "WARNING: could not fetch database statistics" >&2
    #Optional complexity reduction step, currently disabled:
    #cdhit/cd-hit-est -i discoba.full.fasta -o discoba.fasta
    #Fetch, unpack and index the sequence database, stopping at the first failure
    if curl -f -s -L -o discoba.fasta.gz http://wheelerlab.net/discoba.fasta.gz \
       && gzip -d discoba.fasta.gz \
       && ../mmseqs/bin/mmseqs createdb discoba.fasta discoba -v 0; then
      cd ..
      touch DISCOBA_READY
    else
      echo "ERROR: failed to download or build the Discoba database" >&2
      exit 1
    fi
fi

In [None]:
#@title Run MMseqs2 to get Discoba MSA
#@markdown It is normal for this to take several minutes.
%%bash -s $query_name
#Paths are relative to the per-query working directory mmseqs_$NAME entered below
MMSEQS=../mmseqs/bin/mmseqs
DISCOBA=../discoba/discoba
NAME=$1
#Abort on the first failing command so a partial result is never marked as complete
set -e
if [ -z "$NAME" ]; then
  echo "ERROR: no query name given; run the first cell to define query_name" >&2
  exit 1
fi
#$NAME.mmseqs is a sentinel file in the starting directory: while it exists the
#search is skipped. Delete it to force the search to run again.
if [ ! -f "$NAME.mmseqs" ]; then
  #Start from a clean working directory
  rm -rf "mmseqs_$NAME"
  mkdir "mmseqs_$NAME"
  cd "mmseqs_$NAME"
  cp "../$NAME.fasta" query.fasta
  $MMSEQS createdb query.fasta query -v 1
  echo "Searching database"
  $MMSEQS search query $DISCOBA search tmp --num-iterations 2 -v 1
  echo "Aligning hits"
  $MMSEQS align query $DISCOBA search align -a -v 1
  echo "Formatting result"
  $MMSEQS convertalis query $DISCOBA align query.tab --format-output target,qlen,qstart,qend,tstart,tend,tseq,cigar,taln -v 0
  #Return to the starting directory so the sentinel is created where the check above looks for it
  cd ..
  touch "$NAME.mmseqs"
fi

In [None]:
#@title Reformat MMseqs2 alignment table to .a3m
def _hit_to_a3m_row(hit_fields):
  """Build one a3m alignment row from a split line of query.tab.

  hit_fields holds the tab-separated columns in the order requested from
  convertalis: target, qlen, qstart, qend, tstart, tend, tseq, cigar, taln.
  Returns the row as a string: leading "-" padding up to qstart, the
  CIGAR-driven body, then trailing "-" padding from qend to qlen.
  """
  query_len = int(hit_fields[1])
  query_start = int(hit_fields[2])
  query_end = int(hit_fields[3])
  target_seq = hit_fields[6]
  #tstart is 1-based in the table; convert to a 0-based index into tseq
  target_index = int(hit_fields[4])-1
  #Pad start of alignment
  row_parts = ["-"*(query_start-1)]
  #Walk the CIGAR as (run length, operation) pairs
  for run_length, operation in re.findall(r'(\d+)([MDI])?', hit_fields[7]):
    run_length = int(run_length)
    if operation == "M":
      #Copy target residues as upper case
      row_parts.append(target_seq[target_index:target_index+run_length].upper())
      target_index += run_length
    elif operation == "D":
      #Copy target residues as lower case
      row_parts.append(target_seq[target_index:target_index+run_length].lower())
      target_index += run_length
    elif operation == "I":
      #Emit gap characters without consuming any target residues
      row_parts.append("-"*run_length)
  #Pad end of alignment
  row_parts.append("-"*(query_len-query_end))
  return "".join(row_parts)

mmseq_count=0
#Context managers guarantee both files are closed, even if a line fails to parse
with open("mmseqs_"+query_name+"/query.tab", "r") as mmseq_tab, open(query_name+".a3m", "w") as mmseq_out:
  for mmseq_line in mmseq_tab:
    mmseq_line=mmseq_line.replace("\r", "").replace("\n", "")
    if not mmseq_line:
      #Ignore blank lines rather than failing on the missing columns
      continue
    mmseq_fields=mmseq_line.split("\t")
    mmseq_count+=1
    #Write the hit as a header line (target identifier) followed by its aligned row
    mmseq_out.write(">%s\n" % mmseq_fields[0])
    mmseq_out.write("%s\n" % _hit_to_a3m_row(mmseq_fields))
print("%d sequences found" % mmseq_count)

In [None]:
#@title Download the a3m file
#@markdown This a3m file can be used as a standalone alignment for your favourite AlphaFold2 implementation.
#@markdown It uses MMSeqs2 for alignment, as used in [this colab notebook](https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb).

#@markdown In my experience it can, but does not always, give significantly better results than the [AlphaFold](https://alphafold.ebi.ac.uk/) database for _L. infantum_ and _T. cruzi_ genes.

#@markdown The content of this a3m can be appended to an a3m from searching a different database, so long as the same input sequence was used.
# Hand the finished alignment file to the browser for download
files.download("{}.a3m".format(query_name))