In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import hashlib
from collections import defaultdict

In [2]:
seqlist = list(SeqIO.parse('dedup_test.fasta', 'fasta'))
print(seqlist)

[SeqRecord(seq=Seq('GGAAGGATCATTACCACACCTAAAAAACTTTTCACGTGAACCGTATCAACCCTT...CAT', SingleLetterAlphabet()), id='M01157:20:000000000-D07KA:1:1101:16943:1488', name='M01157:20:000000000-D07KA:1:1101:16943:1488', description='M01157:20:000000000-D07KA:1:1101:16943:1488 1:N:0:95', dbxrefs=[]), SeqRecord(seq=Seq('ATGATGATG', SingleLetterAlphabet()), id='DupliAbundance2', name='DupliAbundance2', description='DupliAbundance2', dbxrefs=[]), SeqRecord(seq=Seq('ATGATGATG', SingleLetterAlphabet()), id='DupliAB', name='DupliAB', description='DupliAB', dbxrefs=[]), SeqRecord(seq=Seq('ATGATGATGA', SingleLetterAlphabet()), id='singleton', name='singleton', description='singleton', dbxrefs=[]), SeqRecord(seq=Seq('TGATGATG', SingleLetterAlphabet()), id='singleton', name='singleton', description='singleton', dbxrefs=[]), SeqRecord(seq=Seq('TGATGATGTTTTTT', SingleLetterAlphabet()), id='doubletA', name='doubletA', description='doubletA', dbxrefs=[]), SeqRecord(seq=Seq('TGATGATGTTTTTT', SingleLetterAlphabe

In [3]:
for seq in seqlist:
    print(seq.id, hashlib.md5(str(seq.seq).encode()).hexdigest())

M01157:20:000000000-D07KA:1:1101:16943:1488 a9d736e0b64dc1eec21a881446210278
DupliAbundance2 c0b61c500a7ed84b735b2481734378a4
DupliAB c0b61c500a7ed84b735b2481734378a4
singleton 1cde55f05f3353288f527ebf82ed7ff3
singleton 7dea5e0207ba6e3e3b6907a5e80167ac
doubletA 447a8b3738df4b18c31cb0defd0c0d46
doubletAB 447a8b3738df4b18c31cb0defd0c0d46
TripletA f8c88ae55c6e2f5316dc09a7b5834616
SingletonNestedInTriplet 040711de850e74890e9eb1da8970fc7a
TripletAB f8c88ae55c6e2f5316dc09a7b5834616
TripletAC f8c88ae55c6e2f5316dc09a7b5834616
Quartet_1 601093ec2184835002d2929728936859
Quartet_2 601093ec2184835002d2929728936859
Quartet_3 601093ec2184835002d2929728936859
Quartet_4 601093ec2184835002d2929728936859


In [4]:
hashlist = [hashlib.md5(str(seq.seq).encode()).hexdigest() for seq in seqlist]
print(hashlist)

['a9d736e0b64dc1eec21a881446210278', 'c0b61c500a7ed84b735b2481734378a4', 'c0b61c500a7ed84b735b2481734378a4', '1cde55f05f3353288f527ebf82ed7ff3', '7dea5e0207ba6e3e3b6907a5e80167ac', '447a8b3738df4b18c31cb0defd0c0d46', '447a8b3738df4b18c31cb0defd0c0d46', 'f8c88ae55c6e2f5316dc09a7b5834616', '040711de850e74890e9eb1da8970fc7a', 'f8c88ae55c6e2f5316dc09a7b5834616', 'f8c88ae55c6e2f5316dc09a7b5834616', '601093ec2184835002d2929728936859', '601093ec2184835002d2929728936859', '601093ec2184835002d2929728936859', '601093ec2184835002d2929728936859']


In [5]:
unique = set(hashlist)
print(unique, len(unique))

{'f8c88ae55c6e2f5316dc09a7b5834616', '447a8b3738df4b18c31cb0defd0c0d46', 'a9d736e0b64dc1eec21a881446210278', '7dea5e0207ba6e3e3b6907a5e80167ac', '601093ec2184835002d2929728936859', '040711de850e74890e9eb1da8970fc7a', 'c0b61c500a7ed84b735b2481734378a4', '1cde55f05f3353288f527ebf82ed7ff3'} 8


In [11]:
abundance = defaultdict(int)
hash_to_seq = defaultdict(str)
hash_to_name = defaultdict(str)
for seq in seqlist:
    seqhash = hashlib.md5(str(seq.seq).encode()).hexdigest()
    abundance[seqhash] += 1
    hash_to_seq[seqhash] = seq.seq
    hash_to_name[seqhash] = seq.id
    print("{0}:\t\t\t{1}".format(seq.id, seqhash))

M01157:20:000000000-D07KA:1:1101:16943:1488:			a9d736e0b64dc1eec21a881446210278
DupliAbundance2:			c0b61c500a7ed84b735b2481734378a4
DupliAB:			c0b61c500a7ed84b735b2481734378a4
singleton:			1cde55f05f3353288f527ebf82ed7ff3
singleton:			7dea5e0207ba6e3e3b6907a5e80167ac
doubletA:			447a8b3738df4b18c31cb0defd0c0d46
doubletAB:			447a8b3738df4b18c31cb0defd0c0d46
TripletA:			f8c88ae55c6e2f5316dc09a7b5834616
SingletonNestedInTriplet:			040711de850e74890e9eb1da8970fc7a
TripletAB:			f8c88ae55c6e2f5316dc09a7b5834616
TripletAC:			f8c88ae55c6e2f5316dc09a7b5834616
Quartet_1:			601093ec2184835002d2929728936859
Quartet_2:			601093ec2184835002d2929728936859
Quartet_3:			601093ec2184835002d2929728936859
Quartet_4:			601093ec2184835002d2929728936859


In [7]:
abundance

defaultdict(int,
            {'040711de850e74890e9eb1da8970fc7a': 1,
             '1cde55f05f3353288f527ebf82ed7ff3': 1,
             '447a8b3738df4b18c31cb0defd0c0d46': 2,
             '601093ec2184835002d2929728936859': 4,
             '7dea5e0207ba6e3e3b6907a5e80167ac': 1,
             'a9d736e0b64dc1eec21a881446210278': 1,
             'c0b61c500a7ed84b735b2481734378a4': 2,
             'f8c88ae55c6e2f5316dc09a7b5834616': 3})

In [8]:
hash_to_seq

defaultdict(str,
            {'040711de850e74890e9eb1da8970fc7a': Seq('TGATGATGTTTTTTAAAAAAT', SingleLetterAlphabet()),
             '1cde55f05f3353288f527ebf82ed7ff3': Seq('ATGATGATGA', SingleLetterAlphabet()),
             '447a8b3738df4b18c31cb0defd0c0d46': Seq('TGATGATGTTTTTT', SingleLetterAlphabet()),
             '601093ec2184835002d2929728936859': Seq('GGGGGTCTTGCTTTTTTTGCGAGCCCTATCATGGCGAATGTTTGGACTTCGGTCTGGGCT', SingleLetterAlphabet()),
             '7dea5e0207ba6e3e3b6907a5e80167ac': Seq('TGATGATG', SingleLetterAlphabet()),
             'a9d736e0b64dc1eec21a881446210278': Seq('GGAAGGATCATTACCACACCTAAAAAACTTTTCACGTGAACCGTATCAACCCTT...CAT', SingleLetterAlphabet()),
             'c0b61c500a7ed84b735b2481734378a4': Seq('ATGATGATG', SingleLetterAlphabet()),
             'f8c88ae55c6e2f5316dc09a7b5834616': Seq('TGATGATGTTTTTTAAAAAA', SingleLetterAlphabet())})

In [12]:
for k, v in abundance.items():
    seqname = "{0}_{1}".format(k,v)
    seqrecord = SeqRecord(id=seqname, name=hash_to_name[k], seq=hash_to_seq[k])
    print(seqrecord)

ID: 7dea5e0207ba6e3e3b6907a5e80167ac_1
Name: singleton
Description: <unknown description>
Number of features: 0
Seq('TGATGATG', SingleLetterAlphabet())
ID: 601093ec2184835002d2929728936859_4
Name: Quartet_4
Description: <unknown description>
Number of features: 0
Seq('GGGGGTCTTGCTTTTTTTGCGAGCCCTATCATGGCGAATGTTTGGACTTCGGTCTGGGCT', SingleLetterAlphabet())
ID: f8c88ae55c6e2f5316dc09a7b5834616_3
Name: TripletAC
Description: <unknown description>
Number of features: 0
Seq('TGATGATGTTTTTTAAAAAA', SingleLetterAlphabet())
ID: c0b61c500a7ed84b735b2481734378a4_2
Name: DupliAB
Description: <unknown description>
Number of features: 0
Seq('ATGATGATG', SingleLetterAlphabet())
ID: 1cde55f05f3353288f527ebf82ed7ff3_1
Name: singleton
Description: <unknown description>
Number of features: 0
Seq('ATGATGATGA', SingleLetterAlphabet())
ID: 040711de850e74890e9eb1da8970fc7a_1
Name: SingletonNestedInTriplet
Description: <unknown description>
Number of features: 0
Seq('TGATGATGTTTTTTAAAAAAT', SingleLetterAlphabe