-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path187 Repeated DNA Sequences.py
83 lines (65 loc) · 1.84 KB
/
187 Repeated DNA Sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying
DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",
Return:
["AAAAACCCCC", "CCCCCAAAAA"].
Rajeev Ranjan
"""
class Solution:
def findRepeatedDnaSequences(self, s):
"""
Limited space of possible values --> rewrite hash function
Rolling hash
"A": 0 (00)
"C": 1 (01)
"G": 2 (10)
"T": 3 (11)
:type s: str
:rtype: list[str]
"""
if len(s) < 10:
return []
s = map(self.mapping, list(s))
h = set()
# in_ret = set()
ret = set()
cur = 0
for i in xrange(10):
cur <<= 2
cur &= 0xFFFFF
cur += s[i]
h.add(cur)
for i in xrange(10, len(s)):
cur <<= 2
cur &= 0xFFFFF # 10 * 2 = 20 position
cur += s[i]
if cur in h and cur not in ret:
ret.add(cur)
else:
h.add(cur)
return map(self.decode, ret)
def decode(self, s):
dic = {
0: "A",
1: "C",
2: "G",
3: "T"
}
ret = []
for i in xrange(10):
ret.append(dic[s%4])
s >>= 2
return "".join(reversed(ret))
def mapping(self, a):
dic = {
"A": 0,
"C": 1,
"G": 2,
"T": 3,
}
return dic[a]
if __name__ == "__main__":
assert Solution().findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT") == ['CCCCCAAAAA', 'AAAAACCCCC']