-
Notifications
You must be signed in to change notification settings - Fork 1
/
simplesub.py
174 lines (128 loc) · 5.42 KB
/
simplesub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/python2.7
from string import ascii_lowercase
import sys
import re
###### HELPER FUNCTIONS ######
def scrubstring(s):
"""Takes a string, s, and returns that string after conversion to lowercase
and with all characters not present in ascii_lowercase removed. Examples:
'Test!!' -> 'test'
'I'm' -> 'im'
'Don't use multiple words...' -> 'dontusemultiplewords'"""
return ''.join(ch for ch in s.lower() if ch in ascii_lowercase)
def getpatterntuple(word):
"""Takes a string and returns a tuple which retains only character
repetition info. Examples:
'word' -> (1, 2, 3, 4)
'all' -> (1, 2, 2)
'llama' -> (1, 1, 2, 3, 2)"""
word = word.lower()
letters = {}
tup = ()
for letter in word:
if letter in letters:
tup += (letters[letter],)
else:
letternum = len(letters) + 1
tup += (letternum,)
letters[letter] = letternum
return tup
def getregex(cipherword, subs):
"""Given a ciphertext word and a dictionary of ciphertext-to-plaintext
substitutions, returns a regex that matches all and only those words which,
given additionally that their letter repetition patterns are correct, could
be the plaintext. That's a tricky explanation, so here's an example:
getregex('skmms', {'m':'l'}) returns a regex which would match any five-
letter word that has Ls where 'skmms' has Ms. So for example, it would
match 'sells', 'sills', or 'balls' (even though it doesn't fit the letter
repetition pattern). The regex would not, however, match e.g. 'shoos'."""
if len(subs) > 0:
wildcard = "[^"+"".join(str(n) for n in subs.values())+"]"
else:
wildcard = "."
regex = "".join(wildcard if ch not in subs else subs[ch] for ch in cipherword)
return re.compile("^%s$"%(regex,))
def getsubs(cipherwords, guess):
"""Takes a list or tuple of ciphertext words and a list or tuple which is
the current guess, and returns a dictionary describing every substitution
assumed by this guess. Does not perform any checks to make sure that the
guess is internally consistent. Example:
getsubs(['abb'], ['add']) -> {'a':'a', 'b':'d'}"""
subs = {}
for wordind in xrange(len(guess)):
for letterind in xrange(len(guess[wordind])):
subs[cipherwords[wordind][letterind]] = guess[wordind][letterind]
return subs
def prettyprint(ciphertext, substitutions):
"""Takes a dictionary of substitutions and performs them to the original
ciphertext, allowing us to serve up a result which isn't scrubbed of all
punctuation. This string is fit to print, but the function doesn't actually
print it."""
ans = ""
for ch in ciphertext:
if ch.lower() in substitutions:
if ch in ascii_lowercase:
ans += substitutions[ch]
else:
ans += substitutions[ch.lower()].upper()
else:
ans += ch
return ans
###### BODY -- LOADING DICTIONARY FILE ######
dictfile = "/usr/share/dict/words"
patterns = {}
with open(dictfile, 'r') as f:
for line in f: # files are iterators!! python is so cool
line = scrubstring(line)
tup = getpatterntuple(line)
if tup in patterns:
# some words (e.g. its and it's) are identical after scrubbing, so
# we're better off making sure we don't add them twice
if line not in patterns[tup]:
patterns[tup].append(line)
else:
patterns[tup] = [line]
print "Dictionary file loaded."
###### BODY -- CRACKING THE CODE ######
if len(sys.argv) < 2:
ciphertext = "pf mmwpw skmms fjppf kkms" # same as in the blog's example
else:
ciphertext = ' '.join(sys.argv[1:])
print "Ciphertext: " + ciphertext
print "Cracking..."
cipherlist = [scrubstring(word) for word in ciphertext.split(' ')
if scrubstring(word) != '']
# each entry on the stack is a tuple of guessed plaintext words
# its initial entry, the empty tuple represents the search tree's root node
stack = [()]
possibilities = []
while stack:
currguess = stack.pop()
ind = len(currguess)
if ind == len(cipherlist):
# we've got a complete guess!!
possibilities.append(currguess)
if len(possibilities) >= 15:
print "Possibilities abound! We're calling the search off early."
break
continue
# our guess isn't full yet, so let's add to it however we can
# first we make a dict describing all the substitutions we're assuming thus
# far.
subs = getsubs(cipherlist, currguess)
regex = getregex(cipherlist[ind], subs)
for guess in patterns[getpatterntuple(cipherlist[ind])]:
if regex.match(guess):
stack.append(currguess + (guess,))
###### BODY -- BOASTING OF OUR GLORIOUS SUCCESS ######
if len(possibilities) == 0:
print "No answers found... Perhaps one of the plaintext words isn't in our dictionary?"
elif len(possibilities) == 1:
print "We've got it!\n"
print prettyprint(ciphertext, getsubs(cipherlist, possibilities[0]))
else:
print "Yee dawgies!"
print "We've got more possible solutions than we know what to do with."
print "Here's %s of them:\n"%(len(possibilities),)
for possibility in possibilities:
print prettyprint(ciphertext, getsubs(cipherlist, possibility))