In [89]:
import requests
import logging
import base64
import time

logging.basicConfig(level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

class Server(object):
    url = 'https://mlb.praetorian.com'
    log = logging.getLogger(__name__)

    def __init__(self):
        self.session = requests.session()
        self.binary  = None
        self.hash    = None
        self.wins    = 0
        self.targets = []

    def _request(self, route, method='get', data=None):
        while True:
            try:
                if method == 'get':
                    r = self.session.get(self.url + route)
                else:
                    r = self.session.post(self.url + route, data=data)
                if r.status_code == 429:
                    raise Exception('Rate Limit Exception')
                if r.status_code == 500:
                    raise Exception('Unknown Server Exception')

                return r.json()
            except Exception as e:
                self.log.error(e)
                self.log.info('Waiting 60 seconds before next request')
                time.sleep(60)

    def get(self):
        r = self._request("/challenge")
        self.targets = r.get('target', [])
        self.binary  = base64.b64decode(r.get('binary', ''))
        return r

    def post(self, target):
        r = self._request("/solve", method="post", data={"target": target})
        self.wins = r.get('correct', 0)
        self.hash = r.get('hash', self.hash)
        self.ans  = r.get('target', 'unknown')
        return r

if __name__ == "__main__":
    import random

    # create the server object
    s = Server()
    hex_strings = []
    #target_list = []
    answer = []
    for _ in range(1000):
        # query the /challenge endpoint
        s.get()
        base64_string = s.binary
        hex_strings.append(binascii.hexlify(s.binary))
        #target_list.append(s.targets)
        #print s.targets
        # choose a random target and /solve
        target = random.choice(s.targets)
        
        s.post(target)
        
        answer.append(s.ans)

        s.log.info("Guess:[{: >9}]   Answer:[{: >9}]   Wins:[{: >3}]".format(target, s.ans, s.wins))

        # 500 consecutive correct answers are required to win
        # very very unlikely with current code
        if s.hash:
            s.log.info("You win! {}".format(s.hash))

2017-02-08 18:34:26,790 - requests.packages.urllib3.connectionpool - INFO - Starting new HTTPS connection (1): mlb.praetorian.com
2017-02-08 18:34:27,063 - __main__ - INFO - Guess:[  powerpc]   Answer:[      avr]   Wins:[  0]
2017-02-08 18:34:27,243 - __main__ - INFO - Guess:[      arm]   Answer:[     s390]   Wins:[  0]
2017-02-08 18:34:27,487 - __main__ - INFO - Guess:[alphaev56]   Answer:[      sh4]   Wins:[  0]
2017-02-08 18:34:27,598 - __main__ - INFO - Guess:[   x86_64]   Answer:[alphaev56]   Wins:[  0]
2017-02-08 18:34:27,807 - __main__ - INFO - Guess:[     s390]   Answer:[      sh4]   Wins:[  0]
2017-02-08 18:34:27,928 - __main__ - INFO - Guess:[      sh4]   Answer:[alphaev56]   Wins:[  0]
2017-02-08 18:34:28,066 - __main__ - INFO - Guess:[   mipsel]   Answer:[   mipsel]   Wins:[  1]
2017-02-08 18:34:28,268 - __main__ - INFO - Guess:[   xtensa]   Answer:[   x86_64]   Wins:[  1]
2017-02-08 18:34:28,400 - __main__ - INFO - Guess:[   x86_64]   Answer:[      avr]   Wins:[  1]
2017-0

In [102]:
import pandas as pd

df = pd.DataFrame()
df.insert(0, "binary", hex_strings)
df.insert(1, "class", answer)
df.head()


Unnamed: 0,binary,class
0,089500d082e099ec909300008093000080910000909100...,avr
1,1fa724001d4810b066182154205072583050725810b060...,s390
2,61f8711884036115411189e361f87118840363e361f871...,sh4
3,0000017a520004781a011b0d1e00180000001800000000...,alphaev56
4,4100e21aa00900715663655c353d566b661f7653616d41...,sh4


In [103]:
binary_size = []

for i in df.binary:
    binary_size.append(i.count('0'))
    
df.insert(1, "zeros", binary_size)
df.to_csv("binary_data.csv", index=False)
df.head()


Unnamed: 0,binary,zeros,class
0,089500d082e099ec909300008093000080910000909100...,75,avr
1,1fa724001d4810b066182154205072583050725810b060...,34,s390
2,61f8711884036115411189e361f87118840363e361f871...,15,sh4
3,0000017a520004781a011b0d1e00180000001800000000...,68,alphaev56
4,4100e21aa00900715663655c353d566b661f7653616d41...,14,sh4


In [115]:
from sklearn.feature_extraction.text import CountVectorizer

vec_opts = {
    "ngram_range": (2, 3),  # allow n-grams of 1-4 words in length (32-bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
}
v = CountVectorizer(**vec_opts)
X = v.fit_transform(hex_train, target_train)

for feature, freq in zip(v.inverse_transform(X)[0], X.A[0]):
    print("'%s' : %s" % (feature, freq))

'00 90 93' : 14
'90 91 00' : 0
'00 90 91' : 0
'00 00 90' : 0
'91 00 00' : 0
'80 91 00' : 0
'00 80 91' : 0
'80 93 00' : 0
'00 80 93' : 0
'00 00 80' : 0
'93 00 00' : 0
'90 93 00' : 0
'ec 90 93' : 0
'99 ec 90' : 0
'e0 99 ec' : 0
'82 e0 99' : 0
'd0 82 e0' : 0
'00 d0 82' : 0
'95 00 d0' : 0
'08 95 00' : 0
'90 91' : 0
'00 90' : 0
'91 00' : 0
'80 91' : 0
'80 93' : 0
'00 80' : 0
'00 00' : 0
'93 00' : 0
'90 93' : 0
'ec 90' : 0
'99 ec' : 0
'e0 99' : 0
'82 e0' : 0
'd0 82' : 0
'00 d0' : 0
'95 00' : 0
'08 95' : 0


In [81]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vec',   CountVectorizer(**vec_opts)),
    ('idf',  TfidfTransformer(**idf_opts)),
])

X = pipeline.fit_transform(hex_train, target_train)