In [171]:
import pandas as pd

df = pd.read_csv("binary_data.csv")

df.head()

Unnamed: 0,binary,zeros,class
0,089500d082e099ec909300008093000080910000909100...,75,avr
1,1fa724001d4810b066182154205072583050725810b060...,34,s390
2,61f8711884036115411189e361f87118840363e361f871...,15,sh4
3,0000017a520004781a011b0d1e00180000001800000000...,68,alphaev56
4,4100e21aa00900715663655c353d566b661f7653616d41...,14,sh4


In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
binary    1000 non-null object
zeros     1000 non-null int64
class     1000 non-null object
dtypes: int64(1), object(2)
memory usage: 23.5+ KB


In [232]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

X = df.binary
y = df["class"]
vec_opts = {
    "ngram_range": (2, 3),  # allow n-grams of 1-4 words in length (32-bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "min_df" : 0.001
}
v = CountVectorizer(**vec_opts)


In [233]:
from sklearn.feature_extraction.text import TfidfTransformer

idf_opts = {"use_idf": True}
idf = TfidfTransformer(**idf_opts)

In [234]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vec',   CountVectorizer(**vec_opts)),
    ('idf',  TfidfTransformer(**idf_opts)),
])

X = pipeline.fit_transform(X, y)

In [235]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

clf = MultinomialNB().fit(X, y)
all_predictions = clf.predict(X)


In [236]:
from sklearn.metrics import classification_report
print classification_report(y, all_predictions)

             precision    recall  f1-score   support

  alphaev56       1.00      1.00      1.00        86
        arm       1.00      1.00      1.00       105
        avr       1.00      1.00      1.00        76
       m68k       1.00      0.99      0.99        76
       mips       1.00      1.00      1.00        71
     mipsel       0.99      1.00      0.99        96
    powerpc       1.00      1.00      1.00        84
       s390       1.00      1.00      1.00        72
        sh4       1.00      0.98      0.99        92
      sparc       1.00      1.00      1.00        72
     x86_64       0.98      1.00      0.99        90
     xtensa       1.00      1.00      1.00        80

avg / total       1.00      1.00      1.00      1000



In [237]:
import requests
import logging
import base64
import time

logging.basicConfig(level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

class Server(object):
    url = 'https://mlb.praetorian.com'
    log = logging.getLogger(__name__)

    def __init__(self):
        self.session = requests.session()
        self.binary  = None
        self.hash    = None
        self.wins    = 0
        self.targets = []

    def _request(self, route, method='get', data=None):
        while True:
            try:
                if method == 'get':
                    r = self.session.get(self.url + route)
                else:
                    r = self.session.post(self.url + route, data=data)
                if r.status_code == 429:
                    raise Exception('Rate Limit Exception')
                if r.status_code == 500:
                    raise Exception('Unknown Server Exception')

                return r.json()
            except Exception as e:
                self.log.error(e)
                self.log.info('Waiting 60 seconds before next request')
                time.sleep(60)

    def get(self):
        r = self._request("/challenge")
        self.targets = r.get('target', [])
        self.binary  = base64.b64decode(r.get('binary', ''))
        return r

    def post(self, target):
        r = self._request("/solve", method="post", data={"target": target})
        self.wins = r.get('correct', 0)
        self.hash = r.get('hash', self.hash)
        self.ans  = r.get('target', 'unknown')
        return r

if __name__ == "__main__":
    import random
    s = Server()
    var = 1
    while var == 1:
        # query the /challenge endpoint
        s.get()
        # query the /challenge endpoint        
        #print binascii.hexlify(s.binary)
        
        test_binary = binascii.hexlify(s.binary)
       
        X_test = pipeline.transform([test_binary])

        test_prediction = clf.predict(X_test)
        
        s.post(test_prediction)
        
        s.log.info("Guess:[{: >9}]   Answer:[{: >9}]   Wins:[{: >3}]".format(test_prediction, s.ans, s.wins))

        # 500 consecutive correct answers are required to win
        # very very unlikely with current code
        if s.hash:
            s.log.info("You win! {}".format(s.hash))

2017-02-08 20:44:05,880 - requests.packages.urllib3.connectionpool - INFO - Starting new HTTPS connection (1): mlb.praetorian.com
2017-02-08 20:44:06,201 - __main__ - INFO - Guess:[  ['arm']]   Answer:[      arm]   Wins:[  1]
2017-02-08 20:44:06,344 - __main__ - INFO - Guess:[  ['avr']]   Answer:[      avr]   Wins:[  2]
2017-02-08 20:44:06,510 - __main__ - INFO - Guess:[['powerpc']]   Answer:[  powerpc]   Wins:[  3]
2017-02-08 20:44:06,627 - __main__ - INFO - Guess:[ ['s390']]   Answer:[     s390]   Wins:[  4]
2017-02-08 20:44:06,747 - __main__ - INFO - Guess:[['alphaev56']]   Answer:[alphaev56]   Wins:[  5]
2017-02-08 20:44:06,881 - __main__ - INFO - Guess:[ ['s390']]   Answer:[     s390]   Wins:[  6]
2017-02-08 20:44:06,997 - __main__ - INFO - Guess:[  ['avr']]   Answer:[      avr]   Wins:[  7]
2017-02-08 20:44:07,114 - __main__ - INFO - Guess:[['x86_64']]   Answer:[   x86_64]   Wins:[  8]
2017-02-08 20:44:07,295 - __main__ - INFO - Guess:[['alphaev56']]   Answer:[alphaev56]   Wins:[

KeyboardInterrupt: 