In [60]:
import numpy as np
import pandas as pd
import glob

In [61]:
# Maps each language name to the file extensions recognised for it.
# get_ext() does a case-sensitive membership test against these lists.
# NOTE(review): "Python2" is the only capitalised entry -- confirm it matches
# the file names actually present on disk.
file_ext = {"C": ["gcc", "c", "h"],
            "C#": ["csharp"],
            "Clojure": ["clj", "cljs", "edn", "clojure"],
            "Common Lisp": ["sbcl"],
            "Haskell": ["hs", "lhs", "ghc"],
            "Java": ["java", "class", "jar"],
            "Javascript": ["js", "javascript"],
            "OCaml": ["ocaml", "ml"],
            "Perl": ["pl", "pm", "t", "pod", "perl"],
            "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
            "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
            "Ruby": ["rb", "rbw", "jruby", "yarv"],
            "Scala": ["scala"],
            "Scheme": ["scm", "ss", "racket"],
            "Tcl": ["tcl"]}

In [62]:
def read_bench_files():
    """Load every recognised benchmark source file together with its language.

    Globs ``benchmarksgame/benchmarksgame/bench/*/*.*``, maps the text after
    the last ``.`` of each path to a language via ``get_ext`` and reads the
    file's contents.

    Returns:
        list of ``(source_text, language)`` tuples, one per file whose
        extension ``get_ext`` recognises.
    """
    texts = []
    for path in glob.glob("benchmarksgame/benchmarksgame/bench/*/*.*"):
        language = get_ext(path.split(".")[-1])
        # Decide before opening: files with an unknown extension are skipped
        # without ever acquiring a file handle.
        if language is None:
            continue
        with open(path) as fh:
            texts.append((fh.read(), language))
    return texts

In [63]:
def get_ext(ext):
    """Return the language whose extension list in ``file_ext`` contains ``ext``.

    The first matching entry in dictionary iteration order wins; ``None`` is
    returned when no list contains ``ext``.
    """
    matches = (language
               for language, extensions in file_ext.items()
               if ext in extensions)
    return next(matches, None)

In [64]:
# Build the labelled corpus in one step: one row per (source text, language)
# pair returned by read_bench_files().
data = pd.DataFrame(read_bench_files(), columns=["Code", "Language"])
data.head()

Unnamed: 0,Code,Language
0,"/*\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...",C
1,"/*\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...",C
2,"/*\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...",C
3,;; The Computer Language Benchmarks Game\n;; h...,Clojure
4,;; The Computer Language Benchmarks Game\n;; h...,Clojure


In [65]:
# Number of samples per language label.
data["Language"].value_counts()

Ruby           73
C              61
PHP            55
Java           51
Scala          43
C#             41
Clojure        38
Python         36
Common Lisp    34
OCaml          34
Perl           34
Haskell        33
Scheme         29
Javascript     25
dtype: int64

In [66]:
# Target vector: the language label of every sample.
y = data["Language"]
y.head()

0          C
1          C
2          C
3    Clojure
4    Clojure
Name: Language, dtype: object

In [83]:
# Keep X as a one-column DataFrame (not a Series): CodeVectorizer.transform
# reads the samples through X["Code"].
X = data[["Code"]]
X.head()

Unnamed: 0,Code
0,"/*\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:..."
1,"/*\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04..."
2,"/*\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04..."
3,;; The Computer Language Benchmarks Game\n;; h...
4,;; The Computer Language Benchmarks Game\n;; h...


In [84]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 40% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

In [85]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin

In [86]:
import re

In [92]:
def char_count(char, code):
    """Count the non-overlapping occurrences of ``char`` within ``code``."""
    occurrences = code.count(char)
    return occurrences

In [101]:
def char_percent(char, code):
    """Return the occurrences of ``char`` in ``code`` divided by ``len(code)``.

    Despite the name, the result is a plain ratio and is not scaled by 100.

    Args:
        char: substring to count (``code.count`` semantics, non-overlapping).
        code: text to search.

    Returns:
        ``code.count(char) / len(code)``, or ``0.0`` when ``code`` is empty.
    """
    total = len(code)
    if total == 0:
        # An empty sample has no characters at all; report a zero ratio
        # instead of raising ZeroDivisionError.
        return 0.0
    return code.count(char) / total

In [102]:
def string_count(string, code):
    """Count the non-overlapping matches of the pattern ``string`` in ``code``.

    ``string`` is handed to ``re`` unescaped, so it is interpreted as a regular
    expression. Matches are not anchored to word boundaries: "int" is also
    counted inside "print".
    """
    return sum(1 for _ in re.finditer(string, code))

In [103]:
class CodeVectorizer(TransformerMixin):
    """Transformer that turns source-code text into numeric features.

    For every sample it emits one column per keyword (match count from
    ``string_count``) and one column per symbol (ratio from ``char_percent``).
    Nothing is learned in ``fit``.
    """

    def __init__(self):
        # Patterns counted with string_count().
        self.keywords = ["public", "private", "static", "if", "else", "elif", "def", "void", "int",
                         "float", "for", "while", "import", "define", "function", "return", "format",
                         "and", "var", "loop", "array", "local"]
        # Single characters measured with char_percent().
        self.symbols = [":", ";", "{", "}", "(", ")", "#", "[", "]", ","]

    def fit(self, X, y=None):
        """No-op; returns ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Compute the feature table for the samples in ``X["Code"]``.

        Args:
            X: object indexable by ``"Code"`` that yields one source string per
                sample when iterated.

        Returns:
            pandas.DataFrame built from one feature dict per sample, with a
            column for each keyword and each symbol.
        """
        feature_list = []
        for code in X["Code"]:
            # Read the vocabularies from the instance: bare `keywords` /
            # `symbols` only resolved if same-named globals happened to exist.
            features = {keyword: string_count(keyword, code)
                        for keyword in self.keywords}
            features.update((symbol, char_percent(symbol, code))
                            for symbol in self.symbols)
            feature_list.append(features)
        return pd.DataFrame(feature_list)

In [104]:
# Show the first rows of `data`.
data.head()

Unnamed: 0,Code,Language
0,"/*\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...",C
1,"/*\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...",C
2,"/*\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...",C
3,;; The Computer Language Benchmarks Game\n;; h...,Clojure
4,;; The Computer Language Benchmarks Game\n;; h...,Clojure


In [105]:
# Try the vectoriser on the full corpus and preview the resulting features.
# fit() returns the vectoriser itself, so the calls can be chained.
cv = CodeVectorizer()
cv.fit(data).transform(data).head()

Unnamed: 0,#,(,),",",:,;,[,],and,array,...,loop,private,public,return,static,var,void,while,{,}
0,0.004087,0.017938,0.017938,0.010899,0.002044,0.020209,0.002271,0.002271,0,0,...,0,0,0,15,1,0,1,5,0.006585,0.006585
1,0.00447,0.020019,0.020019,0.011273,0.001944,0.018465,0.002332,0.002332,0,0,...,0,0,0,18,1,0,1,6,0.006414,0.006414
2,0.005647,0.015586,0.015586,0.010843,0.002259,0.017619,0.002259,0.002259,0,0,...,0,0,0,13,3,0,1,5,0.005421,0.005421
3,0.0,0.03595,0.03595,0.000826,0.002066,0.009091,0.008264,0.008264,1,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,0.000386,0.035852,0.035852,0.000771,0.002313,0.008096,0.00771,0.00771,2,0,...,2,0,0,0,0,0,0,0,0.0,0.0


In [106]:
# Target vector: the language label of every sample.
y = data["Language"]

In [107]:
# Show the first labels of `y`.
y.head()

0          C
1          C
2          C
3    Clojure
4    Clojure
Name: Language, dtype: object

In [108]:
# Imported here because no earlier visible cell brings DecisionTreeClassifier
# into scope; without it this cell fails with NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier

# Hold out 40% of the samples, then fit the vectoriser + decision tree on the
# remainder and score the pipeline on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
pipe = make_pipeline(CodeVectorizer(), DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.87234042553191493