In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.pipeline import Pipeline
import pandas as pd
from os import walk
import re
import glob

In [2]:
def extract_extension(string):
    match = re.match('.*\.(?P<ext>.*)$', string)
    if match:
        return match.groupdict()['ext']

In [3]:
text = ''
df = pd.DataFrame(columns=['language', 'text'])
#files = glob.glob('bench/binarytrees/*')
files = glob.glob('bench/*/*')
# ['./bench/binarytrees/binarytrees.clojure',
#          './bench/binarytrees/binarytrees.csharp']

for fn in files:
    try:
        with open(fn) as fh:
    #         df.loc[extract_extension(fn)] = ''.join(fh.readlines())
            data = {'language': extract_extension(fn),
                    'text': ''.join(fh.readlines())}
            if data['language'] and data['text']:
                df = df.append(data, ignore_index = True)
    except (IsADirectoryError, UnicodeDecodeError):
        pass
df

Unnamed: 0,language,text
0,ats,(*\n** The Computer Language Benchmarks Game\n...
1,ats,(*\n** The Computer Language Benchmarks Game\n...
2,clojure,;; The Computer Language Benchmarks Game\n;; h...
3,clojure,;; The Computer Language Benchmarks Game\n;; h...
4,clojure,;; The Computer Language Benchmarks Game\n;; h...
5,csharp,﻿/*\n The Computer Language Benchmarks Ga...
6,csharp,/* The Computer Language Benchmarks Game\n h...
7,dart,/* The Computer Language Benchmarks game\n h...
8,erlang,% The Computer Language Benchmarks Game\n% htt...
9,erlang,% The Computer Language Benchmarks Game\n% htt...


In [4]:
df[df.language == None]

Unnamed: 0,language,text


In [5]:
# walker = os.walk('./bench/binarytrees/')
# next(walker)

In [6]:
def test_classifier(pipe, *split_args):
    #classifier = classifierType()
    pipe.fit(split_args[0], split_args[2])
#     predicted = classifier.predict(X_test)
    train_score = pipe.score(split_args[0], split_args[2])
    test_score = pipe.score(split_args[1], split_args[3])
    print('Train score: {}, Test score: {}'.format(train_score, test_score))
    return pipe

In [7]:
X = df.text
y = df.language
args = train_test_split(X, y, test_size=0.2, )#random_state=0) # X_train, X_test, y_train, y_test

In [8]:
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
#                       ('tfidf', TfidfTransformer()),
                      ('bayes', MultinomialNB())])
# spam_pipe
classifier = test_classifier(spam_pipe, *args)

Train score: 0.9227642276422764, Test score: 0.7135135135135136


In [9]:
classifier.predict(args[1].iloc[2])

array(['gpp', 'gpp', 'gpp', ..., 'gpp', 'gpp', 'gpp'], 
      dtype='<U10')

####With data from a small dataset (the binarytree directory only, ~80 files), the bag_of_words/multinomial naive Bayes' pipeline got high training scores (>0.9), but low test scores (between ~0.05 and ~0.5).
####With the larger dataset ( > 900 files), the score was consistently between 0.65 and 0.75

In [10]:
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                      ('bayes', MultinomialNB())])
# spam_pipe
classifier = test_classifier(spam_pipe, *args)

Train score: 0.7859078590785907, Test score: 0.5837837837837838


####Adding tfidf into the pipeline made it perform significantly worse

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
%time
#clf = RandomForestClassifier(n_estimators=100, )#random_state=0)
#visualize_tree(clf, X, y, boundaries=False);

spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                      ('RFC', RandomForestClassifier())])
classifier = test_classifier(spam_pipe, *args)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
Train score: 0.9728997289972899, Test score: 0.8378378378378378


####Random Forest with 10 estimators (default) is consistently > 0.80, usually in the range 0.8 to 0.9.  With n=10, it takes a few microseconds to run on the ~900 entry dataset.

In [13]:
%time
#visualize_tree(clf, X, y, boundaries=False);

spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                      ('RFC', RandomForestClassifier())])
spam_pipe.set_params(RFC__n_estimators=1000)
classifier = test_classifier(spam_pipe, *args)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs
Train score: 0.975609756097561, Test score: 0.8540540540540541


####Not significantly better for n_estimators=1000, but it does take about 30 seconds to run

C (.gcc, .c)
C#
Common Lisp (.sbcl)
Clojure
Haskell
Java
JavaScript
OCaml
Perl
PHP (.hack, .php)
Python
Ruby (.jruby, .yarv)
Scala
Scheme (.racket)

In [41]:
extensions = {'C' : ['gcc', 'c'],
              'C#' : 'csharp',
              'Common Lisp' : 'sbcl',
              'Clojure' : 'clojure',
              'Haskell' : 'haskell',
              'Java' : 'java',
              'JavaScript' : 'javascript',
              'OCaml' : 'ocaml',
              'Perl' : 'perl',
              'PHP' : ['hack','php'],
              'Python' : 'python3',
              'Ruby' : ['jruby', 'yarv'],
              'Scala' : 'scala',
              'Scheme' : 'racket',
    
}

ext_lookup = {}
for key, value in extensions.items():
    """Flip the dictionary around"""
    if type(value) == type([]): #hasattr(value, '__iter__'):
        for value2 in value:
            ext_lookup[value2] = key
    else:
        ext_lookup[value] = key
ext_lookup

{'c': 'C',
 'clojure': 'Clojure',
 'csharp': 'C#',
 'gcc': 'C',
 'hack': 'PHP',
 'haskell': 'Haskell',
 'java': 'Java',
 'javascript': 'JavaScript',
 'jruby': 'Ruby',
 'ocaml': 'OCaml',
 'perl': 'Perl',
 'php': 'PHP',
 'python3': 'Python',
 'racket': 'Scheme',
 'sbcl': 'Common Lisp',
 'scala': 'Scala',
 'yarv': 'Ruby'}

In [15]:
df.language.unique()

array(['ats', 'clojure', 'csharp', 'dart', 'erlang', 'fpascal', 'fsharp',
       'gcc', 'ghc', 'gnat', 'go', 'gpp', 'hack', 'ifc', 'java',
       'javascript', 'jruby', 'lua', 'ocaml', 'oz', 'perl', 'php',
       'python3', 'racket', 'rust', 'sbcl', 'scala', 'vw', 'yarv', 'cint',
       'h', 'javasteady', 'parrot'], dtype=object)

In [16]:
# from sklearn.tree import DecisionTreeClassifier, export_graphviz
# classifier = test_classifier(DecisionTreeClassifier(criterion='entropy'), *args)
# export_graphviz(classifier, out_file='tree.dot')

In [53]:
test_data = pd.read_csv('./test.csv', names=['item', 'language', 'text', 'guess'])
test_data.head()

Unnamed: 0,item,language,text,guess
0,1,clojure,,
1,2,clojure,,
2,3,clojure,,
3,4,clojure,,
4,5,python,,


In [54]:
test_files = glob.glob('./test/*')

for (idx, fn) in enumerate(test_files):
#     try:
    with open(fn) as fh:
#         df.loc[extract_extension(fn)] = ''.join(fh.readlines())
#         data = {'language': extract_extension(fn),
#                 'text': ''.join(fh.readlines())}
#         if data['language'] and data['text']:
#             df = df.append(data, ignore_index = True)
#     except (IsADirectoryError, UnicodeDecodeError):
#         pass
        #test_data['text'][idx] = ''.join(fh.readlines())
        test_data.ix[idx, 'text'] = ''.join(fh.readlines())
test_data.head()

Unnamed: 0,item,language,text,guess
0,1,clojure,"(defn cf-settings\n ""Setup settings for campf...",
1,2,clojure,"var _ = require('lodash'),\n fs = require('...",
2,3,clojure,"/* Riot v2.0.8, @license MIT, (c) 2015 Muut In...",
3,4,clojure,var r = riot.route = function(arg) {\n //...,
4,5,python,module ActiveJob\n module Core\n extend Ac...,


In [55]:
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
#                       ('tfidf', TfidfTransformer()),
                      ('bayes', MultinomialNB())])
# spam_pipe
classifier = test_classifier(spam_pipe, *args)

Train score: 0.9227642276422764, Test score: 0.7135135135135136


In [61]:
test_data['guess'] = pd.DataFrame(spam_pipe.predict(test_data['text']))
correct = test_data[test_data.language == test_data.guess]
len(correct)/len(test_data)

0.0625

In [62]:
test_data

Unnamed: 0,item,language,text,guess
0,1,clojure,"(defn cf-settings\n ""Setup settings for campf...",clojure
1,2,clojure,"var _ = require('lodash'),\n fs = require('...",javascript
2,3,clojure,"/* Riot v2.0.8, @license MIT, (c) 2015 Muut In...",clojure
3,4,clojure,var r = riot.route = function(arg) {\n //...,lua
4,5,python,module ActiveJob\n module Core\n extend Ac...,yarv
5,6,python,require 'formula'\n\nclass A52dec < Formula\n ...,clojure
6,7,python,module Fluent\n class Input\n include Conf...,jruby
7,8,python,"{-# LANGUAGE ScopedTypeVariables, FlexibleInst...",ghc
8,9,javascript,reverseDependencies :: ModuleGraph -> M.Map Mo...,ghc
9,10,javascript,{- git-annex extra config files\n -\n - Copyri...,clojure


In [76]:
def longest_run_of_capitol_letters_feature(text):
    runs = sorted(re.findall(r"[A-Z]+", text), key=len)
    if len(runs) == 0:
        return [0]
    longest = runs[-1]
    return [len(longest)]
longest_run_of_capitol_letters_feature('ABCabddwAAAA absd AB sd A.AA.AAA')

[4]

In [77]:
def percent_periods_feature(text):
    """Return percentage of text that is periods compared to total text length."""
    periods = text.count(".")
    return [periods / len(text)]
percent_periods_feature('. . . . ')

[0.5]

In [80]:
def feature_vector(text):
    return longest_run_of_capitol_letters_feature(text) + percent_periods_feature(text)
feature_vector('AAH! feature_vector... ')

[3, 0.13043478260869565]

In [81]:
import numpy as np

class CustomFeaturizer:
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        """All SciKit-Learn compatible transformers and classifiers have the
        same interface. `fit` always returns the same object."""
        return self
        
    def transform(self, X):
        """Given a list of original data, return a list of feature vectors."""
        fvs = []
        for datum in X:
            fv = np.array([f(datum) for f in self.featurizers])
            fvs.append(fv.reshape(1, -1)[0])
        return np.array(fvs)
    
featurizer = CustomFeaturizer(longest_run_of_capitol_letters_feature,
                                  percent_periods_feature)