In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.pipeline import Pipeline
import pandas as pd
from os import walk
import re
import glob

In [2]:
def extract_extension(string):
    match = re.match('.*\.(?P<ext>.*)$', string)
    if match:
        return match.groupdict()['ext']

In [50]:
text = ''
df = pd.DataFrame(columns=['language', 'text'])
#files = glob.glob('bench/binarytrees/*')
files = glob.glob('bench/*/*')
# ['./bench/binarytrees/binarytrees.clojure',
#          './bench/binarytrees/binarytrees.csharp']

for fn in files:
    try:
        with open(fn) as fh:
    #         df.loc[extract_extension(fn)] = ''.join(fh.readlines())
            data = {'language': extract_extension(fn),
                    'text': ''.join(fh.readlines())}
            if data['language'] and data['text']:
                df = df.append(data, ignore_index = True)
    except (IsADirectoryError, UnicodeDecodeError):
        pass
df

Unnamed: 0,language,text
0,ats,(*\n** The Computer Language Benchmarks Game\n...
1,ats,(*\n** The Computer Language Benchmarks Game\n...
2,clojure,;; The Computer Language Benchmarks Game\n;; h...
3,clojure,;; The Computer Language Benchmarks Game\n;; h...
4,clojure,;; The Computer Language Benchmarks Game\n;; h...
5,csharp,﻿/*\n The Computer Language Benchmarks Ga...
6,csharp,/* The Computer Language Benchmarks Game\n h...
7,dart,/* The Computer Language Benchmarks game\n h...
8,erlang,% The Computer Language Benchmarks Game\n% htt...
9,erlang,% The Computer Language Benchmarks Game\n% htt...


In [54]:
df[df.language == None]

Unnamed: 0,language,text


In [20]:
# walker = os.walk('./bench/binarytrees/')
# next(walker)

In [55]:
def test_classifier(pipe, *split_args):
    #classifier = classifierType()
    pipe.fit(split_args[0], split_args[2])
#     predicted = classifier.predict(X_test)
    train_score = pipe.score(split_args[0], split_args[2])
    test_score = pipe.score(split_args[1], split_args[3])
    print('Train score: {}, Test score: {}'.format(train_score, test_score))
    return pipe

In [104]:
X = df.text
y = df.language
args = train_test_split(X, y, test_size=0.2, )#random_state=0) # X_train, X_test, y_train, y_test


In [105]:
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
#                       ('tfidf', TfidfTransformer()),
                      ('bayes', MultinomialNB())])
# spam_pipe
classifier = test_classifier(spam_pipe, *args)


Train score: 0.9186991869918699, Test score: 0.7243243243243244


In [24]:
classifier.predict(args[1].iloc[2])

array(['yarv', 'yarv', 'yarv', ..., 'yarv', 'yarv', 'yarv'], 
      dtype='<U7')

####With data from a small dataset (the binarytree directory only, ~80 files), the bag_of_words/multinomial naive Bayes' pipeline got high training scores (>0.9), but low test scores (between ~0.05 and ~0.5).
####With the larger dataset ( > 900 files), the score was consistently between 0.65 and 0.75

In [106]:
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
#                       ('tfidf', TfidfTransformer()),
                      ('bayes', MultinomialNB())])
# spam_pipe
classifier = test_classifier(spam_pipe, *args)


Train score: 0.7953929539295393, Test score: 0.5783783783783784


####Adding tfidf into the pipeline made it perform significantly worse

In [118]:
from sklearn.ensemble import RandomForestClassifier

In [122]:
%time
#clf = RandomForestClassifier(n_estimators=100, )#random_state=0)
#visualize_tree(clf, X, y, boundaries=False);

spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                      ('RFC', RandomForestClassifier())])
classifier = test_classifier(spam_pipe, *args)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
Train score: 0.9688346883468835, Test score: 0.8594594594594595


####Random Forest with 10 estimators (default) is consistently > 0.80, usually in the range 0.8 to 0.9.  With n=10, it takes a few microseconds to run on the ~900 entry dataset.

In [130]:
%time
#visualize_tree(clf, X, y, boundaries=False);

spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                      ('RFC', RandomForestClassifier())])
spam_pipe.set_params(RFC__n_estimators=1000)
classifier = test_classifier(spam_pipe, *args)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
Train score: 0.9728997289972899, Test score: 0.8756756756756757


####Not significantly better for n_estimators=1000, but it does take about 30 seconds to run

In [74]:
# from sklearn.tree import DecisionTreeClassifier, export_graphviz
# classifier = test_classifier(DecisionTreeClassifier(criterion='entropy'), *args)
# export_graphviz(classifier, out_file='tree.dot')  