In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from programming_language_classifier import get_data as gd
from programming_language_classifier import plc_trainer as plc
from sklearn.metrics import classification_report, confusion_matrix
import os

In [5]:
%matplotlib inline

In [6]:
content_list = gd.get_content("programming_language_classifier/train/")
train_data = gd.make_dataframe(content_list)
x_train, x_test, y_train, y_test = train_test_split(train_data[1], train_data[0], test_size=0.2)

In [7]:
classifier = Pipeline([('features', plc.Featurizer(plc.percent_elements, plc.number_elements,
                                                   plc.longest_run, plc.line_enders)),
                       ('bayes', MultinomialNB())])

In [8]:
classifier.fit(x_train, y_train)

Pipeline(steps=[('features', <programming_language_classifier.plc_trainer.Featurizer object at 0x10cfc9d30>), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [39]:
print("Confusion Matrix for Training Data\n")
print(confusion_matrix(classifier.predict(x_train), y_train))
print("\nTrain Score: " + str(classifier.score(x_train, y_train)))

Confusion Matrix for Training Data

[[46  0  0  0  0  0  0  0  4  0  0  0  0  0  1]
 [ 0 31  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 31  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0 21  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 29  0  1  0  0  0  0  0  0  0  1]
 [ 0  7  0  0  0 45  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 17  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 23  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  1  0 34  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 26  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 32  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0 56  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  0 34  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 18  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0 38]]

Train Score: 0.954365079365


In [40]:
print("Confusion Matrix for Test Data\n")
print(confusion_matrix(classifier.predict(x_test), y_test))
print("\nTest Score: " + str(classifier.score(x_test, y_test)))

Confusion Matrix for Test Data

[[11  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  7  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 13  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  4  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  5  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  4  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  0 11  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 14  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 15  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  7  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 10  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  6]]

Test Score: 0.96062992126


In [20]:
content = []
for file in sorted(os.listdir("test/"), key=int):
    with open("test/" + file) as fh:
        content.append([fh.read()])
test_data = gd.make_dataframe(content)
test_labels = ['Clojure', 'Clojure', 'Clojure', 'Clojure', 'Python', 'Python',
       'Python', 'Python', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'Ruby', 'Ruby', 'Ruby', 'Haskell', 'Haskell',
       'Haskell', 'Scheme', 'Scheme', 'Scheme', 'Java', 'Java', 'Scala',
       'Scala', 'TCL', 'TCL', 'PHP', 'PHP', 'PHP', 'OCaml', 'OCaml']

In [36]:
print("Confusion Matrix for New Test Data\n")
print(confusion_matrix(classifier.predict(test_data[0]), test_labels))
print("\nNew Test Score: " + str(classifier.score(test_data[0], test_labels)))

Confusion Matrix for New Test Data

[[3 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 3 0 0 0 0 0]
 [1 0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 0 0 0 3 0 0 0]
 [0 0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 0 3 0]
 [0 0 0 0 0 0 0 0 0 0 2]]

New Test Score: 0.96875


In [34]:
print(classification_report(classifier.predict(test_data[0]), test_labels))

             precision    recall  f1-score   support

    Clojure       0.75      1.00      0.86         3
    Haskell       1.00      1.00      1.00         3
       Java       1.00      1.00      1.00         2
 JavaScript       1.00      1.00      1.00         4
      OCaml       1.00      1.00      1.00         2
        PHP       1.00      1.00      1.00         3
     Python       1.00      0.80      0.89         5
       Ruby       1.00      1.00      1.00         3
      Scala       1.00      1.00      1.00         2
     Scheme       1.00      1.00      1.00         3
        TCL       1.00      1.00      1.00         2

avg / total       0.98      0.97      0.97        32

