# Initial Setup

In [1]:
from lang_classifier import *

In [2]:
def setup():
    """Load the benchmark and test data sets and split the benchmark set.

    Returns:
        tuple: ``(df, X, y, test_data, args)`` where ``df`` is the result of
        ``load_bench_data()``, ``X`` and ``y`` are its ``text`` and
        ``language`` columns, ``test_data`` is the result of
        ``load_test_data()``, and ``args`` is the output of
        ``train_test_split`` (X_train, X_test, y_train, y_test) using a
        20% test share and a fixed random state of 0.
    """
    bench = load_bench_data()
    texts, labels = bench.text, bench.language
    held_out = load_test_data()

    # Fixed random_state keeps the split reproducible between runs.
    split = train_test_split(texts, labels, test_size=0.2, random_state=0)

    return bench, texts, labels, held_out, split

In [3]:
# Load the benchmark and test data once for the whole notebook. `args` holds the
# train_test_split result (X_train, X_test, y_train, y_test) and is unpacked
# with *args by later cells.
df, X, y, test_data, args = setup()

In [4]:
# Use a generic bag of words/naive bayes classifier pipeline as a baseline.
# NOTE(review): despite its name, `spam_pipe` is applied to the language data
# here (args is built from df.text / df.language).
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                      ('bayes', MultinomialNB())])
# Presumably assess_classifier fits the pipeline on the training split and
# prints the train/test scores -- verify in lang_classifier.
classifier = assess_classifier(spam_pipe, *args)
# Predictions over the full benchmark set; not used further in this cell.
c = classifier.predict(X)
# Assign the prediction array directly: wrapping it in a DataFrame makes pandas
# align on index labels, which produces NaN/misplaced guesses whenever
# test_data does not carry a default 0..n-1 index. A plain array is positional.
test_data['guess'] = spam_pipe.predict(test_data['text'])
correct = test_data[test_data.language == test_data.guess]
# NOTE(review): the recorded output shows `language` labels that do not match
# the text (e.g. rows labeled python containing Ruby modules), so the low
# proportion may come from misaligned labels in load_test_data -- verify.
print('Proportion of test data correctly labeled: {:.3f}'.format(len(correct) / len(test_data)))
print(test_data[['language', 'guess', 'text']])

Train score: 0.991, Test score: 0.949
Proportion of test data correctly labeled: 0.094
      language       guess                                               text
0      clojure     clojure  (defn cf-settings\n  "Setup settings for campf...
1      clojure  javascript  var _ = require('lodash'),\n    fs = require('...
2      clojure     clojure  /* Riot v2.0.8, @license MIT, (c) 2015 Muut In...
3      clojure         php    var r = riot.route = function(arg) {\n    //...
4       python        ruby  module ActiveJob\n  module Core\n    extend Ac...
5       python     clojure  require 'formula'\n\nclass A52dec < Formula\n ...
6       python        ruby  module Fluent\n  class Input\n    include Conf...
7       python     haskell  {-# LANGUAGE ScopedTypeVariables, FlexibleInst...
8   javascript     haskell  reverseDependencies :: ModuleGraph -> M.Map Mo...
9   javascript     clojure  {- git-annex extra config files\n -\n - Copyri...
10  javascript      scheme  (define subst-f\n  (lambda 

In [5]:
def longest_run_of_caps_feature(text):
    """Return the length of the longest run of consecutive A-Z letters.

    Args:
        text: The string to scan.

    Returns:
        A one-element list holding the length of the longest uppercase run,
        or ``[0]`` when the text contains no uppercase ASCII letters.
    """
    run_lengths = [len(run) for run in re.findall(r"[A-Z]+", text)]
    if not run_lengths:
        return [0]
    return [max(run_lengths)]

In [6]:
# Small Python sample used to smoke-test the featurizer. It contains no two
# consecutive capital letters, so the longest-caps-run feature should be 1.
txt = '''
# Test python program
class MyClass:
    """MyClass is a class to do something"""
    def __init__(self, name='name'):
        self.name = name
    def longest_run_of_caps_feature(text):
        runs = sorted(re.findall(r"[A-Z]+", text), key=len)
        if len(runs) == 0:
            return [0]
        longest = runs[-1]
        return [len(longest)]        
'''
# Combine the feature functions into one transformer; percent_periods_feature
# is not defined in this notebook (presumably provided by lang_classifier).
featurizer = CustomFeaturizer(longest_run_of_caps_feature,
                              percent_periods_feature)
# transform takes a list of texts; a single-element list is passed here.
featurizer.transform([txt])

array([[ 1.        ,  0.00550964]])

## Feature Ideas
- Longest line
- bag of words with chars, ngrams, let
- run of )
- % _, }, :\n, """
- key words: def, defn, var `__x__`, module, end, ->, ::, ;, /*, //, case, final, extends, public, protected, $word, @param, self, this
- % nested dots
- () nest depth
- ignore/strip comments?
- Hyphenated, camelCase, or underscored identifiers
- Indentation...

