#Initial Setup

In [1]:
from lang_classifier import *

In [2]:
def setup():
    """Load the training benchmark training data and split it for train/test"""
    df = load_bench_data()
    X = df.text
    y = df.language
    test_data = load_test_data()
    args = train_test_split(X, y, test_size=0.2, random_state=0)
    # X_train, X_test, y_train, y_test
    
    return df, X, y, test_data, args

In [3]:
df, X, y, test_data, args = setup()  # Load and split the train/test data

In [4]:
test_data

Unnamed: 0_level_0,language,text,guess
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,perl,use warnings;\nuse strict;\n\nmy $initial = jo...,
1,clojure,"(defn cf-settings\n ""Setup settings for campf...",
2,clojure,(ns my-cli.core)\n\n(defn -main [& args]\n (p...,
3,clojure,(extend-type String\n Person\n (first-name [...,
4,clojure,(require '[overtone.live :as overtone])\n\n(de...,
5,python,from pkgutil import iter_modules\nfrom subproc...,
6,python,import re\nimport subprocess\n\ndef cmd_keymap...,
7,python,class NoSuchService(Exception):\n def __ini...,
8,python,from collections import namedtuple\nimport fun...,
9,javascript,function errorHandler(context) {\n return fun...,


In [8]:
# Use a generic bag of words/naive bayes classifier pipeline as a baseline
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                      ('bayes', MultinomialNB())])
classifier = assess_classifier(spam_pipe, *args)
c = classifier.predict(X)
#print('Guesses: ', c[0:5])
def assess_test_data():
    test_data['guess'] = pd.DataFrame(spam_pipe.predict(test_data['text']))
    correct = test_data[test_data.language == test_data.guess]
    print('Proportion of test data correctly labeled: {:.3f}'.format(len(correct) / len(test_data)))
    print(test_data[['language', 'guess', 'text']])
assess_test_data()

Train score: 0.991, Test score: 0.949
Proportion of test data correctly labeled: 0.697
        language       guess  \
item                           
0           perl        perl   
1        clojure     clojure   
2        clojure     clojure   
3        clojure     clojure   
4        clojure     clojure   
5         python      python   
6         python     clojure   
7         python        ruby   
8         python      python   
9     javascript  javascript   
10    javascript  javascript   
11    javascript     clojure   
12    javascript         php   
13          ruby        ruby   
14          ruby     clojure   
15          ruby        ruby   
16       haskell     haskell   
17       haskell     haskell   
18       haskell     clojure   
19        scheme      scheme   
20        scheme      scheme   
21        scheme      scheme   
22          java        java   
23          java           c   
24         scala       scala   
25         scala       scala   
26           tcl 

In [9]:
cv = CountVectorizer()
ft = cv.fit_transform(X)
print(cv.get_feature_names()[-30:])
spam_pipe = Pipeline([('bag_of_words', cv),
                      ('bayes', MultinomialNB())])
classifier = assess_classifier(spam_pipe, *args)

['zin', 'zip', 'zipwith', 'zipwithindex', 'zipwithm', 'zipwithm_', 'ziv', 'ziv1', 'ziv2', 'zizi', 'zoo', 'zotov', 'zq', 'zr', 'zr1', 'zr2', 'zri', 'zrn', 'zrv', 'zrv1', 'zrv2', 'zrzi', 'zrzr', 'zs', 'zt', 'zu', 'zubach', 'zx', 'zy', 'zz']
Train score: 0.991, Test score: 0.949


In [13]:
cv = CountVectorizer(vocabulary=['}', ')', 'var', 'fn', 'function', 'end', 'defn',
                                 '===', 'lambda']) #, '(define', 'elif'])
ft = cv.fit_transform(X)
print(cv.get_feature_names()[-30:])  
spam_pipe = Pipeline([('bag_of_words', cv),
                      ('bayes', MultinomialNB())])
classifier = assess_classifier(spam_pipe, *args)
assess_test_data()

['}', ')', 'var', 'fn', 'function', 'end', 'defn', '===', 'lambda']
Train score: 0.334, Test score: 0.427
Proportion of test data correctly labeled: 0.485
        language       guess  \
item                           
0           perl        ruby   
1        clojure     clojure   
2        clojure     clojure   
3        clojure        ruby   
4        clojure     clojure   
5         python        ruby   
6         python        ruby   
7         python        ruby   
8         python      python   
9     javascript         php   
10    javascript  javascript   
11    javascript  javascript   
12    javascript  javascript   
13          ruby        ruby   
14          ruby        ruby   
15          ruby        ruby   
16       haskell        ruby   
17       haskell        ruby   
18       haskell        ruby   
19        scheme      scheme   
20        scheme      scheme   
21        scheme      scheme   
22          java        ruby   
23          java        ruby   
24         sc

Vocabulary|Results
----------|-------
} | Train score: 0.122, Test score: 0.137
}, ) | Train score: 0.122, Test score: 0.137
}, ), var | Train score: 0.161, Test score: 0.179
}, ), var, fn | Train score: 0.195, Test score: 0.248
}, ), var, fn, function| Train score: 0.287, Test score: 0.308
}, ), var, fn, function, end | Train score: 0.278, Test score: 0.325
}, ), var, fn, function, end, defn | Train score: 0.302, Test score: 0.359
}, ), var, fn, function, end, defn, === | Train score: 0.300, Test score: 0.368
}, ), var, fn, function, end, defn, ===, lambda | Train score: 0.334, Test score: 0.427
}, ), var, fn, function, end, defn, ===, lambda | Proportion of test data correctly labeled: 0.485


In [14]:
def longest_run_of_caps_feature(text):
    runs = sorted(re.findall(r"[A-Z]+", text), key=len)
    if len(runs) == 0:
        return [0]
    longest = runs[-1]
    return [len(longest)]

In [15]:
txt = '''
# Test python program
class MyClass:
    """MyClass is a class to do something"""
    def __init__(self, name='name'):
        self.name = name
    def longest_run_of_caps_feature(text):
        runs = sorted(re.findall(r"[A-Z]+", text), key=len)
        if len(runs) == 0:
            return [0]
        longest = runs[-1]
        return [len(longest)]        
'''
featurizer = CustomFeaturizer(longest_run_of_caps_feature,
                              percent_periods_feature)
featurizer.transform([txt])

array([[ 1.        ,  0.00550964]])

##Feature Ideas
- Longest line
- bag of words with chars, ngrams, let
- run of )
- % _, }, :\n, """
- key words: def, defn, var `__x__`, module, end, ->, ::, ;, /*, //, case, final, extends, public, protected, $word, @param, self, this
- % nested dots
- () nest depth
- ignore/strip comments?
-Hyphenated or camel or underscored
-Indentation...

