In [2]:
import os
import random
import pandas as pd
import numpy as np
import re
from network import Network
import csv

In [3]:
possible_factors = ["/$", "def", "end", "{", "}", "&", "\(", "\)", "nil", "var", "print", "END", "do", "int", "str", "static",
                    "public", "/+/+", "put", "return", "map" "->", "BigInt", "php", "|", "each"]
file_extensions = {".py": "python",
                   ".rb": "ruby",
                   ".ruby": "ruby",
                   ".jruby": "ruby",
                   ".python": "python",
                   ".clojure": "clojure",
                   ".php": "php",
                   ".ocaml": "ocaml",
                   ".java": "java",
                   ".javascript": "javascript",
                   ".python3": "python",
                   ".racket": "scheme",
                   ".ghc": "haskell",
                   ".tcl": "tcl",
                   ".scala": "scala"}

In [4]:
def get_training_samples(directory, extension_dict):
    "Read in the examples, kick out a list of tuples of text and classifications."
    text_list = []
    lang_list = []
    for root, dirs, files in os.walk(directory):
        for particular_file in files:
            extension = os.path.splitext(particular_file)[1]
            if extension in file_extensions:
                #print("Yay! " + particular_file)
                #print(os.path.join(root, particular_file))
                try:
                    file_in = open(os.path.join(root, particular_file))
                    text = file_in.read()
                    #print(text)
                    text_list.append(text)
                    lang_list.append(extension_dict[extension])
                except:
                    print("hiccuped on : " + particular_file)
    df = pd.DataFrame({"code": text_list,
                       "language": lang_list})
    data_array = np.array(list(zip(text_list, lang_list)))
    return data_array

In [5]:
training_data = get_training_samples("benchmarksgame-2014-08-31/", file_extensions)

hiccuped on : pidigits.ocaml-2.ocaml


In [6]:
def old_text_factors(snippet, regex_objects):
    "Takes a code snippet and returns a vector of features in [0,1]"
    normalizing_factor = len(snippet)
    factors = []
    for factor in range(len(regex_objects)):
        target = regex_objects[factor]
        total_num = len(target.findall(snippet))
        frequency = total_num / normalizing_factor
        factors.append(frequency)
    factor_array = np.array(factors)  
    factor_array = factor_array.reshape(-1, 1)
    return factor_array

In [7]:
def text_factors(snippet, regex_objects):
    "Takes a code snippet and returns a vector of features in [0,1]"
    normalizing_factor = len(snippet)
    factors = []
    for factor in range(len(regex_objects)):
        target = regex_objects[factor]
        if target.search(snippet) is None:
            value = 0
        else:
            value = 1
        factors.append(value)
    factor_array = np.array(factors)  
    factor_array = factor_array.reshape(-1, 1)
    return factor_array

In [8]:
def compile_regex_objects(factors):
    "Given possible strings in the code, return a list of regex objects that can match for them."
    objects = []
    for factor in factors:
        try:
            current_object = re.compile(factor)
        except:
            print("Problem with {}".format(factor))
        objects.append(current_object)
    return objects

In [9]:
training_data.shape

(367, 2)

In [10]:
training_data[0][1]

'clojure'

In [11]:
def lang_vectorizer(given_lang):
    languages = ["clojure", "python", "javascript", "ruby", "haskell", "scheme", "java", "scala", "php", "ocaml", "tcl"]
    index = languages.index(given_lang)
    return_array = np.zeros((len(languages),1))
    return_array[index] = 1
    return return_array
    
def create_training_data(classifier_fcn, samples, regex_objects):
    data_list = []
    for sample in samples:
        output_encoding = lang_vectorizer(sample[1])
        input_encoding = classifier_fcn(sample[0], regex_objects)
        data_list.append((input_encoding, output_encoding))
    return data_list

In [12]:
regex_objects = compile_regex_objects(possible_factors)

In [13]:
training_arrays = create_training_data(text_factors, training_data, regex_objects)

In [14]:
first_net = Network([len(training_arrays[0][0]), 20, len(training_arrays[0][1])])

In [15]:
training_arrays[1]

(array([[0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0]]), array([[ 1.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]))

In [16]:
len(training_arrays)

367

In [17]:
def unvectorize_lang(vector):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere.  This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    network."""
    index = get_one_index(vector)
    languages = ["clojure", "python", "javascript", "ruby", "haskell", "scheme", "java", "scala", "php", "ocaml", "tcl"]
    lang = languages[index]
    return lang

def get_one_index(vector):
    count = 0
    for item in vector:
        if item == 1:
            return count
        else:
            count += 1

In [18]:
lang_vector = lang_vectorizer('ruby')

In [19]:
unvectorize_lang(lang_vector)

'ruby'

In [20]:
get_one_index(lang_vector)

3

In [21]:
from sklearn import metrics

In [22]:
from sklearn import cross_validation

In [23]:
train, test = cross_validation.train_test_split(training_arrays, test_size=0.1)

In [24]:
len(test)

37

In [25]:
trans_test = []
for item in range(len(test)):
    trans_test.append((test[item][0], get_one_index(test[item][1])))

In [26]:
first_net.SGD(train, 60, 10, 1.0, test_data=trans_test)

Epoch 0: 5 / 37
Epoch 1: 8 / 37
Epoch 2: 10 / 37
Epoch 3: 12 / 37
Epoch 4: 12 / 37
Epoch 5: 14 / 37
Epoch 6: 12 / 37
Epoch 7: 14 / 37
Epoch 8: 15 / 37
Epoch 9: 16 / 37
Epoch 10: 17 / 37
Epoch 11: 17 / 37
Epoch 12: 18 / 37
Epoch 13: 18 / 37
Epoch 14: 18 / 37
Epoch 15: 18 / 37
Epoch 16: 17 / 37
Epoch 17: 17 / 37
Epoch 18: 19 / 37
Epoch 19: 20 / 37
Epoch 20: 20 / 37
Epoch 21: 19 / 37
Epoch 22: 20 / 37
Epoch 23: 19 / 37
Epoch 24: 20 / 37
Epoch 25: 20 / 37
Epoch 26: 20 / 37
Epoch 27: 20 / 37
Epoch 28: 20 / 37
Epoch 29: 18 / 37
Epoch 30: 19 / 37
Epoch 31: 19 / 37
Epoch 32: 20 / 37
Epoch 33: 17 / 37
Epoch 34: 19 / 37
Epoch 35: 19 / 37
Epoch 36: 20 / 37
Epoch 37: 18 / 37
Epoch 38: 18 / 37
Epoch 39: 17 / 37
Epoch 40: 19 / 37
Epoch 41: 19 / 37
Epoch 42: 21 / 37
Epoch 43: 19 / 37
Epoch 44: 18 / 37
Epoch 45: 18 / 37
Epoch 46: 19 / 37
Epoch 47: 19 / 37
Epoch 48: 21 / 37
Epoch 49: 19 / 37
Epoch 50: 20 / 37
Epoch 51: 20 / 37
Epoch 52: 21 / 37
Epoch 53: 20 / 37
Epoch 54: 21 / 37
Epoch 55: 22 / 37
Epoc

In [27]:
!ls test

1  10 11 12 13 14 15 16 17 18 19 2  20 21 22 23 24 25 26 27 28 29 3  30 31 32 4  5  6  7  8  9


In [28]:
special_answer_list = []
with open("test.csv") as a_cool_file:
    answer_file = csv.reader(a_cool_file)
    for row in answer_file:
        special_answer_list.append((row[0], row[1]))

In [29]:
special_answer_list.pop(0)

('Filename', 'Language')

In [30]:
answer_database = []
for row in special_answer_list:
    with open('test/' + row[0]) as file:
        text = file.read()
        answer_database.append((text, row[1]))

In [31]:
ultimate_test = []
for row in answer_database:
    input_encoding = text_factors(row[0], regex_objects)
    output_encoding = lang_vectorizer(row[1])
    ultimate_test.append((input_encoding, output_encoding))

In [32]:
first_net.feedforward(ultimate_test[0][0])

array([[ 0.14852812],
       [ 0.0364062 ],
       [ 0.00403368],
       [ 0.08396261],
       [ 0.00194875],
       [ 0.08261208],
       [ 0.0709235 ],
       [ 0.08325061],
       [ 0.00541065],
       [ 0.01585194],
       [ 0.00440198]])

In [35]:
X, y = zip(*ultimate_test)
cross_validation.cross_val_score(first_net, X, y, scoring='accuracy')

TypeError: Cannot clone object 'A network with 3 layers.' (type <class 'network.Network'>): it does not seem to be a scikit-learn estimator it does not implement a 'get_params' methods.

In [87]:
dir(first_net)

['SGD',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'backprop',
 'biases',
 'cost_derivative',
 'evaluate',
 'feedforward',
 'num_layers',
 'predict',
 'sizes',
 'update_mini_batch',
 'weights']