#Programming Language Classifier using Machine Learning

In [1]:
import pandas as pd
from scraper import scrape_clean_cut
from feature_vectors import *

##Using Two Dataframes: 
###(1) Dataframe with various languages (>200 examples from Rosetta Code) 

In [2]:
df_700x200 = pd.read_pickle('scraper_700x200.pkl')

In [3]:
y = df_700x200.loc[:, 0]
X = df_700x200.loc[:, 1]

###(2) Dataframe with only languages from test file (17 total languages)

In [25]:
filtered_df = pd.read_pickle('scraper_filtered_700x1.pkl')

In [26]:
filtered_y_train = filtered_df.loc[:, 0]
filtered_X_train = filtered_df.loc[:, 1]

#Testing only with dataset from Rosetta Code

The purpose of testing first with the rosetta code data, without the test file, is to see if my estimator is overfitting the test file data. Because there are only 32 lines of code and 11 langagues in teh test file, the test data from rosetta code includes much more sample code. 

In [6]:
rc_X_train, rc_X_test, rc_y_train, rc_y_test = train_test_split(X, y)

In [7]:
rc_pipe_bayes = make_pipe(MultinomialNB())
rc_pipe_bayes.fit(rc_X_train, rc_y_train)
rc_pipe_bayes.score(rc_X_test, rc_y_test)

0.6314634480866822

In [8]:
rc_pipe_tree = make_pipe(DecisionTreeClassifier())
rc_pipe_tree.fit(rc_X_train, rc_y_train)
rc_pipe_tree.score(rc_X_test, rc_y_test)

0.77273350706350297

In [9]:
rc_pipe_forest = make_pipe(RandomForestClassifier())
rc_pipe_forest.fit(rc_X_train, rc_y_train)
rc_pipe_forest.score(rc_X_test, rc_y_test)

0.77753394596077352

In [10]:
print((classification_report(rc_pipe_forest.predict(rc_X_test), rc_y_test)))

             precision    recall  f1-score   support

        ada       0.98      0.76      0.86       304
    algol68       0.85      0.83      0.84       126
 autohotkey       0.87      0.69      0.77       137
        awk       0.78      0.61      0.68        99
       bash       0.64      0.66      0.65       136
          c       0.85      0.76      0.81       289
    clojure       0.50      0.56      0.53        98
      cobol       0.95      0.88      0.91        60
coffeescript       0.63      0.82      0.71        45
        cpp       0.84      0.78      0.81       195
     csharp       0.82      0.78      0.80       156
          d       0.92      0.93      0.93       211
     delphi       0.69      0.72      0.70        81
          e       0.44      0.53      0.48        83
     erlang       0.86      0.86      0.86       111
   euphoria       0.64      0.79      0.71        52
    fortran       0.90      0.89      0.90       138
     fsharp       0.65      0.80      0.71  

#Testing with test samples given in folder.

In [11]:
y_test = pd.read_pickle('test_y_values.pkl')
X_test = pd.read_pickle('test_X_values.pkl')

In [12]:
y_test = y_test.loc[:, 1]
X_test = X_test.loc[:, 0]

###Estimating with Multinomial Bayes, Decision Tree, and Random Forest

In [13]:
pipe_mnb = make_pipe(MultinomialNB())
pipe_mnb.fit(X, y)
pipe_mnb.score(X_test, y_test)

0.5625

In [14]:
pipe_tree = make_pipe(DecisionTreeClassifier())
pipe_tree.fit(X, y)
pipe_tree.score(X_test, y_test)

0.5625

In [15]:
pipe_forest = make_pipe(RandomForestClassifier())
pipe_forest.fit(X, y)
pipe_forest.score(X_test, y_test)

0.78125

###Classification report to see which language is not well represented. 

In [16]:
print((classification_report(pipe_forest.predict(X_test), y_test)))

             precision    recall  f1-score   support

        ada       0.00      0.00      0.00         1
    clojure       0.50      1.00      0.67         2
        cpp       0.00      0.00      0.00         1
         go       0.00      0.00      0.00         1
    haskell       1.00      1.00      1.00         3
       java       0.50      1.00      0.67         1
 javascript       0.75      1.00      0.86         3
       lisp       0.00      0.00      0.00         2
      ocaml       1.00      1.00      1.00         2
        php       0.33      1.00      0.50         1
  purebasic       0.00      0.00      0.00         1
     python       1.00      0.80      0.89         5
       ruby       1.00      1.00      1.00         3
      scala       1.00      1.00      1.00         2
     scheme       0.67      1.00      0.80         2
        tcl       1.00      1.00      1.00         2

avg / total       0.70      0.78      0.72        32



  'precision', 'predicted', average, warn_for)


##Using Dataframe 2: only includes languages in the test sample. 

In [27]:
pipe_filtered_bayes = make_pipe(MultinomialNB())
pipe_filtered_bayes.fit(filtered_X_train, filtered_y_train)
pipe_filtered_bayes.score(X_test, y_test)

0.875

In [28]:
pipe_filtered_tree = make_pipe(DecisionTreeClassifier())
pipe_filtered_tree.fit(filtered_X_train, filtered_y_train)
pipe_filtered_tree.score(X_test, y_test)

0.78125

In [31]:
pipe_filtered_forest = make_pipe(RandomForestClassifier())
pipe_filtered_forest.fit(filtered_X_train, filtered_y_train)
pipe_filtered_forest.score(X_test, y_test)

0.875

###Classification report to see which language is not well represented

In [30]:
print((classification_report(pipe_filtered_forest.predict(X_test), y_test)))

             precision    recall  f1-score   support

    clojure       1.00      1.00      1.00         4
    haskell       1.00      1.00      1.00         3
       java       0.50      1.00      0.67         1
 javascript       0.75      1.00      0.86         3
      ocaml       1.00      1.00      1.00         2
        php       1.00      1.00      1.00         3
     python       1.00      0.57      0.73         7
       ruby       0.67      0.67      0.67         3
      scala       0.50      1.00      0.67         1
     scheme       1.00      1.00      1.00         3
        tcl       1.00      1.00      1.00         2

avg / total       0.91      0.88      0.87        32

