In [1]:
from sklearn import tree
import pandas as pd
from IPython.display import Image
import pydotplus
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
import time
import statsmodels.formula.api as smf

#### Data

The data is the Letter Recognition data set from the UCI Machine Learning Repository.  There are 20,000 rows, with samples for every letter of the alphabet in a fairly even distribution.

link: http://archive.ics.uci.edu/ml/datasets/Letter+Recognition

UCI's site only provides a .data source file, so Excel was used to convert the data to CSV.

In [2]:
# Load the CSV export of the letter-recognition data, then show its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer="letter-recognition.csv")
data.shape

(20000, 17)

In [3]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [4]:
# List the column labels: the 'lettr' label column followed by the numeric feature columns.
data.columns

Index(['lettr', 'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar',
       'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege',
       'yegvx'],
      dtype='object')

In [5]:
# Split into features (X) and outcome (Y). Every column except the label "lettr" is a feature.
# The columns= keyword is used because passing axis positionally to drop() was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = data.drop(columns="lettr")
Y = data["lettr"]

In [6]:
def score_and_time(model):
    """Print 10-fold cross-validation scores for ``model`` and the time they took.

    Scores come from ``cross_val_score(model, X, Y, cv=10)``, where ``X`` and
    ``Y`` are the notebook-level feature frame and label series.

    The timer is started inside this function, so the reported runtime covers
    exactly the cross-validation call and does not rely on the calling cell
    having set (or refreshed) a ``start_time`` global beforehand.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by ``cross_val_score``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    timer_start = time.perf_counter()
    fold_scores = cross_val_score(model, X, Y, cv=10)
    elapsed = time.perf_counter() - timer_start
    print(fold_scores)
    print("\nRuntime:")
    print("--- %s seconds ---" % elapsed)

In [7]:
#Decision tree
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=10,
    max_depth=18,
    # max_features is smaller than the 16 feature columns, so the candidate
    # features at each split are drawn at random; fix the seed for repeatable scores.
    random_state=0,
)
# cross_val_score clones the estimator and refits it on every fold, so this fit is
# not needed for scoring. It is kept so a fitted tree is available afterwards, but it
# runs before the timer starts so it does not inflate the reported runtime.
decision_tree.fit(X, Y)
start_time = time.time()
score_and_time(decision_tree)

[ 0.88673621  0.87624254  0.88756219  0.88572854  0.88355822  0.85785786
  0.89468405  0.8815261   0.87933635  0.88072471]

Runtime:
--- 2.1360995769500732 seconds ---


In [8]:
#Random forest with default hyperparameters (no tuning)
start_time = time.time()
# Fixed seed so the bootstrap samples and per-split feature draws, and therefore the
# cross-validation scores, are repeatable across runs.
# NOTE(review): the default n_estimators changed from 10 to 100 in scikit-learn 0.22,
# so both runtime and accuracy of this untuned model depend on the installed version.
rfc = ensemble.RandomForestClassifier(random_state=0)
score_and_time(rfc)

[ 0.95429707  0.9388668   0.94676617  0.94411178  0.94652674  0.92992993
  0.94734203  0.95130522  0.94268477  0.93407146]

Runtime:
--- 5.149456977844238 seconds ---


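The random forest took roughly 2.4 times as long as the decision tree (about 5.1 s vs. 2.1 s) but was noticeably more accurate (about 94% vs. 88% mean cross-validated accuracy) without any tuning required.  The longer runtime could become an issue as the data size increases.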