In [1]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

from string import split,strip

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

### Cover Type

Classify geographical locations according to their predicted tree cover:

* **URL:** http://archive.ics.uci.edu/ml/datasets/Covertype
* **Abstract:** Forest CoverType dataset
* **Data Set Description:** http://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info

In [3]:
# Load the raw Covertype text file as an RDD with one element per line.
# On a real cluster the file must be reachable from every node (ideally stored in HDFS).
path = 'covtype/covtype.data'
inputRDD = sc.textFile(path)

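### Making the problem binary

The last column of each record is turned into a binary target: records whose value is 2 get label 1.0, and all other records get label 0.0.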

In [10]:
# Step 1: parse every comma-separated line into a list of floats.
# Step 2: wrap each row in a LabeledPoint whose features are all columns but the last;
#         the label is 1.0 when the last column equals 2.0 and 0.0 otherwise.
Data = (
    inputRDD
    .map(lambda line: [float(field) for field in line.split(',')])
    .map(lambda row: LabeledPoint(1.0 if row[-1] == 2.0 else 0.0, row[:-1]))
)

[LabeledPoint(0.0, [2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(0.0, [2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]),
 LabeledPoint(1.0, [2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])]

### Reducing data size

In [11]:
# Data1 is the working data set used for training and evaluation.
# To experiment on a 10% sample (without replacement) instead of the full data,
# swap the two assignments below:
#Data1=Data.sample(False,0.1).cache()
Data1=Data.cache()

# Split the working set Data1 (not the raw Data) 70/30 into train/test, so that the
# sampling choice above actually affects what the models see. The fixed seed keeps
# the split reproducible across runs.
(trainingData,testData)=Data1.randomSplit([0.7,0.3],seed=255)

# Report the sizes; the counts also force evaluation, which fills the cache for Data1.
print('Sizes: Data1=%d, trainingData=%d, testData=%d' % (Data1.count(), trainingData.count(), testData.count()))

Sizes: Data1=58714, trainingData=41176, testData=17538


### Gradient Boosted Trees

In [13]:
from time import time

# Train a gradient-boosted-trees classifier for each tree depth in the list and record
# the misclassification rate on the training and test sets:
#   errors[depth] = {'train': <error rate>, 'test': <error rate>}
errors={}
for depth in [20]:
    start=time()
    # An empty categoricalFeaturesInfo dict declares no feature as categorical.
    model=GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={},
                                               numIterations=20, maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name,data in dataSets.items():  # Calculate errors on train and test sets
        # Predict on the whole feature RDD, then pair each prediction with its true label.
        Predicted=model.predict(data.map(lambda lp: lp.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted)
        # Index into the (label, prediction) pair instead of using a tuple-parameter
        # lambda, which is a syntax error in Python 3.
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()/float(data.count())
        errors[depth][name]=Err
    # Elapsed time covers both training and evaluation for this depth.
    print('%s %s %d seconds' % (depth, errors[depth], time()-start))

1 {'test': 0.27403352719808416, 'train': 0.2751602875461434} 11 seconds
3 {'test': 0.25424791880488085, 'train': 0.25155430347775404} 10 seconds
6 {'test': 0.21849697799064888, 'train': 0.21177384884398678} 11 seconds
10 {'test': 0.17253962823583077, 'train': 0.13969302506314357} 26 seconds
{1: {'test': 0.27403352719808416, 'train': 0.2751602875461434}, 10: {'test': 0.17253962823583077, 'train': 0.13969302506314357}, 3: {'test': 0.25424791880488085, 'train': 0.25155430347775404}, 6: {'test': 0.21849697799064888, 'train': 0.21177384884398678}}


### Random Forests

In [16]:
from pyspark.mllib.tree import RandomForest, RandomForestModel

from time import time

# Train a random-forest classifier for each tree depth in the list and record the
# misclassification rate on the training and test sets:
#   errors[depth] = {'train': <error rate>, 'test': <error rate>}
errors={}
for depth in [30]:
    start=time()
    # An empty categoricalFeaturesInfo dict declares no feature as categorical.
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=20, maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name,data in dataSets.items():  # Calculate errors on train and test sets
        # Predict on the whole feature RDD, then pair each prediction with its true label.
        Predicted = model.predict(data.map(lambda lp: lp.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted)
        # Index into the (label, prediction) pair instead of using a tuple-parameter
        # lambda, which is a syntax error in Python 3.
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()/float(data.count())
        errors[depth][name]=Err
    # Elapsed time covers both training and evaluation for this depth.
    print('%s %s %d seconds' % (depth, errors[depth], time()-start))

1 {'test': 0.4887102292165583, 'train': 0.48637555857781234} 2 seconds
3 {'test': 0.2940472117687308, 'train': 0.29740625607149795} 2 seconds
6 {'test': 0.2503136047439845, 'train': 0.2514085875267146} 3 seconds
10 {'test': 0.23942296727106854, 'train': 0.23533126092869633} 5 seconds
15 {'test': 0.20122020754932146, 'train': 0.18367495628521469} 7 seconds
20 {'test': 0.18314517048694265, 'train': 0.1570089372449971} 9 seconds
{1: {'test': 0.4887102292165583, 'train': 0.48637555857781234}, 3: {'test': 0.2940472117687308, 'train': 0.29740625607149795}, 6: {'test': 0.2503136047439845, 'train': 0.2514085875267146}, 10: {'test': 0.23942296727106854, 'train': 0.23533126092869633}, 15: {'test': 0.20122020754932146, 'train': 0.18367495628521469}, 20: {'test': 0.18314517048694265, 'train': 0.1570089372449971}}
