In [6]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.feature import Normalizer
import time
import numpy as np
from math import sqrt


# Load the dataset (it had previously been split because of Jupyter's maximum file size limit).
rawData = sc.textFile("./datasets/covertype.data")

# Turn every comma-separated text line into a list of integers.
parsedData = rawData.map(lambda row: list(map(int, row.split(','))))

def funcao1(line):
    """Condense one parsed record into a shorter feature list.

    The result holds, in order: ``line[0]``, the integer-truncated Euclidean
    norm of ``line[3]`` and ``line[4]``, ``line[5]``, ``line[9]``, one counter
    value for every entry in columns 14-53 that equals 1, and finally
    ``line[54]``.

    The counter starts at 1 and advances once for each entry in columns 14-53
    that is not 1, so when exactly one of those columns is set the emitted
    value is that column's 1-based position within the group.
    """
    distance = int(sqrt(line[3] ** 2 + line[4] ** 2))
    features = [line[0], distance, line[5], line[9]]

    position = 1
    for column in range(14, 54):
        if line[column] != 1:
            position += 1
        else:
            features.append(position)

    features.append(line[54])
    return features
        

# Apply funcao1 to every parsed record and mark the resulting RDD for persistence.
parsedData2 = parsedData.map(funcao1).persist()


def prepare(line):
    """Build a LabeledPoint from one already-parsed record.

    Parameters
    ----------
    line : sequence of numbers
        Every entry except the last is used as a feature; the last entry is
        the class label.

    Returns
    -------
    LabeledPoint
        Dense feature vector together with the label shifted down by one.
    """
    # Convert explicitly: a bare map(float, values) whose result is thrown
    # away converts nothing (and is not even evaluated on Python 3).
    values = [float(value) for value in line]
    featureVector = Vectors.dense(values[:-1])
    # Decision tree labels range from 0 to n-1, so the incoming label
    # (presumably 1..n - confirm against the dataset) is shifted down by one.
    label = values[-1] - 1
    return LabeledPoint(label, featureVector)

def new_prepare(line):
    """Build a LabeledPoint from a record with two trailing index features.

    Parameters
    ----------
    line : sequence of numbers
        Entries 0-8 are used unchanged, entries 9 and 10 are shifted down by
        one, and the last entry is the class label.

    Returns
    -------
    LabeledPoint
        Dense 11-element feature vector together with the label shifted down
        by one.
    """
    # Convert explicitly: a bare map(float, values) whose result is thrown
    # away converts nothing (and is not even evaluated on Python 3).
    values = [float(value) for value in line]
    # NOTE(review): the names suggest entries 9 and 10 are 1-based wilderness
    # and soil indices being made 0-based - confirm against the caller.
    wilderness = values[9] - 1
    soil = values[10] - 1
    featureVector = Vectors.dense(values[0:9] + [wilderness, soil])
    # Decision tree labels range from 0 to n-1, so the incoming label
    # (presumably 1..n - confirm against the dataset) is shifted down by one.
    label = values[-1] - 1
    return LabeledPoint(label, featureVector)

# Convert every condensed record into a LabeledPoint.
data = parsedData2.map(prepare)

# 80% training, 10% cross-validation, 10% test. A fixed seed keeps the split
# identical across kernel restarts so results can be compared between runs.
trainData, cvData, testData = data.randomSplit([0.8, 0.1, 0.1], seed=42)

trainData.cache()
cvData.cache()
testData.cache()

PythonRDD[197] at RDD at PythonRDD.scala:43

In [7]:
# Training set: trainData
# Evaluation set: cvData
# Goal: find a good hyperparameter configuration

start_time = time.time()


model = DecisionTree.trainClassifier(trainData, numClasses=7, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=20, maxBins=300)
predictions = model.predict(cvData.map(lambda x: x.features))
# MulticlassMetrics expects (prediction, label) pairs, in that order.
predictionAndLabels = predictions.zip(cvData.map(lambda lp: lp.label))
m = MulticlassMetrics(predictionAndLabels)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(m.precision())
print("--- %s seconds ---" % (time.time() - start_time))

# Output recorded from an earlier run:
# 0.916072896684
# --- 93.2208759785 seconds ---

0.916072896684
--- 93.2208759785 seconds ---


In [8]:
# Training set: trainData + cvData
# Evaluation set: testData
# Goal: check the precision of the Decision Tree on the test set
start_time = time.time()


model = DecisionTree.trainClassifier(trainData.union(cvData), numClasses=7, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=20, maxBins=300)
predictions = model.predict(testData.map(lambda x: x.features))
# MulticlassMetrics expects (prediction, label) pairs, in that order.
predictionAndLabels = predictions.zip(testData.map(lambda lp: lp.label))
m = MulticlassMetrics(predictionAndLabels)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(m.precision())
print("--- %s seconds ---" % (time.time() - start_time))

# Output recorded from an earlier run:
# 0.915808414018
# --- 102.161608934 seconds ---

0.915808414018
--- 102.161608934 seconds ---


In [None]:
# Training set: trainData + cvData
# Evaluation set: testData
# Random Forest with 20 decision trees

start_time = time.time()


# A fixed seed makes the per-tree bootstrap and feature sampling repeatable.
forest = RandomForest.trainClassifier(trainData.union(cvData), numClasses=7, categoricalFeaturesInfo={}, numTrees=20,
                                      featureSubsetStrategy='auto', impurity='entropy', maxDepth=30, maxBins=300,
                                      seed=42)
predictions = forest.predict(testData.map(lambda x: x.features))
# MulticlassMetrics expects (prediction, label) pairs, in that order.
predictionAndLabels = predictions.zip(testData.map(lambda lp: lp.label))
k = MulticlassMetrics(predictionAndLabels)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(k.precision())


print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Training set: trainData + cvData
# Evaluation set: testData
# Multilayer perceptron with one hidden layer

# The MLP does not support categorical features, so it is fed the full parsed
# records (54 feature columns, matching the input layer size below) instead of
# the condensed encoding. prepare() needs parsed numeric lists, not raw text
# lines, hence parsedData rather than rawData.
data = parsedData.map(prepare)

# Fixed seed so the split is reproducible across runs.
trainData, cvData, testData = data.randomSplit([0.8, 0.1, 0.1], seed=42)

trainData.cache()
cvData.cache()
testData.cache()

start_time = time.time()

normalizer = Normalizer()


def _to_normalized_df(points):
    """Return a DataFrame of (label, normalized ML feature vector) rows.

    `points` is an RDD of LabeledPoint. Normalization uses the Normalizer
    default norm (p=2).
    """
    # mllib transformers call into the JVM, so transform() has to be applied
    # to the whole RDD from the driver; it cannot be called on single vectors
    # inside a map() running on the workers.
    normalized = normalizer.transform(points.map(lambda lp: lp.features))
    return (points.map(lambda lp: lp.label)
            .zip(normalized)
            .map(lambda pair: (pair[0], pair[1].asML()))
            .toDF(["label", "features"]))


trainDF = _to_normalized_df(trainData.union(cvData))
mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[54, 100, 7], blockSize=128, seed=123)
model = mlp.fit(trainDF)
# Keep the label column in the test frame so each prediction stays paired with
# its own label, instead of re-aligning two separately computed RDDs with zip.
testDF = _to_normalized_df(testData)

scored = model.transform(testDF)
predictions = scored.select("prediction")
# MulticlassMetrics expects (prediction, label) pairs, in that order.
predictionAndLabels = scored.select("prediction", "label").rdd.map(lambda row: (float(row[0]), float(row[1])))
m = MulticlassMetrics(predictionAndLabels)
print(m.precision())

print("--- %s seconds ---" % (time.time() - start_time))