In [1]:
import decisiontree_gpu as dtree
import numpy as np
import pandas as pd
import time

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("DTREE").config('spark.executor.memory','6gb').getOrCreate()
sc = spark.sparkContext

In [2]:
# Cargamos los datasets
spambase = np.genfromtxt('../datasets/spambase.data', delimiter=',', dtype=np.float32)
magic = np.genfromtxt('../datasets/magic04.data', delimiter=',', dtype=np.float32)

In [3]:
"""
El experimento consiste en evaluar ambos dataset para las profundidades de 4 a 10
y comparar mi versión en CUDA con la GPU con la versión de Spark en criterios de
velocidad y precisión utilizando validación cruzada de 10 folds
"""
def validacion_cruzada(dataset, k=10, depth=5, my_min=1):
    n = dataset.shape[0]
    tam = n // k
    samples_test = []

    
    x = np.arange(n)
    for i in range(k):
        np.random.seed(i)
        samples_test.append(np.random.choice(x, tam, replace=False))
        
                
    times = np.zeros(k, np.float32)
    accuracy = np.zeros(k, np.float32)
    all_indexes = np.arange(n)
    for i in range(k):
        samples_train = np.delete(all_indexes, samples_test[i])
        inicio = time.time()
        model = dtree.train_tree(np.ascontiguousarray(dataset[samples_train]), depth, my_min)
        fin = time.time()
        times[i] = fin-inicio
        accuracy[i] = dtree.evaluar(dataset[samples_test[i]], model)

        
    return [np.mean(accuracy)*100, np.mean(times)]

In [4]:
"""
Resultados de mi versión de GPU
"""
results = []
for depth in range(4, 10):
    x = validacion_cruzada(spambase, depth=depth)
    y = validacion_cruzada(magic, depth=depth)
    x.extend(y)
    results.append(x)
    
# Usamos pandas para visualizar mejor la tabla en el notebook
pd.DataFrame(results, columns=["SPAMBASE PRECISIÓN", "SPAMBASE TIEMPO", "MAGIC PRECISIÓN", "MAGIC TIEMPO"])

Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO,MAGIC PRECISIÓN,MAGIC TIEMPO
0,88.130438,2.581919,79.374337,0.360482
1,85.695648,2.916862,80.993688,0.645304
2,86.19566,4.444791,81.771821,1.173429
3,89.717388,7.318616,83.869612,2.087715
4,90.15218,10.806254,84.095687,3.516115
5,85.804349,12.151377,84.274453,5.772794


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
import numpy as np
import time
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

def pyspark_cross_validation(dataset, k=10, depth=5, my_min=1):
    accuracy = np.zeros(k, np.float32)
    times = np.zeros(k, np.float32)
    rdd1 = sc.parallelize(dataset)
    rdd1 = rdd1.map(lambda x: [float(i) for i in x])
    rdd1 = rdd1.map(lambda x: LabeledPoint(x[-1], x[:-2]))
    for i in range(k):
        (trainingData, testData) = rdd1.randomSplit([0.9, 0.1], seed = i)
        inicio = time.time()
        model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=depth, maxBins=32)
        fin = time.time()
        times[i] = fin-inicio
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        accuracy[i] = labelsAndPredictions.filter(lambda lp: lp[0] == lp[1]).count() / float(testData.count())
    rdd1.unpersist()
    return [np.mean(accuracy)*100, np.mean(times)]



In [7]:
results2 = []
x = pyspark_cross_validation(spambase, depth=4)
results2.append(x)


In [8]:
x = pyspark_cross_validation(spambase, depth=5)
results2.append(x)

In [9]:
x = pyspark_cross_validation(spambase, depth=6)
results2.append(x)

In [10]:
x = pyspark_cross_validation(spambase, depth=7)
results2.append(x)

In [11]:
x = pyspark_cross_validation(spambase, depth=8)
results2.append(x)

In [12]:
x = pyspark_cross_validation(spambase, depth=9)
results2.append(x)

In [13]:
x = pyspark_cross_validation(spambase, depth=10)
results2.append(x)

In [14]:
results3 = []
x = pyspark_cross_validation(magic, depth=4)
results3.append(x)


In [15]:
x = pyspark_cross_validation(magic, depth=5)
results3.append(x)

In [16]:
x = pyspark_cross_validation(magic, depth=6)
results3.append(x)

In [17]:
x = pyspark_cross_validation(magic, depth=7)
results3.append(x)

In [18]:
x = pyspark_cross_validation(magic, depth=8)
results3.append(x)

In [19]:
x = pyspark_cross_validation(magic, depth=9)
results3.append(x)

In [20]:
x = pyspark_cross_validation(magic, depth=10)
results3.append(x)

In [22]:
pd.DataFrame(results2, columns=["SPAMBASE PRECISIÓN", "SPAMBASE TIEMPO"])

Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO
0,88.635957,6.468442
1,90.775728,6.478657
2,91.139287,6.603461
3,91.904658,6.750813
4,91.944802,6.843715
5,92.289102,6.896719
6,92.249364,7.068467


In [23]:
pd.DataFrame(results3, columns=["MAGIC PRECISIÓN", "MAGIC TIEMPO"])

Unnamed: 0,MAGIC PRECISIÓN,MAGIC TIEMPO
0,81.414253,6.415977
1,81.706637,6.482641
2,83.592957,6.582308
3,84.228885,6.712497
4,84.547967,6.814917
5,84.73047,7.048651
6,84.621972,7.086173
