In [1]:
import decisiontree_gpu as dtree
import numpy as np
import pandas as pd
import time

from pyspark.sql import SparkSession
# Spark session running on the "local" master, registered under the app name "DTREE".
spark = (
    SparkSession.builder
    .master("local")
    .appName("DTREE")
    .getOrCreate()
)
# SparkContext of that session; used below to read text files into RDDs.
sc = spark.sparkContext

In [2]:
# Load the two benchmark datasets from comma-separated files as float32 arrays.
spambase = np.genfromtxt(
    '../datasets/spambase.data',
    delimiter=',',
    dtype=np.float32,
)
magic = np.genfromtxt(
    '../datasets/magic04.data',
    delimiter=',',
    dtype=np.float32,
)

In [3]:
# Cross-validate on spambase for tree depths 4..9 with both implementations.
# Each table row holds the values returned by validacion_cruzada_np followed by the
# values returned by validacion_cruzada, in the column order used below.
results = []
for depth in range(4, 10):
    cpu_result = dtree.validacion_cruzada_np(spambase, depth=depth, seed=2)
    gpu_result = dtree.validacion_cruzada(spambase, depth=depth, seed=2)
    results.append([*cpu_result, *gpu_result])

# Go through pandas so the notebook renders the results as a table.
pd.DataFrame(results, columns=["CPU PRECISIÓN", "CPU TIEMPO", "GPU PRECISIÓN", "GPU TIEMPO"])


  criteria = t_i * (n_i - t_i) / n_i + t_d * (n_d - t_d) / n_d


Unnamed: 0,CPU PRECISIÓN,CPU TIEMPO,GPU PRECISIÓN,GPU TIEMPO
0,0.876957,0.067564,0.876957,0.479601
1,0.896522,0.078672,0.896522,0.652294
2,0.910435,0.093386,0.910435,0.994098
3,0.918261,0.111401,0.918261,1.452723
4,0.92087,0.127914,0.921087,2.066082
5,0.923696,0.148335,0.92413,2.798648


In [4]:
"""
Resultados de mi versión de GPU
"""
# Cross-validate on the magic dataset for tree depths 4..9 with both implementations.
# Each table row holds the values returned by validacion_cruzada_np followed by the
# values returned by validacion_cruzada, in the column order used below.
results = []
for depth in range(4, 10):
    cpu_result = dtree.validacion_cruzada_np(magic, depth=depth, seed=2)
    gpu_result = dtree.validacion_cruzada(magic, depth=depth, seed=2)
    results.append([*cpu_result, *gpu_result])

# Go through pandas so the notebook renders the results as a table.
pd.DataFrame(results, columns=["CPU PRECISIÓN", "CPU TIEMPO", "GPU PRECISIÓN", "GPU TIEMPO"])

Unnamed: 0,CPU PRECISIÓN,CPU TIEMPO,GPU PRECISIÓN,GPU TIEMPO
0,0.790957,0.047542,0.790957,0.097189
1,0.819926,0.057753,0.819926,0.161047
2,0.829495,0.068663,0.829495,0.271348
3,0.840747,0.080473,0.840747,0.468826
4,0.844217,0.096888,0.844217,0.796025
5,0.849632,0.116907,0.849632,1.330511


In [5]:
# Number of partitions the input is spread over (one tree is later trained per partition).
ntrees = 50


def append_ret(a, b):
    """Append ``b`` to ``a`` in place and return ``a``, so the append can be used as an expression."""
    a.append(b)
    return a


# Read the CSV lines, spread them over ``ntrees`` partitions, split every line on
# commas and move the first field (the label) from the front to the end of the row.
rdd = (
    sc.textFile('../datasets/SUSY.csv')
    .repartition(ntrees)
    .map(lambda line: line.split(','))
    .map(lambda fields: append_ret(fields[1:], fields[0]))
)

In [6]:
def gpu_work(max_depth=6, min_samples_per_node=1):
    """Build the per-partition training function that is passed to ``mapPartitions``.

    Args:
        max_depth: forwarded to ``dtree.train_tree`` as its second argument.
        min_samples_per_node: forwarded to ``dtree.train_tree`` as its third argument.

    Returns:
        A function that takes an iterator over the rows of one partition and returns
        a list with the single tree trained on those rows, or an empty list when the
        partition contains no rows.
    """
    def _gpu_work(data):
        rows = list(data)
        if not rows:
            # An empty partition would become a shape-(0,) array with no feature or
            # label columns; contribute no tree instead of handing that to the trainer.
            return []
        # Rows may arrive as lists of numeric strings; convert the whole partition
        # into one float32 matrix.
        inp = np.asarray(rows, dtype=np.float32)
        return [dtree.train_tree(inp, max_depth, min_samples_per_node)]
    return _gpu_work

In [7]:
def predict(sample, trees):
    """Classify ``sample`` by majority vote over ``trees``.

    The outputs of ``dtree.predict(sample, tree)`` are added up over all trees; the
    result is 1 when that sum is strictly greater than half the number of trees and
    0 otherwise (so a tie, or an empty ``trees``, yields 0).
    NOTE(review): this presumes each tree's prediction is 0 or 1 - confirm in dtree.
    """
    votes = sum(dtree.predict(sample, tree) for tree in trees)
    if votes > len(trees) / 2:
        return 1
    return 0

def evaluar(dataset, trees):
    """Return the fraction of rows in ``dataset`` that the ensemble labels correctly.

    Each row is classified with ``predict(row, trees)`` and the result is compared,
    as an integer, with the row's last element, which is taken to be the true label.

    Args:
        dataset: sequence of rows (for example a 2-D NumPy array) whose last column
            is the label.
        trees: collection of trained trees passed through to ``predict``.

    Returns:
        Number of correctly classified rows divided by the number of rows.

    Raises:
        ValueError: if ``dataset`` has no rows, since accuracy is undefined then
            (previously this surfaced as a bare ZeroDivisionError).
    """
    total = len(dataset)
    if total == 0:
        raise ValueError("evaluar() needs at least one sample to compute an accuracy")

    aciertos = sum(
        1 for sample in dataset
        if int(predict(sample, trees)) == int(sample[-1])
    )
    return aciertos / total

In [8]:
def eval_model_gpu(rdd, max_depth, seed):
    """Train one tree per partition through ``gpu_work`` and print time and accuracy.

    The RDD is split 90 % / 10 % into training and test data. One tree is trained on
    every training partition via ``mapPartitions(gpu_work(...))``; only that job,
    including collecting the trees to the driver, is timed. The collected test rows
    are then scored with ``evaluar``.

    Args:
        rdd: Spark RDD of samples whose last element is the label.
        max_depth: maximum depth passed on to ``gpu_work``.
        seed: seed for ``randomSplit``.

    Prints the elapsed training time in seconds and the accuracy as a percentage;
    returns nothing.
    """
    training_data, test_data = rdd.randomSplit([0.9, 0.1], seed=seed)
    # Run a small action before starting the timer, intended to keep the one-off cost
    # of preparing the training split out of the measurement.
    # NOTE(review): take(1) is not guaranteed to materialize every partition and
    # nothing is cached, so part of the loading cost may still be timed - confirm.
    training_data.take(1)
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    inicio = time.perf_counter()
    arboles = training_data.mapPartitions(gpu_work(max_depth=max_depth)).collect()
    elapsed = time.perf_counter() - inicio
    precision = evaluar(np.float32(test_data.collect()), arboles) * 100
    print('Tiempo utilizado:', elapsed)
    print('Precisión', precision)


In [9]:
def cpu_work(max_depth=6, min_samples_per_node=1):
    """Build the per-partition training function that is passed to ``mapPartitions``.

    Args:
        max_depth: forwarded to ``dtree.train_tree_np`` as its second argument.
        min_samples_per_node: forwarded to ``dtree.train_tree_np`` as its third argument.

    Returns:
        A function that takes an iterator over the rows of one partition and returns
        a list with the single tree trained on those rows, or an empty list when the
        partition contains no rows.
    """
    def _cpu_work(data):
        rows = list(data)
        if not rows:
            # An empty partition would become a shape-(0,) array with no feature or
            # label columns; contribute no tree instead of handing that to the trainer.
            return []
        # Rows may arrive as lists of numeric strings; convert the whole partition
        # into one float32 matrix.
        inp = np.asarray(rows, dtype=np.float32)
        return [dtree.train_tree_np(inp, max_depth, min_samples_per_node)]
    return _cpu_work

def eval_model_cpu(rdd, max_depth, seed):
    """Train one tree per partition through ``cpu_work`` and print time and accuracy.

    The RDD is split 90 % / 10 % into training and test data. One tree is trained on
    every training partition via ``mapPartitions(cpu_work(...))``; only that job,
    including collecting the trees to the driver, is timed. The collected test rows
    are then scored with ``evaluar``.

    Args:
        rdd: Spark RDD of samples whose last element is the label.
        max_depth: maximum depth passed on to ``cpu_work``.
        seed: seed for ``randomSplit``.

    Prints the elapsed training time in seconds and the accuracy as a percentage;
    returns nothing.
    """
    training_data, test_data = rdd.randomSplit([0.9, 0.1], seed=seed)
    # Run a small action before starting the timer, intended to keep the one-off cost
    # of preparing the training split out of the measurement.
    # NOTE(review): take(1) is not guaranteed to materialize every partition and
    # nothing is cached, so part of the loading cost may still be timed - confirm.
    training_data.take(1)
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    inicio = time.perf_counter()
    arboles = training_data.mapPartitions(cpu_work(max_depth=max_depth)).collect()
    elapsed = time.perf_counter() - inicio
    precision = evaluar(np.float32(test_data.collect()), arboles) * 100
    print('Tiempo utilizado:', elapsed)
    print('Precisión', precision)


In [10]:
"""
Experimento 50 árboles de N/50 muestras = 100000 muestras por árbol
"""
# Depth-5 comparison: same RDD and split seed for the GPU run and the CPU run.
eval_model_gpu(rdd, max_depth=5, seed=2)
eval_model_cpu(rdd, max_depth=5, seed=2)

Tiempo utilizado: 126.0632312297821
Precisión 77.4232566149804
Tiempo utilizado: 40.51989436149597
Precisión 77.4232566149804


In [11]:
# Depth-6 comparison: same RDD and split seed for the GPU run and the CPU run.
eval_model_gpu(rdd, max_depth=6, seed=2)
eval_model_cpu(rdd, max_depth=6, seed=2)


KeyboardInterrupt: 

In [None]:
# Depth-7 comparison: same RDD and split seed for the GPU run and the CPU run.
eval_model_gpu(rdd, max_depth=7, seed=2)
eval_model_cpu(rdd, max_depth=7, seed=2)

In [None]:
# Depth-8 comparison: same RDD and split seed for the GPU run and the CPU run.
eval_model_gpu(rdd, max_depth=8, seed=2)
eval_model_cpu(rdd, max_depth=8, seed=2)

In [None]:
# Depth-9 comparison: same RDD and split seed for the GPU run and the CPU run.
eval_model_gpu(rdd, max_depth=9, seed=2)
eval_model_cpu(rdd, max_depth=9, seed=2)