In [1]:
import decisiontree_gpu as dtree
import numpy as np
import pandas as pd
import time

from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session named "DTREE". Its SparkContext is what
# the notebook later uses to read a CSV file as an RDD.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DTREE")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Load both benchmark datasets from comma-separated files as float32 arrays.
spambase = np.genfromtxt(
    '../datasets/spambase.data',
    delimiter=',',
    dtype=np.float32,
)
magic = np.genfromtxt(
    '../datasets/magic04.data',
    delimiter=',',
    dtype=np.float32,
)

In [4]:
# Run the `_np` variant (labelled CPU elsewhere in this notebook) on both
# datasets for tree depths 4 through 9. Each row collects the values returned
# for SPAMBASE followed by the values returned for MAGIC.
results = []
for depth in range(4, 10):
    row = dtree.validacion_cruzada_np(spambase, depth=depth)
    row.extend(dtree.validacion_cruzada_np(magic, depth=depth))
    results.append(row)

# Use pandas so the table renders nicely in the notebook.
pd.DataFrame(
    results,
    columns=["SPAMBASE PRECISIÓN", "SPAMBASE TIEMPO", "MAGIC PRECISIÓN", "MAGIC TIEMPO"],
)


  criteria = t_i * (n_i - t_i) / n_i + t_d * (n_d - t_d) / n_d


Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO,MAGIC PRECISIÓN,MAGIC TIEMPO
0,0.880435,0.064959,0.788644,0.04624
1,0.894565,0.079373,0.815089,0.05645
2,0.903696,0.092384,0.828128,0.067961
3,0.914565,0.113011,0.83959,0.079173
4,0.920435,0.127416,0.847213,0.093886
5,0.921304,0.148836,0.84816,0.113403


In [6]:
"""
Resultados de mi versión de GPU
"""
# Same sweep over depths 4 through 9, this time with the GPU variant
# (`validacion_cruzada`). Each row is the SPAMBASE values followed by the
# MAGIC values.
results = []
for depth in range(4, 10):
    row = dtree.validacion_cruzada(spambase, depth=depth)
    row.extend(dtree.validacion_cruzada(magic, depth=depth))
    results.append(row)

# Use pandas so the table renders nicely in the notebook.
pd.DataFrame(
    results,
    columns=["SPAMBASE PRECISIÓN", "SPAMBASE TIEMPO", "MAGIC PRECISIÓN", "MAGIC TIEMPO"],
)

Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO,MAGIC PRECISIÓN,MAGIC TIEMPO
0,0.881304,0.416279,0.78796,0.089382
1,0.898044,0.638682,0.818665,0.145132
2,0.901739,0.905825,0.825237,0.235014
3,0.908696,1.366544,0.838276,0.402767
4,0.914783,1.854589,0.836909,0.657899
5,0.915,2.466846,0.845794,1.081185


In [7]:
"""
¿Y si aumentamos el número de muestras del dataset?
Repetimos muestras para aumentarlo.
"""
# Use a dedicated, seeded generator so the enlarged datasets (and therefore the
# timing/accuracy tables computed from them) are identical on every run.
# The seed 7 matches the one used for the Spark sample of SUSY in this notebook.
rng = np.random.RandomState(7)

# Draw 100000 row indices with replacement from each dataset, in the same order
# as before (magic first, then spambase).
# NOTE: this overwrites `magic` and `spambase`; re-run the loading cell before
# re-running this one, otherwise the already-resampled arrays get resampled again.
# NOTE(review): rows are duplicated here, so the same sample can presumably end up
# in both the training and the validation part of a split, which would inflate the
# accuracy columns -- confirm against decisiontree_gpu before comparing accuracy.
magic = magic[rng.randint(magic.shape[0], size=100000), :]
spambase = spambase[rng.randint(spambase.shape[0], size=100000), :]

In [8]:
# Repeat the depth sweep (4 through 9) with the `_np` variant (labelled CPU
# elsewhere in this notebook) on the current `spambase` and `magic` arrays.
# Each row is the SPAMBASE values followed by the MAGIC values.
results = []
for depth in range(4, 10):
    row = dtree.validacion_cruzada_np(spambase, depth=depth)
    row.extend(dtree.validacion_cruzada_np(magic, depth=depth))
    results.append(row)

# Use pandas so the table renders nicely in the notebook.
pd.DataFrame(
    results,
    columns=["SPAMBASE PRECISIÓN", "SPAMBASE TIEMPO", "MAGIC PRECISIÓN", "MAGIC TIEMPO"],
)


Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO,MAGIC PRECISIÓN,MAGIC TIEMPO
0,0.89015,1.718277,0.80192,0.327498
1,0.90405,2.191095,0.82661,0.383049
2,0.92092,2.656318,0.83903,0.445706
3,0.93208,3.231841,0.85473,0.520374
4,0.94143,3.647424,0.86485,0.594443
5,0.94952,4.021694,0.87713,0.65842


In [9]:
"""
Resultados de mi versión de GPU
"""
# Repeat the depth sweep (4 through 9) with the GPU variant
# (`validacion_cruzada`) on the current `spambase` and `magic` arrays.
# Each row is the SPAMBASE values followed by the MAGIC values.
results = []
for depth in range(4, 10):
    row = dtree.validacion_cruzada(spambase, depth=depth)
    row.extend(dtree.validacion_cruzada(magic, depth=depth))
    results.append(row)

# Use pandas so the table renders nicely in the notebook.
pd.DataFrame(
    results,
    columns=["SPAMBASE PRECISIÓN", "SPAMBASE TIEMPO", "MAGIC PRECISIÓN", "MAGIC TIEMPO"],
)

Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO,MAGIC PRECISIÓN,MAGIC TIEMPO
0,0.88959,0.630719,0.79728,0.111201
1,0.90462,0.874796,0.82683,0.166451
2,0.92098,1.259747,0.83736,0.277753
3,0.93157,1.820157,0.85394,0.451911
4,0.94173,2.577246,0.8641,0.746278
5,0.94916,3.538521,0.87794,1.288471


In [3]:
# 1. Read the SUSY CSV through the SparkContext; each RDD element is one line of text.
rdd = sc.textFile('../datasets/SUSY.csv')
def append_ret(a, b):
    """Add ``b`` as the last element of the list ``a`` and return ``a``.

    ``list.append`` returns ``None``, which makes it unusable inside a lambda
    that must yield the list; this helper returns the list instead.
    The list is modified in place and the very same object is returned.
    """
    # In-place extension with a one-element list: for a list this is the same
    # as a.append(b) and keeps the identity of `a`.
    a += [b]
    return a
# 2. Split each line into its comma-separated fields and move the label from the
#    first position to the last one, then keep a seeded sample of the rows
#    (with replacement, fraction 0.1).
rdd = (
    rdd
    .map(lambda line: line.split(','))
    .map(lambda fields: fields[1:] + fields[:1])
    .sample(withReplacement=True, fraction=0.1, seed=7)
)
# Bring the sampled rows to the driver and convert them to a float32 array.
susy = np.array(rdd.collect(), dtype=np.float32)


In [14]:
# Compare both implementations on SUSY for tree depths 4 through 9.
# Each row is the GPU values (`validacion_cruzada`) followed by the CPU values
# (`validacion_cruzada_np`), matching the column order of the table below.
results = []
for depth in range(4, 10):
    row = dtree.validacion_cruzada(susy, depth=depth)
    row.extend(dtree.validacion_cruzada_np(susy, depth=depth))
    results.append(row)

# Use pandas so the table renders nicely in the notebook.
pd.DataFrame(
    results,
    columns=["SUSY PRECISIÓN GPU", "SUSY TIEMPO GPU", "SUSY PRECISIÓN CPU", "SUSY TIEMPO CPU"],
)

Unnamed: 0,SUSY PRECISIÓN GPU,SUSY TIEMPO GPU,SUSY PRECISIÓN CPU,SUSY TIEMPO CPU
0,0.75508,0.551138,0.754656,3.607382
1,0.769218,0.701833,0.768581,4.502701
2,0.772805,0.919452,0.774129,5.270597
3,0.778611,1.244265,0.778885,6.078442
4,0.782477,1.816097,0.78156,6.803136
5,0.784257,2.878823,0.784697,7.479536


array([ 0.43781763, -1.1198827 , -1.3368225 ,  0.50231993, -1.7175149 ,
        1.017067  ,  0.21561898, -0.4612004 ,  0.3236707 ,  0.17362568,
        0.41189814,  0.37052476,  0.7982602 ,  0.6713691 ,  0.3859101 ,
        0.515522  ,  0.4791096 ,  0.0290579 ,  0.        ], dtype=float32)

In [11]:
# Rebuild `susy` as a float32 NumPy array (np.array copies its input by default).
# NOTE(review): the cell that loads SUSY already passes dtype=np.float32, so this
# looks redundant -- confirm whether this cell is still needed.
susy = np.array(susy, dtype=np.float32)

Unnamed: 0,SPAMBASE PRECISIÓN,SPAMBASE TIEMPO
0,88.635957,6.468442
1,90.775728,6.478657
2,91.139287,6.603461
3,91.904658,6.750813
4,91.944802,6.843715
5,92.289102,6.896719
6,92.249364,7.068467


In [23]:
# Show `results3` as a two-column table (MAGIC PRECISIÓN / MAGIC TIEMPO).
# NOTE(review): `results3` is not assigned in any cell visible in this notebook;
# it presumably comes from a deleted or out-of-view cell -- confirm, otherwise this
# cell fails on a fresh kernel (Restart & Run All).
pd.DataFrame(results3, columns=["MAGIC PRECISIÓN", "MAGIC TIEMPO"])

Unnamed: 0,MAGIC PRECISIÓN,MAGIC TIEMPO
0,81.414253,6.415977
1,81.706637,6.482641
2,83.592957,6.582308
3,84.228885,6.712497
4,84.547967,6.814917
5,84.73047,7.048651
6,84.621972,7.086173
