In [1]:
import numpy as np
import findspark
from functions.required_functions import (
    readFile,
    normalize,
    transform,
    get_block_data,
    train,
    accuracy,
)
from pyspark import SparkContext
import time
import json
import matplotlib.pyplot as plt
# Initialise findspark for this kernel.
# NOTE(review): pyspark is already imported a few lines above, so this call
# runs after that import succeeded. findspark is normally initialised *before*
# the first pyspark import — confirm whether this ordering is intentional.
findspark.init()

In [2]:

# ==============================
# EXPERIMENT ORCHESTRATOR
# ==============================
def run_experiment(
    partitions=None,
    nodes=None,
    file_path="data/botnet_reduced_10k_l.csv",
    nIter=20,
    learning_rate=1.5,
    lambda_reg=0,
):
    """Run a timing sweep over partition counts or worker counts and report it.

    Exactly one sweep is executed per call:

    * if ``partitions`` is not None, one experiment is run per partition count;
    * otherwise, if ``nodes`` is not None, one experiment is run per worker count;
    * if both are None, nothing is run and an empty list is returned.

    When both arguments are supplied, only the ``partitions`` sweep runs.

    After the sweep, the results are written to ``results/experiment4.json``
    (the same path for both sweep kinds, so a later sweep overwrites an earlier
    one) and two figures are saved under ``results/``.

    Args:
        partitions: Iterable of partition counts to sweep, or None.
        nodes: Iterable of worker counts to sweep, or None.
        file_path: Dataset path forwarded to ``_configurate_experiment``.
        nIter: Number of training iterations, forwarded unchanged.
        learning_rate: Learning rate, forwarded unchanged.
        lambda_reg: Regularisation strength, forwarded unchanged.

    Returns:
        A list with one dict per swept value: the dict returned by
        ``_configurate_experiment`` plus a ``"partitions"`` or ``"nodes"`` entry
        holding the swept value.
    """
    import os  # local import: only needed to make sure the output folder exists

    # Everything that differs between the two sweep kinds lives in this table.
    if partitions is not None:
        sweep = {
            "values": partitions,
            "result_key": "partitions",
            "config_kwarg": "n_partitions",
            "title": "Execution Time vs Partitions",
            "xlabel": "Number of partitions",
            "line_plot": "results/execution_time_vs_partitions.png",
            "stacked_plot": "results/stacked_times_vs_partitions.png",
        }
    elif nodes is not None:
        sweep = {
            "values": nodes,
            "result_key": "nodes",
            "config_kwarg": "n_nodes",
            "title": "Execution Time vs Workers",
            "xlabel": "Number of workers",
            "line_plot": "results/execution_time_vs_workers.png",
            "stacked_plot": "results/stacked_times_vs_workers.png",
        }
    else:
        # Nothing to sweep: no experiments, no files written.
        return []

    # Create the output folder *before* the (slow) sweep so that a missing or
    # unwritable folder cannot throw away minutes of computation at save time.
    os.makedirs("results", exist_ok=True)

    results = []
    for value in sweep["values"]:
        result = _configurate_experiment(
            file_path,
            nIter,
            learning_rate,
            lambda_reg,
            **{sweep["config_kwarg"]: value},
        )
        result[sweep["result_key"]] = value
        results.append(result)

    # Persist the raw timings as JSON.
    with open("results/experiment4.json", "w") as f:
        json.dump(results, f, indent=4)

    _plot_results(
        results,
        x_key=sweep["result_key"],
        title=sweep["title"],
        xlabel=sweep["xlabel"],
        filename=sweep["line_plot"],
    )
    _plot_stacked_times(
        results,
        x_key=sweep["result_key"],
        filename=sweep["stacked_plot"],
    )
    return results


# ==============================
# SPARK CONTEXT LIFECYCLE
# ==============================
def _configurate_experiment(
    file_path,
    nIter,
    learning_rate,
    lambda_reg,
    n_partitions=None,
    n_nodes="*",
):
    """Run one training experiment inside its own short-lived SparkContext.

    A new ``SparkContext`` is created with master ``local[<n_nodes>]`` and
    application name ``Experiment_4``, handed to ``_run_training``, and stopped
    afterwards whether or not training raised.

    Args:
        file_path: Dataset path forwarded to ``_run_training``.
        nIter: Number of training iterations, forwarded unchanged.
        learning_rate: Learning rate, forwarded unchanged.
        lambda_reg: Regularisation strength, forwarded unchanged.
        n_partitions: Partition count forwarded to ``_run_training``; None is
            passed through as-is.
        n_nodes: Value placed between the brackets of the ``local[...]`` master
            URL (default ``"*"``).

    Returns:
        Whatever ``_run_training`` returns.
    """
    master_url = "local[{}]".format(n_nodes)
    spark_ctx = SparkContext(master=master_url, appName="Experiment_4")
    try:
        # Returning from inside the try still executes the finally clause, so
        # the context is always stopped before control goes back to the caller.
        return _run_training(
            file_path,
            nIter,
            learning_rate,
            lambda_reg,
            spark_ctx,
            n_partitions,
        )
    finally:
        spark_ctx.stop()
# ==============================
# TRAINING PIPELINE
# ==============================


def _run_training(
    file_path,
    nIter,
    learning_rate,
    lambda_reg,
    sc,
    n_partitions,
):
    """Run read -> (repartition) -> normalize -> train -> accuracy, timing each stage.

    Durations are measured with ``time.perf_counter()``, a monotonic clock, so
    they cannot be distorted by system clock adjustments during the run.

    Args:
        file_path: Path passed to ``readFile``.
        nIter: Number of iterations passed to ``train``.
        learning_rate: Learning rate passed to ``train``.
        lambda_reg: Regularisation strength passed to ``train``.
        sc: Context object passed to ``readFile`` (callers in this file pass a
            SparkContext).
        n_partitions: If not None, the data is repartitioned to this many
            partitions and that step is timed; if None the step is skipped and
            its time is reported as 0.

    Returns:
        dict with the keys ``read_time``, ``repartition_time``,
        ``normalize_time``, ``train_time``, ``acc_calc_time`` and
        ``total_time`` (the sum of the five stage times), all in seconds.
    """
    clock = time.perf_counter

    # READ
    # NOTE(review): if readFile only builds a lazy dataset, this interval does
    # not include the actual file read, which would then be paid by the first
    # count() below — confirm against readFile's implementation.
    start = clock()
    data = readFile(file_path, sc)
    read_time = clock() - start

    # REPARTITION (optional)
    repartition_time = 0
    if n_partitions is not None:
        start = clock()
        data = data.repartition(n_partitions).cache()
        # count() makes the repartition actually execute inside the timed
        # window instead of being deferred to a later stage.
        data.count()
        repartition_time = clock() - start

    # NORMALIZE
    start = clock()
    data = normalize(data).cache()
    # Same trick: materialise the normalised data now so the cost is
    # attributed to this stage.
    data.count()
    normalize_time = clock() - start

    # TRAIN
    start = clock()
    w, b = train(
        data,
        nIter,
        learning_rate,
        lambda_reg,
    )
    train_time = clock() - start

    # ACCURACY
    start = clock()
    acc = accuracy(w, b, data)
    acc_calc_time = clock() - start
    print(f"Accuracy: {acc:.4f}")

    total_time = (
        read_time + repartition_time + normalize_time + train_time + acc_calc_time
    )
    print(f"Tiempo total experimento: {total_time:.4f}s")

    return {
        "read_time": read_time,
        "repartition_time": repartition_time,
        "normalize_time": normalize_time,
        "train_time": train_time,
        "acc_calc_time": acc_calc_time,
        "total_time": total_time,
    }

# ==============================
# PLOTTING
# ==============================
def _plot_results(results, x_key, title, xlabel, filename):
    """Save a line chart of total execution time against the swept parameter.

    Args:
        results: Sequence of result dicts; each must contain ``x_key`` and
            ``"total_time"``.
        x_key: Key of the swept parameter, used for the x values.
        title: Figure title.
        xlabel: Label for the x axis.
        filename: Path the figure is saved to.
    """
    swept_values = [run[x_key] for run in results]
    total_times = [run["total_time"] for run in results]

    fig, ax = plt.subplots()
    ax.plot(swept_values, total_times, marker="o")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Execution Time (s)")
    ax.set_title(title)
    fig.savefig(filename)
    # Close explicitly so figures do not pile up across repeated sweeps.
    plt.close(fig)


def _plot_stacked_times(results, x_key, filename):
    """Save a stacked bar chart of per-stage times, one bar per swept value.

    Args:
        results: Sequence of result dicts; each must contain ``x_key`` and the
            five stage-time keys listed below.
        x_key: Key of the swept parameter; used for bar positions and as the
            x-axis label.
        filename: Path the figure is saved to.
    """
    # Stage -> bar colour, listed in stacking order (bottom segment first).
    stage_colors = {
        "read_time": "#FF9999",
        "repartition_time": "#66B2FF",
        "normalize_time": "#99FF99",
        "train_time": "#FFCC66",
        "acc_calc_time": "#C266FF",
    }

    bar_positions = [run[x_key] for run in results]
    # Running height of each bar; the next segment is drawn on top of it.
    stack_base = [0] * len(results)

    fig, ax = plt.subplots(figsize=(10, 6))
    for stage, color in stage_colors.items():
        segment = [run[stage] for run in results]
        ax.bar(bar_positions, segment, bottom=stack_base, color=color, label=stage)
        stack_base = [base + height for base, height in zip(stack_base, segment)]

    ax.set_xlabel(x_key)
    ax.set_ylabel("Time (s)")
    ax.set_title("Time per step (stacked)")
    ax.legend()
    fig.savefig(filename)
    plt.close(fig)

In [3]:
# Sweep the partition count from 1 to 12, leaving every other argument at its default.
run_experiment(partitions=list(range(1, 13)))


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 13:27:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Iteration 0/20, Cost: 0.4864, Accuracy: 0.7965
Iteration 2/20, Cost: 0.2997, Accuracy: 0.9125
Iteration 4/20, Cost: 0.2561, Accuracy: 0.9216
Iteration 6/20, Cost: 0.2347, Accuracy: 0.9267
Iteration 8/20, Cost: 0.2215, Accuracy: 0.9287
Iteration 10/20, Cost: 0.2123, Accuracy: 0.9302
Iteration 12/20, Cost: 0.2056, Accuracy: 0.9308
Iteration 14/20, Cost: 0.2004, Accuracy: 0.9314
Iteration 16/20, Cost: 0.1963, Accuracy: 0.9316
Iteration 18/20, Cost: 0.1929, Accuracy: 0.9319
Accuracy: 0.9322
Tiempo total experimento: 15.4145s


                                                                                

Iteration 0/20, Cost: 0.4685, Accuracy: 0.7883
Iteration 2/20, Cost: 0.3026, Accuracy: 0.9038
Iteration 4/20, Cost: 0.2563, Accuracy: 0.9194
Iteration 6/20, Cost: 0.2340, Accuracy: 0.9246
Iteration 8/20, Cost: 0.2206, Accuracy: 0.9271
Iteration 10/20, Cost: 0.2115, Accuracy: 0.9297
Iteration 12/20, Cost: 0.2048, Accuracy: 0.9306
Iteration 14/20, Cost: 0.1997, Accuracy: 0.9307
Iteration 16/20, Cost: 0.1956, Accuracy: 0.9317
Iteration 18/20, Cost: 0.1924, Accuracy: 0.9326
Accuracy: 0.9325
Tiempo total experimento: 11.5992s
Iteration 0/20, Cost: 0.5701, Accuracy: 0.7380
Iteration 2/20, Cost: 0.3285, Accuracy: 0.8864
Iteration 4/20, Cost: 0.2664, Accuracy: 0.9134
Iteration 6/20, Cost: 0.2390, Accuracy: 0.9211
Iteration 8/20, Cost: 0.2233, Accuracy: 0.9263
Iteration 10/20, Cost: 0.2130, Accuracy: 0.9278
Iteration 12/20, Cost: 0.2057, Accuracy: 0.9304
Iteration 14/20, Cost: 0.2003, Accuracy: 0.9315
Iteration 16/20, Cost: 0.1960, Accuracy: 0.9322
Iteration 18/20, Cost: 0.1926, Accuracy: 0.932

                                                                                

Iteration 0/20, Cost: 0.7252, Accuracy: 0.6376
Iteration 2/20, Cost: 0.3354, Accuracy: 0.8697
Iteration 4/20, Cost: 0.2601, Accuracy: 0.9149
Iteration 6/20, Cost: 0.2322, Accuracy: 0.9243
Iteration 8/20, Cost: 0.2176, Accuracy: 0.9278
Iteration 10/20, Cost: 0.2084, Accuracy: 0.9301
Iteration 12/20, Cost: 0.2019, Accuracy: 0.9317
Iteration 14/20, Cost: 0.1971, Accuracy: 0.9330
Iteration 16/20, Cost: 0.1934, Accuracy: 0.9334
Iteration 18/20, Cost: 0.1904, Accuracy: 0.9327
Accuracy: 0.9330
Tiempo total experimento: 11.5055s
Iteration 0/20, Cost: 0.8198, Accuracy: 0.5859
Iteration 2/20, Cost: 0.3724, Accuracy: 0.8501
Iteration 4/20, Cost: 0.2798, Accuracy: 0.9030
Iteration 6/20, Cost: 0.2447, Accuracy: 0.9192
Iteration 8/20, Cost: 0.2262, Accuracy: 0.9239
Iteration 10/20, Cost: 0.2147, Accuracy: 0.9278
Iteration 12/20, Cost: 0.2067, Accuracy: 0.9305
Iteration 14/20, Cost: 0.2009, Accuracy: 0.9316
Iteration 16/20, Cost: 0.1964, Accuracy: 0.9322
Iteration 18/20, Cost: 0.1929, Accuracy: 0.932

                                                                                

Iteration 0/20, Cost: 0.5840, Accuracy: 0.6927
Iteration 2/20, Cost: 0.3287, Accuracy: 0.8775
Iteration 4/20, Cost: 0.2640, Accuracy: 0.9125
Iteration 6/20, Cost: 0.2364, Accuracy: 0.9217
Iteration 8/20, Cost: 0.2211, Accuracy: 0.9256
Iteration 10/20, Cost: 0.2113, Accuracy: 0.9290
Iteration 12/20, Cost: 0.2043, Accuracy: 0.9308
Iteration 14/20, Cost: 0.1991, Accuracy: 0.9324
Iteration 16/20, Cost: 0.1951, Accuracy: 0.9329
Iteration 18/20, Cost: 0.1918, Accuracy: 0.9328
Accuracy: 0.9328
Tiempo total experimento: 11.9351s
Iteration 0/20, Cost: 0.7103, Accuracy: 0.6399
Iteration 2/20, Cost: 0.3443, Accuracy: 0.8585
Iteration 4/20, Cost: 0.2643, Accuracy: 0.9092
Iteration 6/20, Cost: 0.2341, Accuracy: 0.9223
Iteration 8/20, Cost: 0.2183, Accuracy: 0.9279
Iteration 10/20, Cost: 0.2085, Accuracy: 0.9300
Iteration 12/20, Cost: 0.2018, Accuracy: 0.9325
Iteration 14/20, Cost: 0.1969, Accuracy: 0.9331
Iteration 16/20, Cost: 0.1931, Accuracy: 0.9330
Iteration 18/20, Cost: 0.1901, Accuracy: 0.933

                                                                                

Iteration 0/20, Cost: 0.7913, Accuracy: 0.6111
Iteration 2/20, Cost: 0.3789, Accuracy: 0.8433
Iteration 4/20, Cost: 0.2837, Accuracy: 0.9008
Iteration 6/20, Cost: 0.2474, Accuracy: 0.9178
Iteration 8/20, Cost: 0.2282, Accuracy: 0.9247
Iteration 10/20, Cost: 0.2163, Accuracy: 0.9275
Iteration 12/20, Cost: 0.2081, Accuracy: 0.9294
Iteration 14/20, Cost: 0.2021, Accuracy: 0.9309
Iteration 16/20, Cost: 0.1974, Accuracy: 0.9313
Iteration 18/20, Cost: 0.1938, Accuracy: 0.9315
Accuracy: 0.9314
Tiempo total experimento: 14.1785s


                                                                                

Iteration 0/20, Cost: 0.8396, Accuracy: 0.4936
Iteration 2/20, Cost: 0.3815, Accuracy: 0.8663
Iteration 4/20, Cost: 0.2889, Accuracy: 0.9043
Iteration 6/20, Cost: 0.2520, Accuracy: 0.9180
Iteration 8/20, Cost: 0.2319, Accuracy: 0.9235
Iteration 10/20, Cost: 0.2192, Accuracy: 0.9271
Iteration 12/20, Cost: 0.2104, Accuracy: 0.9289
Iteration 14/20, Cost: 0.2039, Accuracy: 0.9303
Iteration 16/20, Cost: 0.1990, Accuracy: 0.9308
Iteration 18/20, Cost: 0.1950, Accuracy: 0.9316
Accuracy: 0.9319
Tiempo total experimento: 13.8476s


                                                                                

Iteration 0/20, Cost: 0.6090, Accuracy: 0.6809
Iteration 2/20, Cost: 0.3172, Accuracy: 0.8888
Iteration 4/20, Cost: 0.2547, Accuracy: 0.9184
Iteration 6/20, Cost: 0.2294, Accuracy: 0.9265
Iteration 8/20, Cost: 0.2155, Accuracy: 0.9291
Iteration 10/20, Cost: 0.2067, Accuracy: 0.9308
Iteration 12/20, Cost: 0.2005, Accuracy: 0.9311
Iteration 14/20, Cost: 0.1959, Accuracy: 0.9319
Iteration 16/20, Cost: 0.1924, Accuracy: 0.9329
Iteration 18/20, Cost: 0.1895, Accuracy: 0.9331
Accuracy: 0.9332
Tiempo total experimento: 14.8233s


                                                                                

Iteration 0/20, Cost: 0.5920, Accuracy: 0.7017
Iteration 2/20, Cost: 0.3326, Accuracy: 0.8848
Iteration 4/20, Cost: 0.2677, Accuracy: 0.9128
Iteration 6/20, Cost: 0.2392, Accuracy: 0.9220
Iteration 8/20, Cost: 0.2231, Accuracy: 0.9271
Iteration 10/20, Cost: 0.2127, Accuracy: 0.9293
Iteration 12/20, Cost: 0.2055, Accuracy: 0.9309
Iteration 14/20, Cost: 0.2000, Accuracy: 0.9315
Iteration 16/20, Cost: 0.1958, Accuracy: 0.9322
Iteration 18/20, Cost: 0.1925, Accuracy: 0.9326
Accuracy: 0.9329
Tiempo total experimento: 15.0317s
Iteration 0/20, Cost: 0.6753, Accuracy: 0.6566
Iteration 2/20, Cost: 0.3282, Accuracy: 0.8877
Iteration 4/20, Cost: 0.2643, Accuracy: 0.9131
Iteration 6/20, Cost: 0.2381, Accuracy: 0.9221
Iteration 8/20, Cost: 0.2232, Accuracy: 0.9270
Iteration 10/20, Cost: 0.2133, Accuracy: 0.9293
Iteration 12/20, Cost: 0.2061, Accuracy: 0.9304
Iteration 14/20, Cost: 0.2007, Accuracy: 0.9311
Iteration 16/20, Cost: 0.1965, Accuracy: 0.9321
Iteration 18/20, Cost: 0.1931, Accuracy: 0.932

[{'read_time': 0.4146766662597656,
  'repartition_time': 1.659278154373169,
  'normalize_time': 0.6150708198547363,
  'train_time': 12.542762041091919,
  'acc_calc_time': 0.18268275260925293,
  'total_time': 15.414470434188843,
  'partitions': 1},
 {'read_time': 0.014286279678344727,
  'repartition_time': 0.8238177299499512,
  'normalize_time': 0.6419510841369629,
  'train_time': 9.961234092712402,
  'acc_calc_time': 0.1579580307006836,
  'total_time': 11.599247217178345,
  'partitions': 2},
 {'read_time': 0.012323617935180664,
  'repartition_time': 0.7555913925170898,
  'normalize_time': 0.5521674156188965,
  'train_time': 9.284482717514038,
  'acc_calc_time': 0.15058255195617676,
  'total_time': 10.755147695541382,
  'partitions': 3},
 {'read_time': 0.017593860626220703,
  'repartition_time': 0.9052779674530029,
  'normalize_time': 0.5424878597259521,
  'train_time': 9.900960206985474,
  'acc_calc_time': 0.1391911506652832,
  'total_time': 11.505511045455933,
  'partitions': 4},
 {'r