In [4]:
import findspark
from pyspark.sql import SparkSession
from functions.required_functions import readFile, normalize, transform, get_block_data, train, accuracy
import numpy as np
import os


In [5]:
findspark.init()

# Number of CPUs this process is actually allowed to run on.
# os.cpu_count() reports every logical CPU of the host (it ignores container
# restrictions) and may return None, so prefer the scheduler affinity mask
# where the platform provides it: it reflects cpuset pinning such as
# `docker run --cpuset-cpus`. CFS quotas (`--cpus`) are visible through
# neither call.
try:
    num_cores = len(os.sched_getaffinity(0))
except AttributeError:  # sched_getaffinity does not exist on every platform
    num_cores = os.cpu_count() or 1  # never build an invalid "local[None]" master

print(f"CPUs disponibles en el contenedor: {num_cores}")

# Local-mode session with one worker thread per usable CPU.
# NOTE: getOrCreate() returns the already-running session when one exists in
# this kernel; in that case the master/appName given here are not re-applied.
spark = (
    SparkSession.builder
    .master(f"local[{num_cores}]")
    .appName("Exercise_2")
    .getOrCreate()
)

# Low-level entry point used by the RDD-based helper functions.
sc = spark.sparkContext

CPUs disponibles en el contenedor: 12


26/02/21 12:34:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### New functions

In [6]:
# --- Experiment configuration -------------------------------------------------
path = "data/botnet_reduced_10k_l.csv"  # botnet_tot_syn_l.csv // botnet_reduced_10k_l.csv
nIter = 20            # number of training iterations handed to train()
learning_rate = 1.5   # step size handed to train()
leaning_rate = learning_rate  # backward-compatible alias for the original misspelled name
lambda_reg = 0        # regularization strength handed to train() (0 disables it)

# Read data (cached because it is counted here and then consumed by normalize()).
data = readFile(path, sc).cache()
print(f"Total samples: {data.count()}")

# Standardize. The result is cached because several independent actions below
# (mean reduce, std reduce, transform) would otherwise each recompute it.
data = normalize(data).cache()
print("Data normalized.")

# Sanity check of the normalization: per-feature mean and sample standard
# deviation (n - 1 denominator), each computed with one pass over the data.
# Assumes x[0] of every record is a numeric feature vector supporting
# element-wise arithmetic (e.g. a NumPy array) -- TODO confirm against readFile().
sums, n = data.map(lambda x: (x[0], 1)).reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
medias = sums / n
print(f"Feature means after normalization: {np.array2string(medias, precision=4, suppress_small=True)}")
desvs = (data.map(lambda x: (x[0]-medias)**2).reduce(lambda x, y: x + y) / (n-1))**0.5
print(f"Feature stds after normalization: {np.array2string(desvs, precision=4, suppress_small=True)}")

# Shuffle rows and transform data, specifying the number of blocks
num_blocks_cv = 5
data_cv = transform(data, num_blocks_cv).cache()
print(f"Data transformed for {num_blocks_cv}-fold CV.")

# k-fold cross-validation: block i is requested as the held-out split from
# get_block_data(); a model is trained on the returned training split and
# scored on the returned test split.
avg_acc = []  # one accuracy value per fold
for i in range(num_blocks_cv):
    train_data, test_data = get_block_data(data_cv, i)
    w_b = train(train_data, nIter, learning_rate, lambda_reg, show_logs=False)
    # The first two entries of train()'s result are passed to accuracy();
    # presumably the weights and the bias -- verify against train().
    acc = accuracy(w_b[0], w_b[1], test_data)
    avg_acc.append(acc)
    print(f"  Block {i+1}, Accuracy: {acc:.4f}")

print(f"Average CV Accuracy: {np.mean(avg_acc):.4f}±{np.std(avg_acc):.4f}")

Total samples: 10000
Data normalized.
Feature means after normalization: [ 0.  0. -0.  0. -0. -0.  0. -0. -0.  0. -0.]
Feature stds after normalization: [1.0001 1.0001 1.0001 1.0001 1.0001 1.0001 1.0001 1.0001 1.0001 1.0001
 1.0001]
Data transformed for 5-fold CV.
  Block 1, Accuracy: 0.9312
  Block 2, Accuracy: 0.9315
  Block 3, Accuracy: 0.9225
  Block 4, Accuracy: 0.9444
  Block 5, Accuracy: 0.9317
Average CV Accuracy: 0.9323±0.0070
