In [1]:
import numpy as np
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NormalizacionDataset").getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/18 18:06:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
rdd = sc.textFile("data/botnet_reduced_10k_l.csv")  # The full file is: botnet_tot_syn_l.csv
TOTAL = rdd.count()
rdd.take(5)

                                                                                

['3.545301891684014663e+03,3.198013946952254628e+03,8.000015469454228878e+01,1.000001920508630349e+00,4.449600864643011093e+08,4.768073064086365775e+02,1.299999999293617492e+01,-7.710424743123667213e-09,8.700000148917338549e+01,1.909943015956687927e+07,2.468368394759351254e+09 ,0',
 '-1.213087614360119915e-06,7.999989079213264631e+00,6.549999887793754897e+04,9.999997452701503420e-01,6.199988768910407089e+01,6.999980788311222568e+01,1.199999999590421140e+01,-2.356881878551320142e-08,8.000004983404750192e+00,2.468369617006896019e+09,2.468368392737727642e+09 ,1',
 '3.599999088409939759e+03,5.494306011685771227e+04,5.300026021374378615e+01,1.000001920508630349e+00,2.629992418504625675e+02,8.299988759205825772e+01,1.299999999293617492e+01,5.000000032871429134e+00,2.129999991647522961e+02,1.590216587146015644e+09,9.830836248397827148e+01 ,0',
 '1.327001500454281313e+01,1.359530053116250201e+03,5.497696727996619302e+04,1.000001920508630349e+00,6.415608124383258820e+08,8.299988759205825772e+01

In [4]:
def readFile(filename:str):
    rdd = sc.textFile(filename)
    return (
        rdd.map(lambda x: x.split(","))
        .map(lambda x: tuple(map(float, x)))
        .map(lambda x: (x[:-1], int(x[-1])))
    )

RDD_Xy = readFile("data/botnet_reduced_10k_l.csv")
RDD_Xy.take(2)

[((3545.3018916840147,
   3198.0139469522546,
   80.00015469454229,
   1.0000019205086303,
   444960086.4643011,
   476.8073064086366,
   12.999999992936175,
   -7.710424743123667e-09,
   87.00000148917339,
   19099430.15956688,
   2468368394.7593513),
  0),
 ((-1.21308761436012e-06,
   7.999989079213265,
   65499.99887793755,
   0.9999997452701503,
   61.99988768910407,
   69.99980788311223,
   11.999999995904211,
   -2.35688187855132e-08,
   8.00000498340475,
   2468369617.006896,
   2468368392.7377276),
  1)]

In [5]:
def normalize(RDD_Xy):
    rdd_X = RDD_Xy.map(lambda x: x[0])
    _ = RDD_Xy.map(lambda x: x[1])

    mu = rdd_X.map(lambda x: np.array(x)).reduce(lambda x, y: x + y) / TOTAL
    std = (rdd_X.map(lambda x: (np.array(x) - mu) ** 2).reduce(lambda x, y: x + y) / TOTAL) ** 0.5

    return RDD_Xy.map(lambda x: ((np.array(x[0]) - mu) / std, x[1]))

normalize(RDD_Xy).take(2)

[(array([ 1.41281668, -0.75956071, -0.41204268, -0.45940748,  1.39580934,
         -0.35386218,  0.74057161, -0.8945781 , -0.40163083, -2.91039199,
          0.1567098 ]),
  0),
 (array([-0.79867306, -0.89146231,  3.64034447, -0.45940748, -0.52296869,
         -0.35387136,  0.55184122, -0.8945781 , -1.27444226,  0.4762026 ,
          0.1567098 ]),
  1)]

In [None]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    its_to_print = list(range(0, iterations, iterations // 10)) + [iterations]
    k = len(RDD_Xy.first()[0])
    w = np.random.rand(k)
    b = 0

    for it in range(iterations):
        # Gradients
        dw =  RDD_Xy.map(lambda x: (predict_proba(w, b, x[0]) - x[1])*np.array(x[0])).reduce(lambda x, y: x + y) / TOTAL
        db = RDD_Xy.map(lambda x: predict_proba(w, b, x[0]) - x[1]).reduce(lambda x, y: x + y) / TOTAL

        # Regularization
        dw += lambda_reg / k * w

        # Update parameters
        w -= learning_rate * dw
        b -= learning_rate * db
        
        # Logging
        if it in its_to_print:
            acc = accuracy(w, b, RDD_Xy)
            print(f"Iteration {it}/{iterations}, Accuracy: {acc:.4f}")

    return w, b

def sigmoid(logit):
    return 1 / (1 + np.exp(-logit))

def predict_proba(w, b, X):
    X = np.array(X)
    w = np.array(w)
    return sigmoid(np.dot(w, X) + b)

def predict(w, b, X):
    return 1 if predict_proba(w, b, X) > 0.5 else 0

def accuracy(w, b, RDD_Xy):
    preds = RDD_Xy.map(lambda x: (int(predict(w, b, x[0]) == x[1]), 1)).reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
    return preds[0] / preds[1]

In [19]:
path = "data/botnet_tot_syn_l.csv"  # botnet_tot_syn_l.csv // botnet_reduced_10k_l.csv
nIter = 20
leaning_rate =  1
lambda_reg = 0.2

# Read data
data = readFile(path)

# Standardize
data = normalize(data)

w_b = train(data, nIter, leaning_rate, lambda_reg)
acc = accuracy(w_b[0], w_b[1], data)
print(f"Accuracy: {acc:.4f}")

                                                                                

Iteration 0/20, Accuracy: 0.5000


                                                                                

Iteration 2/20, Accuracy: 0.5000


                                                                                

Iteration 4/20, Accuracy: 0.5000


                                                                                

Iteration 6/20, Accuracy: 0.5000


                                                                                

Iteration 8/20, Accuracy: 0.5000


                                                                                

Iteration 10/20, Accuracy: 0.5000


                                                                                

Iteration 12/20, Accuracy: 0.5000


                                                                                

Iteration 14/20, Accuracy: 0.5000


                                                                                

Iteration 16/20, Accuracy: 0.5000


                                                                                

Iteration 18/20, Accuracy: 0.5000




Accuracy: 0.5000


                                                                                