In [1]:
import pyspark
from pyspark.sql import SparkSession
import os
import sys

In [2]:
# Set both PySpark interpreter environment variables to the Python executable
# that is running this kernel, before the session is requested.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Obtain the session for this notebook under the application name 'KMeans'.
spark = SparkSession.builder.appName('KMeans').getOrCreate()

In [3]:
# Path to the kddcup.data_10_percent CSV file. Set the KDDCUP_DATA_PATH
# environment variable to read a copy stored elsewhere; when it is unset the
# original local path is used, so existing behaviour is unchanged.
path = os.environ.get(
    'KDDCUP_DATA_PATH',
    r"C:\Users\LENOVO\Desktop\SUB 6TH SEM\BDA\LAB\kddcup.data_10_percent",
)

# The file is read without a header row; column types are inferred by Spark
# and the column names are attached afterwards with toDF().
data_without_header = spark.read.csv(path, inferSchema=True, header=False)

column_names = [ "duration", "protocol_type", "service", "flag",
"src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
"hot", "num_failed_logins", "logged_in", "num_compromised",
"root_shell", "su_attempted", "num_root", "num_file_creations",
"num_shells", "num_access_files", "num_outbound_cmds",
"is_host_login", "is_guest_login", "count", "srv_count",
"serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
"same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
"dst_host_count", "dst_host_srv_count",
"dst_host_same_srv_rate", "dst_host_diff_srv_rate",
"dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
"dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
"label"]

# Rename the positional columns; "label" is the last column of the frame.
data = data_without_header.toDF(*column_names)

In [4]:
from pyspark.sql.functions import col
# Count the rows for each label and show the most frequent labels first.
(
    data
    .select(col('label'))
    .groupBy(col('label'))
    .count()
    .orderBy(col('count').desc())
    .show()
)

+----------------+------+
|           label| count|
+----------------+------+
|          smurf.|280790|
|        neptune.|107201|
|         normal.| 97278|
|           back.|  2203|
|          satan.|  1589|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|    warezclient.|  1020|
|       teardrop.|   979|
|            pod.|   264|
|           nmap.|   231|
|   guess_passwd.|    53|
|buffer_overflow.|    30|
|           land.|    21|
|    warezmaster.|    20|
|           imap.|    12|
|        rootkit.|    10|
|     loadmodule.|     9|
|      ftp_write.|     8|
|       multihop.|     7|
+----------------+------+
only showing top 20 rows



In [47]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml import Pipeline
from pprint import pprint

# Drop service, flag and protocol_type (treated as categorical variables later
# in this notebook). Every remaining column except the last one goes into the
# feature vector.
numeric_data = data.drop('service', 'flag', 'protocol_type')
assembler = VectorAssembler(
    inputCols=numeric_data.columns[:-1],
    outputCol='featureVector',
)
# No k is supplied here, so KMeans uses its default number of clusters.
kmeans = KMeans(predictionCol='Cluster', featuresCol='featureVector')
pipeline = Pipeline(stages=[assembler, kmeans])
pipeline_model = pipeline.fit(numeric_data)
# The fitted KMeans model is the final stage of the two-stage pipeline.
kmeans_model = pipeline_model.stages[-1]
pprint(kmeans_model.clusterCenters())

[array([4.79793956e+01, 1.62207883e+03, 8.68534183e+02, 4.45326100e-05,
       6.43293794e-03, 1.41694668e-05, 3.45168212e-02, 1.51815716e-04,
       1.48247035e-01, 1.02121372e-02, 1.11331525e-04, 3.64357718e-05,
       1.13517671e-02, 1.08295211e-03, 1.09307315e-04, 1.00805635e-03,
       0.00000000e+00, 0.00000000e+00, 1.38658354e-03, 3.32286248e+02,
       2.92907143e+02, 1.76685418e-01, 1.76607809e-01, 5.74330999e-02,
       5.77183920e-02, 7.91548844e-01, 2.09816404e-02, 2.89968625e-02,
       2.32470732e+02, 1.88666046e+02, 7.53781203e-01, 3.09056111e-02,
       6.01935529e-01, 6.68351484e-03, 1.76753957e-01, 1.76441622e-01,
       5.81176268e-02, 5.74111170e-02]),
 array([2.0000000e+00, 6.9337564e+08, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00

In [6]:
# Assign each row to a cluster, then count the rows for every
# (label, Cluster) combination.
clustered_data = pipeline_model.transform(numeric_data)
(
    clustered_data
    .select('label', 'Cluster')
    .groupBy('label', 'Cluster')
    .count()
    .show()
)

+----------------+-------+------+
|           label|Cluster| count|
+----------------+-------+------+
|        neptune.|      0|107201|
|         normal.|      0| 97278|
|        ipsweep.|      0|  1247|
|     loadmodule.|      0|     9|
|       teardrop.|      0|   979|
|          smurf.|      0|280790|
|      portsweep.|      0|  1039|
|            pod.|      0|   264|
|buffer_overflow.|      0|    30|
|   guess_passwd.|      0|    53|
|           perl.|      0|     3|
|           land.|      0|    21|
|          satan.|      0|  1589|
|      portsweep.|      1|     1|
|      ftp_write.|      0|     8|
|            phf.|      0|     4|
|           back.|      0|  2203|
|           nmap.|      0|   231|
|           imap.|      0|    12|
|       multihop.|      0|     7|
+----------------+-------+------+
only showing top 20 rows



Choosing K

In [12]:
def clustering_score(data,k):
    """Fit K-means with ``k`` clusters on unscaled features and return its cost.

    The ``service``, ``flag`` and ``protocol_type`` columns are dropped, and
    every remaining column except the last is assembled into the feature
    vector (the last column is expected to be the label -- confirm against
    the frame passed in).

    Args:
        data: Spark DataFrame containing the columns named above.
        k: Number of clusters to fit.

    Returns:
        The ``trainingCost`` reported by the fitted KMeans model's summary.
    """
    numeric_data = data.drop('service', 'flag', 'protocol_type')
    assembler = VectorAssembler().setInputCols(numeric_data.columns[:-1]).setOutputCol('featureVector')
    # This variant clusters the raw feature vector. The previous body also
    # built a scaler through a misspelled, undefined class name, which raised
    # NameError on every call and was never part of the pipeline; it is gone.
    kmeans = KMeans().setK(k).setPredictionCol('Cluster').setFeaturesCol('featureVector')
    pipeline = Pipeline().setStages([assembler, kmeans])
    pipeline_model = pipeline.fit(numeric_data)
    # The fitted KMeans model is the last pipeline stage.
    kmeans_model = pipeline_model.stages[-1]
    return kmeans_model.summary.trainingCost
    
# Print the training cost for k = 20, 40, 60, 80 and 100.
for k in range(20, 101, 20):
    print(k, clustering_score(data=data, k=k))
    
    

20 34615925837028.062
40 9306707158393.582
60 7972928544231.26
80 7970651131204.402
100 3563548500375.7725


In [None]:
#Feature Normalisation

In [17]:
from pyspark.ml.feature import StandardScaler
def clustering_score(data,k):
    """Fit K-means with ``k`` clusters on scaled features and return its cost.

    Defining this function replaces any earlier ``clustering_score`` in the
    kernel. The ``service``, ``flag`` and ``protocol_type`` columns are
    dropped; every remaining column except the last is assembled into a
    vector, which is scaled with ``withStd=True`` and ``withMean=False``
    before clustering.

    Args:
        data: Spark DataFrame containing the columns named above.
        k: Number of clusters to fit.

    Returns:
        The ``trainingCost`` reported by the fitted KMeans model's summary.
    """
    numeric_data = data.drop('service', 'flag', 'protocol_type')
    feature_cols = numeric_data.columns[:-1]
    stages = [
        VectorAssembler(inputCols=feature_cols, outputCol='featureVector'),
        StandardScaler(
            inputCol='featureVector',
            outputCol='standardFeatureVector',
            withStd=True,
            withMean=False,
        ),
        KMeans(k=k, predictionCol='Cluster', featuresCol='standardFeatureVector'),
    ]
    fitted = Pipeline(stages=stages).fit(numeric_data)
    # The fitted KMeans model is the last stage; its summary carries the cost.
    return fitted.stages[-1].summary.trainingCost
    
# Print the training cost on scaled features for k = 20, 40, 60, 80 and 100.
for k in range(20, 101, 20):
    print(k, clustering_score(data=data, k=k))

20 2791501.228517172
40 1067870.7713538604
60 582951.1564411405
80 395232.1962467235
100 312257.92762681295


In [None]:
#Categorical Variables

In [41]:
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from random import randint

def one_hot_pipeline(input_col):
    """Build an index-then-one-hot-encode pipeline for a single column.

    Args:
        input_col: Name of the column to encode.

    Returns:
        A ``(pipeline, output_col)`` tuple. The pipeline runs a StringIndexer
        writing ``input_col + "_indexed"`` and then a OneHotEncoder writing
        ``input_col + "_vec"``; ``output_col`` is that final column name.
    """
    indexed_col = input_col + "_indexed"
    vec_col = input_col + "_vec"
    stages = [
        StringIndexer(inputCol=input_col, outputCol=indexed_col),
        OneHotEncoder(inputCol=indexed_col, outputCol=vec_col),
    ]
    return Pipeline(stages=stages), vec_col

def clustering_score_3(input_data, k):
    """Return the K-means training cost with one-hot encoded categoricals.

    ``protocol_type``, ``service`` and ``flag`` are each one-hot encoded via
    ``one_hot_pipeline``. The encoded vectors plus every other column except
    ``label`` are assembled, scaled (``withStd=True``, ``withMean=False``)
    and clustered.

    Args:
        input_data: Spark DataFrame holding the feature columns and ``label``.
        k: Number of clusters to fit.

    Returns:
        The ``trainingCost`` reported by the fitted KMeans model's summary.
        The KMeans seed is drawn with ``randint`` on every call, so repeated
        calls may return different costs.
    """
    proto_stage, proto_vec = one_hot_pipeline("protocol_type")
    service_stage, service_vec = one_hot_pipeline("service")
    flag_stage, flag_vec = one_hot_pipeline("flag")

    # Swap the raw categorical columns for their encoded vectors and leave
    # the label out of the features.
    raw_excluded = {"label", "protocol_type", "service", "flag"}
    feature_cols = (set(input_data.columns) - raw_excluded) | {proto_vec, service_vec, flag_vec}

    stages = [
        proto_stage,
        service_stage,
        flag_stage,
        VectorAssembler(inputCols=list(feature_cols), outputCol="featureVector"),
        StandardScaler(inputCol="featureVector", outputCol="scaledFeatureVector",
                       withStd=True, withMean=False),
        KMeans(seed=randint(100, 100000), k=k, maxIter=40, tol=1.0e-5,
               predictionCol="cluster", featuresCol="scaledFeatureVector"),
    ]

    fitted = Pipeline(stages=stages).fit(input_data)
    # The fitted KMeans model is the last stage of the outer pipeline.
    return fitted.stages[-1].summary.trainingCost

# Print the training cost for k = 60, 90, ..., 270.
for k in range(60, 271, 30):
    print(k, clustering_score_3(input_data=data, k=k))


60 16709099.999693863
90 7143825.712558743
120 1708663.428507424
150 1028144.2874173195
180 705246.1290458554
210 620445.1911539775
240 493977.1866159912
270 397326.11690454604


In [None]:
# Using the labels: scoring clusters with entropy

In [43]:
from math import log
from pyspark.sql import functions as F
from pyspark.sql import Window

def entropy(counts):
    """Return the entropy (natural logarithm) of a collection of counts.

    Only strictly positive counts contribute; each is turned into a
    proportion of their total. With no positive counts the result is 0.
    """
    positive = [c for c in counts if c > 0]
    total = sum(positive)
    return sum(-1 * (c / total) * log(c / total) for c in positive)

# Cluster assignment and label for every row, using whichever pipeline model
# is currently bound to `pipeline_model`.
cluster_label = pipeline_model.transform(data).select("cluster", "label")
df = (
    cluster_label
    .groupBy("cluster", "label")
    .count()
    .orderBy("cluster")
)

# Fraction of each cluster's rows that carry a given label.
w = Window.partitionBy("cluster")
p_col = df["count"] / F.sum(df["count"]).over(w)
with_p_col = df.withColumn("p_col", p_col)

# Base-2 entropy of the label distribution in each cluster, plus its size.
result = with_p_col.groupBy("cluster").agg(
    (-F.sum(F.col("p_col") * F.log2(F.col("p_col")))).alias("entropy"),
    F.sum(F.col("count")).alias("cluster_size"),
)
result = result.withColumn(
    "weightedClusterEntropy",
    F.col("entropy") * F.col("cluster_size"),
)

# Average of the per-cluster entropies, weighted by cluster size.
weighted_cluster_entropy_avg = result.agg(F.sum(F.col("weightedClusterEntropy"))).collect()
weighted_cluster_entropy_avg[0][0] / data.count()


1.557605039016584

In [44]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql import Window

def fit_pipeline_4(data, k):
    """Fit the encode / assemble / scale / K-means pipeline and return the model.

    ``protocol_type``, ``service`` and ``flag`` are one-hot encoded via
    ``one_hot_pipeline``. The encoded vectors plus every other column except
    ``label`` are assembled, scaled (``withStd=True``, ``withMean=False``)
    and clustered into ``k`` clusters written to the ``cluster`` column.

    Args:
        data: Spark DataFrame holding the feature columns and ``label``.
        k: Number of clusters to fit.

    Returns:
        The fitted pipeline model. The KMeans seed is drawn with ``randint``
        on every call, so repeated fits may differ.
    """
    proto_stage, proto_vec = one_hot_pipeline("protocol_type")
    service_stage, service_vec = one_hot_pipeline("service")
    flag_stage, flag_vec = one_hot_pipeline("flag")

    # Swap the raw categorical columns for their encoded vectors and leave
    # the label out of the features.
    raw_excluded = {"label", "protocol_type", "service", "flag"}
    feature_cols = (set(data.columns) - raw_excluded) | {proto_vec, service_vec, flag_vec}

    assembler = VectorAssembler().setInputCols(list(feature_cols)).setOutputCol("featureVector")
    scaler = (
        StandardScaler()
        .setInputCol("featureVector")
        .setOutputCol("scaledFeatureVector")
        .setWithStd(True)
        .setWithMean(False)
    )
    kmeans = (
        KMeans()
        .setSeed(randint(100, 100000))
        .setK(k)
        .setPredictionCol("cluster")
        .setFeaturesCol("scaledFeatureVector")
        .setMaxIter(40)
        .setTol(1.0e-5)
    )

    stages = [proto_stage, service_stage, flag_stage, assembler, scaler, kmeans]
    return Pipeline(stages=stages).fit(data)

def clustering_score_4(input_data, k):
    """Return the size-weighted average label entropy of a ``k``-cluster fit.

    Fits the pipeline from ``fit_pipeline_4``, counts rows per
    (cluster, label) pair, computes the base-2 entropy of the label
    distribution inside each cluster, and averages those entropies weighted
    by cluster size.

    Args:
        input_data: Spark DataFrame holding the feature columns and ``label``.
        k: Number of clusters to fit.

    Returns:
        Sum over clusters of ``entropy * cluster_size`` divided by the number
        of rows in ``input_data``.
    """
    pipeline_model = fit_pipeline_4(input_data, k)
    cluster_label = pipeline_model.transform(input_data).select("cluster", "label")
    df = cluster_label.groupBy("cluster", "label").count().orderBy("cluster")

    # Fraction of each cluster's rows that carry a given label.
    w = Window.partitionBy("cluster")
    p_col = df['count'] / F.sum(df['count']).over(w)
    with_p_col = df.withColumn("p_col", p_col)

    result = with_p_col.groupBy("cluster").agg(
        # The negation must be parenthesised before .alias(): a method call
        # binds tighter than unary minus, so without the parentheses the
        # alias is applied first and the aggregate is not named "entropy".
        (-F.sum(F.col("p_col") * F.log2(F.col("p_col")))).alias("entropy"),
        F.sum(F.col("count")).alias("cluster_size"),
    )
    result = result.withColumn('weightedClusterEntropy', F.col('entropy') * F.col('cluster_size'))
    weighted_cluster_entropy_avg = result.agg(F.sum(F.col('weightedClusterEntropy'))).collect()
    return weighted_cluster_entropy_avg[0][0] / input_data.count()


In [45]:
# Refit with k=180 and tabulate how many rows of each label land in each
# cluster, ordered by cluster and then label.
pipeline_model = fit_pipeline_4(data, 180)
count_by_cluster_label = (
    pipeline_model.transform(data)
    .select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy("cluster", "label")
)
count_by_cluster_label.show()


+-------+----------+------+
|cluster|     label| count|
+-------+----------+------+
|      0|  neptune.| 45462|
|      1|  ipsweep.|     4|
|      1|     nmap.|     1|
|      1|   normal.|   337|
|      1|portsweep.|     1|
|      1|    smurf.|280787|
|      2|  neptune.|    99|
|      3|  neptune.|    96|
|      3|     nmap.|     1|
|      4|  neptune.|     4|
|      4|   normal.|  4071|
|      5|  ipsweep.|     2|
|      5|  neptune.|   105|
|      6|  ipsweep.|     1|
|      6|  neptune.|   161|
|      6|   normal.|   464|
|      6|    satan.|     3|
|      7|  neptune.|    91|
|      7|portsweep.|     1|
|      8|   normal.|   866|
+-------+----------+------+
only showing top 20 rows

