In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session registered under the app name 'cluster'.
session_builder = SparkSession.builder.appName('cluster')
spark = session_builder.getOrCreate()

# Read the CSV, taking column names from the header row and letting Spark
# infer each column's type, then show the resulting schema.
csv_reader = spark.read
df = csv_reader.csv('hack_data.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [5]:
# List the DataFrame's column names.
df.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [4]:
# Peek at the first row; head(n) with an explicit n returns a list of Row
# objects, the same as take(n).
df.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)]

In [6]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the feature vector; 'Location' (a string column) is
# not included.
feature_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

# Combine the selected columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
final_df = assembler.transform(df)
final_df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [9]:
from pyspark.ml.feature import StandardScaler

# Fit a StandardScaler on the assembled 'features' vectors and write the
# result to a new 'scaledFeatures' column.
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')
scaler_model = scaler.fit(final_df)

# This cell rebinds final_df, so running it a second time would feed the model
# a frame that already contains the output column, and transform() refuses to
# overwrite an existing column. Dropping it first makes the cell re-runnable;
# DataFrame.drop() is a no-op when the column is absent (i.e. on the first run).
final_df = scaler_model.transform(final_df.drop(scaler.getOutputCol()))
final_df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [12]:
# Show the raw and scaled feature vectors side by side for the first 3 rows.
vector_cols = ['features', 'scaledFeatures']
final_df.select(*vector_cols).show(n=3)

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[8.0,391.09,1.0,2...|[0.56785108466505...|
|[20.0,720.99,0.0,...|[1.41962771166263...|
|[31.0,356.32,1.0,...|[2.20042295307707...|
+--------------------+--------------------+
only showing top 3 rows



In [13]:
# Inspect the first row in full; head(n) with an explicit n returns a list of
# Row objects, the same as take(n).
final_df.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaledFeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

In [14]:
from pyspark.ml.clustering import KMeans

# K-means starts from randomly chosen centres. Pin the seed so the cluster
# assignments and costs computed from these estimators are reproducible after
# a kernel restart instead of depending on PySpark's per-process default seed.
KMEANS_SEED = 42

# Two candidate estimators (k=2 and k=3), both clustering on 'scaledFeatures'.
kmeans2 = KMeans(featuresCol = 'scaledFeatures', k=2, seed=KMEANS_SEED)
kmeans3 = KMeans(featuresCol = 'scaledFeatures', k=3, seed=KMEANS_SEED)

In [16]:
# Fit both estimators on final_df, k=2 first and then k=3.
model_k2, model_k3 = [estimator.fit(final_df) for estimator in (kmeans2, kmeans3)]

In [17]:
def training_wssse(model):
    """Return the within-set sum of squared errors of a fitted KMeans model.

    KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0;
    the documented replacement for the cost on the training data is
    ``model.summary.trainingCost``. Both models here were fitted on final_df,
    so the training cost is the quantity this cell is meant to report.
    NOTE(review): trainingCost may differ marginally from computeCost().

    Falls back to ``computeCost(final_df)`` only on older Spark versions whose
    model or summary does not expose ``trainingCost``.
    """
    summary = model.summary if getattr(model, 'hasSummary', False) else None
    cost = getattr(summary, 'trainingCost', None)
    if cost is not None:
        return cost
    return model.computeCost(final_df)


print('WSSSE_K2:', training_wssse(model_k2))
print('WSSSE_K3:', training_wssse(model_k3))

WSSSE_K2: 601.7707512676716
WSSSE_K3: 434.1492898715845


In [21]:
# Cluster sizes for the k=2 model: add a 'prediction' column to every row,
# then count the rows per predicted cluster.
assigned_k2 = model_k2.transform(final_df)
sizes_k2 = assigned_k2.groupBy('prediction').count()
sizes_k2.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [22]:
# Cluster sizes for the k=3 model: add a 'prediction' column to every row,
# then count the rows per predicted cluster.
assigned_k3 = model_k3.transform(final_df)
sizes_k3 = assigned_k3.groupBy('prediction').count()
sizes_k3.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   84|
|         0|   83|
+----------+-----+

