## The task

Cluster the given hack-session data to determine whether 2 or 3 hackers were involved.

## Import

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("consult_cluster")
    .getOrCreate()
)

## Load data

In [2]:
import os

# List the working directory to confirm that hack_data.csv is available.
os.listdir(".")

['.ipynb_checkpoints',
 'Clustering Code Along.ipynb',
 'Clustering_Code_Example.ipynb',
 'Clustering_Consulting_Project.ipynb',
 'Clustering_Consulting_Project_SOLUTIONS.ipynb',
 'hack_data.csv',
 'My_code_along.ipynb',
 'My_consult_proj.ipynb',
 'My_doc_example.ipynb',
 'sample_kmeans_data.txt',
 'seeds_dataset.csv',
 'seeds_dataset.txt']

In [3]:
# Read the CSV, taking column names from the first row and letting Spark infer column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hack_data.csv")
)

In [4]:
# Report the dataset size as "<row count> <column count>".
n_rows = df.count()
n_cols = len(df.columns)
print(n_rows, n_cols)

334 7


In [5]:
# Inspect the inferred column types to see which columns are numeric.
df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



- Location is unlikely to be a useful feature, since the hackers probably used VPNs
- Important: each hacker made roughly the same number of attacks. If there are 100 hacks in total, then in the 2-hacker scenario each one made about 50 hacks; in the 3-hacker scenario each one made about 33 hacks

In [7]:
# Preview the first 5 rows of the raw data.
df.show(5)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

## Prepare data

In [8]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [9]:
# List the column names to pick the clustering inputs from.
df.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

### Create vector feature

In [11]:
# Numeric columns used for clustering; "Location" is deliberately left out.
feature_cols = [
    "Session_Connection_Time",
    "Bytes Transferred",
    "Kali_Trace_Used",
    "Servers_Corrupted",
    "Pages_Corrupted",
    "WPM_Typing_Speed",
]

# Pack the selected columns into a single vector column named "features".
VecAss = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [12]:
# Append the assembled "features" vector column to the raw DataFrame.
dt = VecAss.transform(df)

In [13]:
# Confirm that the "features" vector column was added.
dt.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
# Preview the first 5 rows, now including the assembled feature vector.
dt.show(5)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|            features|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|[31.0,356.32,1.0,...|
|                    2.0|           228.08|              1|             2.48|     

### Scale the features

In [15]:
# Rescale each feature to unit standard deviation without centering.
# withMean=False / withStd=True are the StandardScaler defaults, spelled out for clarity.
scl = StandardScaler(
    inputCol="features",
    outputCol="scaled",
    withMean=False,
    withStd=True,
)

In [16]:
# Learn the per-feature scaling statistics from dt, then apply them to the same rows.
scaler_model = scl.fit(dt)
data = scaler_model.transform(dt)

In [17]:
# Keep only the scaled feature vector, which is the column the clustering step reads.
final_data = data.select("scaled")

In [19]:
# Preview the first 5 scaled feature vectors.
final_data.show(5)

+--------------------+
|              scaled|
+--------------------+
|[0.56785108466505...|
|[1.41962771166263...|
|[2.20042295307707...|
|[0.14196277116626...|
|[1.41962771166263...|
+--------------------+
only showing top 5 rows



## Training

In [31]:
from pyspark.ml.clustering import KMeans

# One estimator per hypothesis: 2 hackers vs. 3 hackers.
# K-means initialisation is random, so the seed is pinned to keep the cluster
# assignments (and the group sizes the conclusion rests on) reproducible across runs.
KMEANS_SEED = 42

km2 = KMeans(featuresCol="scaled", k=2, seed=KMEANS_SEED)
km3 = KMeans(featuresCol="scaled", k=3, seed=KMEANS_SEED)

In [32]:
# Fit one k-means model per candidate number of hackers on the scaled features.
model2 = km2.fit(final_data)
model3 = km3.fit(final_data)

## Prediction

In [33]:
# Assign each session to one of the 2 clusters (adds a "prediction" column).
pred2 = model2.transform(final_data)

In [34]:
# Preview the first 5 cluster assignments for k=2.
pred2.show(5)

+--------------------+----------+
|              scaled|prediction|
+--------------------+----------+
|[0.56785108466505...|         1|
|[1.41962771166263...|         1|
|[2.20042295307707...|         1|
|[0.14196277116626...|         1|
|[1.41962771166263...|         1|
+--------------------+----------+
only showing top 5 rows



In [35]:
# Count how many sessions fall into each of the 2 clusters.
cluster_sizes_k2 = pred2.groupBy("prediction").count()
cluster_sizes_k2.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [36]:
# Assign each session to one of the 3 clusters (adds a "prediction" column).
pred3 = model3.transform(final_data)

In [37]:
# Preview the first 5 cluster assignments for k=3.
pred3.show(5)

+--------------------+----------+
|              scaled|prediction|
+--------------------+----------+
|[0.56785108466505...|         1|
|[1.41962771166263...|         2|
|[2.20042295307707...|         1|
|[0.14196277116626...|         1|
|[1.41962771166263...|         2|
+--------------------+----------+
only showing top 5 rows



In [38]:
# Count how many sessions fall into each of the 3 clusters.
cluster_sizes_k3 = pred3.groupBy("prediction").count()
cluster_sizes_k3.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



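From the group-by counts it is clear that there were 2 hackers: with k=2 the 334 sessions split evenly (167 / 167), which matches the hint that each hacker made roughly the same number of attacks. With k=3 the split is uneven (167 / 83 / 84) — one of the two k=2 clusters is simply cut in half — so it does not fit the equal-attacks assumption.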