In [2]:
import findspark, pyspark
# Show the Spark installation path that findspark resolves on this machine
# (the returned path is rendered as the cell output).
findspark.find()

'C:\\Bigdata\\spark-2.4.5-bin-hadoop2.7'

In [3]:
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession 
# Start (or reuse) a Spark session on the 'local' master, registered under
# the application name 'cluster'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('cluster')
    .getOrCreate()
)

# Read the CSV using its first row as the header and let Spark infer the
# column types instead of loading everything as strings.
dataset = spark.read.csv("hack_data.csv", header=True, inferSchema=True)

# Print the inferred column names and types.
dataset.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [4]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = dataset.describe()
summary_stats.show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# List the column names of the loaded dataset.
dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [6]:
# Columns packed into the feature vector. 'Location' is the only dataset
# column left out; it is a string column in the inferred schema.
feature_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

# Combine the selected columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
final_data = assembler.transform(dataset)

# Show the resulting columns (the originals plus 'features').
final_data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed',
 'features']

In [11]:
from pyspark.ml.feature import StandardScaler

# withStd=True scales each feature by its standard deviation;
# withMean=False leaves the features un-centred.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

# Fit the scaler on the assembled data, then add the 'scaledFeatures' column.
scalerModel = scaler.fit(final_data)
cluster_final_data = scalerModel.transform(final_data)

# Show the resulting columns (now including 'scaledFeatures').
cluster_final_data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed',
 'features',
 'scaledFeatures']

In [14]:
from pyspark.ml.clustering import KMeans

# Fit k-means on the scaled features for two candidate cluster counts.
# A fixed seed is passed so the random centroid initialisation, and with it
# the reported cost, is reproducible after a kernel restart.
KMEANS_SEED = 42

kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)

model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)

# Within-set sum of squared errors on the training data, read from the
# training summary. KMeansModel.computeCost is deprecated since Spark 2.4 and
# removed in Spark 3.0; summary.trainingCost is available from 2.4 onwards.
wssse_k3 = model_k3.summary.trainingCost
wssse_k2 = model_k2.summary.trainingCost

In [18]:
# Report the cost of both fitted models, separated by a 60-dash rule.
separator = '-' * 60

print('with k=3')
print('within Set Sum of Squared Errors = {}'.format(wssse_k3))
print(separator)
print('with k=2')
print('within Set Sum of Squared Errors = {}'.format(wssse_k2))

with k=3
within Set Sum of Squared Errors = 434.1492898715845
------------------------------------------------------------
with k=2
within Set Sum of Squared Errors = 601.7707512676716


## k값에 따른 값 비교 반복문

In [17]:
# Sweep k from 2 to 8 and print the within-set sum of squared errors for each
# value, so the costs can be compared across cluster counts.
# A fixed seed keeps the centroid initialisation reproducible across runs.
KMEANS_SEED = 42

for k in range(2, 9):
    kmeans = KMeans(featuresCol='scaledFeatures', k=k, seed=KMEANS_SEED)
    model = kmeans.fit(cluster_final_data)
    # Cost on the training data from the training summary; computeCost is
    # deprecated since Spark 2.4 and removed in Spark 3.0.
    wssse = model.summary.trainingCost
    print('with K={}'.format(k))
    print('within Set Sum of Squared Errors =' + str(wssse))
    print('--' * 30)

with K=2
within Set Sum of Squared Errors =601.7707512676716
------------------------------------------------------------
with K=3
within Set Sum of Squared Errors =434.1492898715845
------------------------------------------------------------
with K=4
within Set Sum of Squared Errors =412.3798934802814
------------------------------------------------------------
with K=5
within Set Sum of Squared Errors =245.36421529748606
------------------------------------------------------------
with K=6
within Set Sum of Squared Errors =224.8247325683356
------------------------------------------------------------
with K=7
within Set Sum of Squared Errors =220.95078930544886
------------------------------------------------------------
with K=8
within Set Sum of Squared Errors =204.71681711245506
------------------------------------------------------------


In [19]:
# Assign every row to a cluster with the k=3 model and show how many rows
# land in each predicted cluster.
(
    model_k3
    .transform(cluster_final_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   83|
|         0|   84|
+----------+-----+

