In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier,RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

#from user_definition import *

# Obtain the shared Spark entry points: an already-running SparkContext /
# SparkSession is reused if present, otherwise a new one is started.
# `sc` and `ss` are referenced by later cells.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

def toDoubleSafe(v):
    """Convert ``v`` to a float, falling back to its string form.

    Parameters
    ----------
    v : object
        Value to convert.

    Returns
    -------
    float or str
        ``float(v)`` when the conversion succeeds, otherwise ``str(v)``.
    """
    try:
        return float(v)
    except Exception:
        # Best-effort fallback for anything float() rejects (None, non-numeric
        # text, oversized ints, ...). ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt / SystemExit still propagate.
        return str(v)

# def strip_time(x):
#     x = x.strip("\"")
#     try:
        
#         return datetime.strptime(x,'%Y-%m-%d %H:%M:%S')
#     except:
#         return None

## Create dataframe

In [2]:
#https://spark.apache.org/docs/latest/api/python/pyspark.ml.html?highlight=kmeans#pyspark.ml.clustering.KMeans

In [3]:
#Create a DataFrame
from pyspark.sql.types import *

# Explicit schema for penbased.dat: 16 feature columns (pix1..pix16)
# followed by the label column, all read as nullable doubles.
penschema = StructType(
    [StructField(f"pix{i}", DoubleType(), True) for i in range(1, 17)]
    + [StructField("label", DoubleType(), True)]
)

# Because an explicit schema is supplied, no schema inference takes place.
# ``samplingRatio`` only controls how many rows are sampled for inference, so
# it had no effect here and is omitted to avoid suggesting that just a
# fraction of the file is loaded.
dfpen = ss.read.csv("../Data/penbased.dat", schema=penschema)

In [4]:
# Preview the loaded rows to confirm the file parsed under the schema
# (show() prints only the leading rows, not the whole frame).
dfpen.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

                                                                                

## Create dataframe with a feature vector (Exclude the label)

In [5]:
def Vector_Assembler(df,y_column):
    """Pack every column except ``y_column`` into a single ``features`` vector.

    Parameters
    ----------
    df : DataFrame
        Input frame holding the feature columns and the target column.
    y_column : str
        Name of the target column; it is excluded from the feature vector.

    Returns
    -------
    DataFrame
        Two columns: ``features`` (the assembled vector) and ``label``
        (``y_column`` renamed).

    Raises
    ------
    ValueError
        If ``y_column`` is not one of ``df.columns``.
    """
    feature_cols = df.columns
    # Everything that is not the target is treated as a feature.
    feature_cols.remove(y_column)
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    assembled = assembler.transform(df)
    labelled = assembled.select("features", y_column)
    return labelled.withColumnRenamed(y_column, "label")

In [27]:
# Collapse the pix* columns of dfpen into one `features` vector column,
# keeping `label` alongside it, then preview the first 5 rows.
lpoints = Vector_Assembler(dfpen,'label')
lpoints.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Standardize the features (StandardScaler)

In [26]:
from pyspark.ml.feature import StandardScaler
def normalize(train_df):
    """Fit a StandardScaler on ``train_df`` and return it with the scaled frame.

    Parameters
    ----------
    train_df : DataFrame
        Frame with a ``features`` vector column and a ``label`` column.

    Returns
    -------
    tuple
        ``(scaler, scaled_df)``: the fitted scaler model and a frame with
        exactly the columns ``features`` (now the scaled vector) and ``label``.
    """
    fitted_scaler = StandardScaler(inputCol='features', outputCol='scaled').fit(train_df)
    with_scaled = fitted_scaler.transform(train_df)
    # Replace the raw vector with the scaled one under the original column name.
    swapped = with_scaled.drop('features').withColumnRenamed('scaled', 'features')
    return fitted_scaler, swapped.select('features', 'label')

In [28]:
# Fit the scaler on all of lpoints and keep both the fitted scaler and the
# scaled frame; scaled_df is the input to the clustering cells below.
scaler, scaled_df = normalize(lpoints)

In [31]:
# Inspect the scaled feature vectors next to their labels.
scaled_df.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.36818014358814...|  8.0|
|[0.0,5.5001493387...|  2.0|
|[0.0,3.5225675540...|  1.0|
|[0.0,6.1799430772...|  4.0|
|[0.0,4.1405618617...|  1.0|
|[2.91102158210244...|  6.0|
|[0.0,6.1799430772...|  4.0|
|[0.0,2.4101778001...|  0.0|
|[0.37843280567331...|  5.0|
|[2.15415597075580...|  9.0|
|[1.39729035940917...|  8.0|
|[2.91102158210244...|  5.0|
|[2.64902963971322...|  9.0|
|[0.0,5.2529516156...|  7.0|
|[1.01885755373585...|  3.0|
|[1.45551079105122...|  3.0|
|[2.88191136628141...|  9.0|
|[0.69864517970458...|  2.0|
|[0.0,4.5113584463...|  2.0|
|[0.34932258985229...|  5.0|
+--------------------+-----+
only showing top 20 rows



In [10]:
# from pyspark.ml.feature import VectorAssembler
# penlpoints= VectorAssembler(outputCol='features',inputCols=dfpen.columns[:-1]).transform(dfpen)

## Apply KMeans algorithm to the data frame

In [33]:
from pyspark.ml.clustering import KMeans

# KMeans estimator with 10 clusters (presumably one per digit class, labels
# 0-9 -- confirm), up to 200 iterations and a convergence tolerance of 0.1.
# The seed is fixed so centroid initialisation, and therefore the cluster
# assignments and silhouette score, are reproducible across kernel restarts.
model = KMeans(k = 10,maxIter=200,tol =0.1, seed=42)
# Fit on the standardized features.
kmodel = model.fit(scaled_df)

                                                                                

In [40]:
# Assign every row to a cluster; transform() appends a `prediction` column
# holding the cluster index.
pred = kmodel.transform(scaled_df)
pred.show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[1.36818014358814...|  8.0|         5|
|[0.0,5.5001493387...|  2.0|         0|
|[0.0,3.5225675540...|  1.0|         3|
|[0.0,6.1799430772...|  4.0|         2|
|[0.0,4.1405618617...|  1.0|         3|
|[2.91102158210244...|  6.0|         9|
|[0.0,6.1799430772...|  4.0|         2|
|[0.0,2.4101778001...|  0.0|         1|
|[0.37843280567331...|  5.0|         8|
|[2.15415597075580...|  9.0|         7|
|[1.39729035940917...|  8.0|         5|
|[2.91102158210244...|  5.0|         7|
|[2.64902963971322...|  9.0|         7|
|[0.0,5.2529516156...|  7.0|         0|
|[1.01885755373585...|  3.0|         6|
|[1.45551079105122...|  3.0|         6|
|[2.88191136628141...|  9.0|         7|
|[0.69864517970458...|  2.0|         0|
|[0.0,4.5113584463...|  2.0|         0|
|[0.34932258985229...|  5.0|         8|
+--------------------+-----+----------+
only showing top 20 rows



In [6]:
from pyspark.ml.clustering import KMeans
# Assemble the raw (unscaled) feature columns of dfpen into `features`.
# No earlier active cell defines `penlpoints`, so it is built here to keep
# this cell runnable on a fresh kernel (every column but the last, `label`,
# is used as a feature).
penlpoints = VectorAssembler(outputCol='features',inputCols=dfpen.columns[:-1]).transform(dfpen)
# NOTE(review): this rebinds `kmodel`, replacing any model fitted earlier.
# The seed is fixed so the fit is reproducible across kernel restarts.
kmodel = KMeans(k = 10,maxIter=200,tol =0.1, seed=42).fit(penlpoints)

22/03/03 14:03:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
# NOTE(review): rebinds `pred` (also assigned from scaled_df in an earlier
# cell), so the evaluation cells below score whichever assignment ran last.
pred = kmodel.transform(penlpoints)

## Evaluation

In [39]:
# Cluster centroids of the currently bound `kmodel`, one array per cluster,
# expressed in the feature space that model was fitted on.
centers = kmodel.clusterCenters()
centers

[array([0.38474181, 4.99547303, 1.76607907, 5.1194924 , 2.10075496,
        3.00980384, 1.8530205 , 1.52383537, 0.98279362, 0.51924834,
        0.42525131, 0.36022228, 1.97317494, 0.50155665, 2.30023528,
        0.51445069]),
 array([0.65639928, 4.85396223, 0.31194912, 2.2450629 , 0.77053946,
        0.21634928, 2.28969031, 0.31437175, 2.82425296, 1.70642175,
        2.2634699 , 3.18142842, 1.85456076, 2.84968096, 0.228292  ,
        1.80972175]),
 array([1.23896964e+00, 6.10340639e+00, 7.21006247e-01, 4.14915227e+00,
        1.12363019e-01, 1.92312032e+00, 1.53338797e+00, 1.44627262e+00,
        2.57507269e+00, 1.95451742e+00, 2.31805660e+00, 2.23981260e+00,
        3.19998655e+00, 9.45637265e-01, 1.49741955e+00, 2.93465250e-03]),
 array([0.1576717 , 3.58883883, 1.20773608, 3.68161291, 2.11677645,
        3.32845759, 2.84915717, 3.1863166 , 2.26307428, 2.72883316,
        1.91572589, 1.87351391, 2.5478725 , 0.72729471, 1.18475305,
        0.03736315]),
 array([2.36903138, 5.14655466, 

In [42]:
from pyspark.ml.evaluation import ClusteringEvaluator
# Score the cluster assignments in `pred` with a default ClusteringEvaluator
# and print the resulting silhouette value.
evaluator = ClusteringEvaluator()
silhouette= evaluator.evaluate(pred)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.4307650706198399


In [10]:
from pyspark.ml.evaluation import ClusteringEvaluator
# NOTE(review): repeats the silhouette evaluation on `pred`; the value depends
# on which model's predictions `pred` is currently bound to.
ClusteringEvaluator().evaluate(pred)

0.4404754324875236

In [43]:
# Shut down the SparkContext at the end of the notebook; Spark-dependent cells
# need a new context before they can be re-run.
sc.stop()