In [1]:
import findspark
import os
os.environ['PATH'].split(';')


findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans


spark = SparkSession.builder.appName("midterm").getOrCreate()

df = spark.read .format("csv").option("header", "true").load("drive_stats_2019_Q1/*.csv")
df_temp = df.select("smart_1_normalized")

print("All columns in the dataframe")
print(df_temp.columns)
feat_cols = [ 'smart_1_normalized']

for column in feat_cols:
    df_temp = df_temp.withColumn(column,df_temp[column].cast(IntegerType()))


# Run the K-means on all the datapoints of the dataframe on column smart attribute smart_normalized_1.
# For each point:
# Predict the cluster they belong to. {0,1,2 ....}
# Calculate the distance between the point and the centroid of that cluster.
# Based on a given threshold, flag outliers.

df_temp.dropna()
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')
final_data = vec_assembler.setHandleInvalid("skip").transform(df_temp)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scalerModel = scaler.fit(final_data)
cluster_final_data = scalerModel.transform(final_data)

# K>5 did show improvement in clustering

kmeans2 = KMeans(featuresCol='scaledFeatures',k=5)
model_k2 = kmeans2.fit(cluster_final_data)

print("All centroids")
centers = model_k2.clusterCenters()
centers = [center.tolist() for center in centers]
print(centers)

All columns in the dataframe
['smart_1_normalized']
All centroids
[[4.2730568163083795], [6.726588196069323], [5.816484933988873], [4.722193215367085], [11.588285637607676]]


In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from scipy.spatial import distance

df_preds2 = model_k2.transform(cluster_final_data).orderBy("prediction")

distance_udf = F.udf(lambda x,y: float(distance.euclidean(x, centers[y])), FloatType())
df_preds2 = df_preds2.withColumn('distance', distance_udf(F.col('scaledFeatures'),F.col('prediction')))
df_preds2 = df_preds2.orderBy("prediction").orderBy(F.desc("distance"))
# df_preds2.show()
# q = df_preds2.approxQuantile("distance", [0.5], 1)
# q

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(df_preds2['prediction']).orderBy(df_preds2['distance'].desc())

threshold = 10
print("Outliers for clusters")
df_preds2.select('smart_1_normalized','prediction','distance', rank().over(window).alias('rank')) .filter(F.col('rank') <= threshold) .show(50) 

Outliers for clusters
+------------------+----------+---------+----+
|smart_1_normalized|prediction| distance|rank|
+------------------+----------+---------+----+
|               153|         1|2.1384923|   1|
|               149|         1|1.9067256|   2|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.4109425|   3|
|               109|         1|0.41094