In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
# Start a Spark session named "Q2", or attach to one that is already running.
spark = (
    SparkSession.builder
    .appName("Q2")
    .getOrCreate()
)

In [None]:
# Read the retail CSV: the first row supplies column names and Spark infers
# each column's type from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/Online Retail.csv")
)
# Preview the first few rows to sanity-check the parsed columns.
data.show(5)

+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/01/2010 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/01/2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/01/2010 08:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/01/2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/01/2010 08:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
only showing top 5 rows



In [None]:
# Discard rows that are missing any of the fields used downstream.
data = data.na.drop(subset=["Quantity", "UnitPrice", "CustomerID"])

# Keep only strictly positive quantities and prices (optional but common).
data = data.where((data["Quantity"] > 0) & (data["UnitPrice"] > 0))

In [None]:
# Pack the two numeric columns into a single vector column named "features",
# which is the input the scaler in the next step reads.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["Quantity", "UnitPrice"],
)
assembled = assembler.transform(data)

In [None]:
# Rescale each feature to unit standard deviation without centering
# (withMean=False), writing the result to "scaledFeatures".
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)
# Learn the per-feature scaling from the assembled rows, then apply it to them.
scaled_model = scaler.fit(assembled)
final_data = scaled_model.transform(assembled)

In [None]:
# Fit K-Means with 4 clusters on the scaled features; the fixed seed makes the
# initialisation repeatable between runs.
kmeans = KMeans(
    k=4,
    seed=1,
    featuresCol='scaledFeatures',
)
model = kmeans.fit(final_data)

In [None]:
# Assign every row to a cluster; the model adds a "prediction" column.
predictions = model.transform(final_data)

# Show a small sample of the raw inputs next to their assigned cluster.
predictions.select(["CustomerID", "Quantity", "UnitPrice", "prediction"]).show(n=10)

+----------+--------+---------+----------+
|CustomerID|Quantity|UnitPrice|prediction|
+----------+--------+---------+----------+
|     17850|       6|     2.55|         0|
|     17850|       6|     3.39|         0|
|     17850|       8|     2.75|         0|
|     17850|       6|     3.39|         0|
|     17850|       6|     3.39|         0|
|     17850|       2|     7.65|         3|
|     17850|       6|     4.25|         0|
|     17850|       6|     1.85|         0|
|     17850|       6|     1.85|         0|
|     13047|      32|     1.69|         0|
+----------+--------+---------+----------+
only showing top 10 rows



In [None]:
# Score the clustering with the silhouette metric (squared Euclidean distance),
# computed on the same scaled features the model was trained on.
evaluator = ClusteringEvaluator(
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
    featuresCol='scaledFeatures',
)
score = evaluator.evaluate(predictions)
print(f"Silhouette Score = {score:.3f}")

# The centers are expressed in the scaled feature space, not in original units,
# because the model was fit on "scaledFeatures".
centers = model.clusterCenters()
print("Cluster Centers:")
for cluster_id, centroid in enumerate(centers):
    print(f"Cluster {cluster_id}: {centroid}")

Silhouette Score = 0.678
Cluster Centers:
Cluster 0: [0.29898707 0.38062989]
Cluster 1: [2.72598419e-02 2.92902030e+01]
Cluster 2: [12.16879344  0.22828607]
Cluster 3: [0.08580147 1.54068214]
