# 1. Handling Missing Values and Scaling Numerical Features

In [21]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.feature import Imputer

# Create Spark session
spark = SparkSession.builder.appName("DataPreprocessing").getOrCreate()

# Load data
data = spark.read.csv("data1.csv", header=True, inferSchema=True)

# Handle missing values using Imputer
imputer = Imputer(inputCols=data.columns, outputCols=[f"{col}_imputed" for col in data.columns])
data_imputed = imputer.fit(data).transform(data)

# Assemble features into a vector
feature_cols = [f"{col}_imputed" for col in data.columns]  # Use imputed columns
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data_vector = assembler.transform(data_imputed)

# Scale numerical features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaler_model = scaler.fit(data_vector)
data_scaled = scaler_model.transform(data_vector)

# Show the result
data_scaled.select("scaled_features").show()


+--------------------+
|     scaled_features|
+--------------------+
|[-1.4605934866804...|
|[-1.0954451150103...|
|[-0.7302967433402...|
|[-0.3651483716701...|
|[0.0,-1.286034203...|
|[0.36514837167011...|
|[0.73029674334022...|
|[1.09544511501033...|
|[1.46059348668044...|
+--------------------+



# 2. K-means Clustering

In [13]:
from pyspark.ml.clustering import KMeans

# Set the number of clusters
num_clusters = 3

# Train K-means model
kmeans = KMeans(featuresCol="scaled_features", k=num_clusters)
model = kmeans.fit(data_scaled)

# Make predictions
predictions = model.transform(data_scaled)

# Show cluster assignments
predictions.select("scaled_features", "prediction").show()


+--------------------+----------+
|     scaled_features|prediction|
+--------------------+----------+
|[-1.4605934866804...|         1|
|[-1.0954451150103...|         1|
|[-0.7302967433402...|         2|
|[-0.3651483716701...|         2|
|[0.0,-1.286034203...|         1|
|[0.36514837167011...|         2|
|[0.73029674334022...|         0|
|[1.09544511501033...|         0|
|[1.46059348668044...|         0|
+--------------------+----------+



# 3. Labeling Data Points as Anomalies Based on Cluster Assignments

In [14]:
from pyspark.sql import functions as F

# Assuming a point is an anomaly if it is not assigned to the most populous cluster
# Get the count of points in each cluster
cluster_counts = predictions.groupBy("prediction").count().orderBy(F.desc("count"))

# Get the most populous cluster
most_populous_cluster = cluster_counts.first()["prediction"]

# Label anomalies (1 for anomaly, 0 for normal)
predictions = predictions.withColumn("is_anomaly", F.when(predictions.prediction != most_populous_cluster, 1).otherwise(0))

# Show results
predictions.select("scaled_features", "prediction", "is_anomaly").show()


+--------------------+----------+----------+
|     scaled_features|prediction|is_anomaly|
+--------------------+----------+----------+
|[-1.4605934866804...|         1|         0|
|[-1.0954451150103...|         1|         0|
|[-0.7302967433402...|         2|         1|
|[-0.3651483716701...|         2|         1|
|[0.0,-1.286034203...|         1|         0|
|[0.36514837167011...|         2|         1|
|[0.73029674334022...|         0|         1|
|[1.09544511501033...|         0|         1|
|[1.46059348668044...|         0|         1|
+--------------------+----------+----------+



# 4. Evaluate the Effectiveness of K-means Clustering

In [15]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator(featuresCol="scaled_features", predictionCol="prediction")

silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Show the number of anomalies detected
num_anomalies = predictions.filter(predictions.is_anomaly == 1).count()
print(f"Number of anomalies detected: {num_anomalies}")


Silhouette with squared euclidean distance = 0.8060494821051775
Number of anomalies detected: 6
