In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when

spark = SparkSession.builder \
    .appName("DataPreprocessing") \
    .getOrCreate()

df = spark.read.option("header", "true").csv("your_data.csv")

df.show()

numerical_cols = ['col1', 'col2']  
categorical_cols = ['cat_col1']    


for col_name in numerical_cols:
    mean_value = df.agg({col_name: "mean"}).collect()[0][0]
    df = df.fillna({col_name: mean_value})

df = df.fillna({'cat_col1': 'missing_value'}) 


assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

pipeline = Pipeline(stages=[assembler, scaler])

model = pipeline.fit(df)
scaled_df = model.transform(df)

scaled_df.select('scaled_features').show()

spark.stop()


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("KMeansClustering") \
    .getOrCreate()
data = [
    (1, 1.0, 2.0),
    (2, 1.5, 1.8),
    (3, 5.0, 8.0),
    (4, 8.0, 8.0),
    (5, 1.0, 0.6),
    (6, 9.0, 11.0),
    (7, 8.0, 2.0),
    (8, 10.0, 2.0),
    (9, 9.0, 3.0)
]

df = spark.createDataFrame(data, ["id", "x", "y"])

assembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")
df = assembler.transform(df)

k = 3

kmeans = KMeans(k=k, seed=1)
model = kmeans.fit(df)

predictions = model.transform(df)

print("Cluster Centers:")
centers = model.clusterCenters()
for center in centers:
    print(center)

print("Predictions:")
predictions.select("id", "features", "prediction").show()



24/10/07 10:07:27 WARN Utils: Your hostname, PGLab2 resolves to a loopback address: 127.0.1.1; using 172.16.57.83 instead (on interface enp1s0)
24/10/07 10:07:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/07 10:07:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/07 10:07:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/10/07 10:07:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Cluster Centers:
[7.33333333 9.        ]
[1.16666667 1.46666667]
[9.         2.33333333]
Predictions:
+---+----------+----------+
| id|  features|prediction|
+---+----------+----------+
|  1| [1.0,2.0]|         1|
|  2| [1.5,1.8]|         1|
|  3| [5.0,8.0]|         0|
|  4| [8.0,8.0]|         0|
|  5| [1.0,0.6]|         1|
|  6|[9.0,11.0]|         0|
|  7| [8.0,2.0]|         2|
|  8|[10.0,2.0]|         2|
|  9| [9.0,3.0]|         2|
+---+----------+----------+



In [16]:
df = spark.createDataFrame(data, ["id", "x", "y"])
assemebler = VectorAssembler(inputCols=["x","y"],outputCol="features")
df = assemebler.transform(df)
k=3
kmeans=KMeans(k=k,seed=1)
model =kmeans.fit(df)
predictions = model.transform(df)
print("Cluster Center:")
centers = model.clusterCenters()
for center in centers:
    print(center)
    print("predictions")
    predictions.select("id","features")
    predictions.show()
spark.stop()

Cluster Center:
[7.33333333 9.        ]
predictions
+---+----+----+----------+----------+
| id|   x|   y|  features|prediction|
+---+----+----+----------+----------+
|  1| 1.0| 2.0| [1.0,2.0]|         1|
|  2| 1.5| 1.8| [1.5,1.8]|         1|
|  3| 5.0| 8.0| [5.0,8.0]|         0|
|  4| 8.0| 8.0| [8.0,8.0]|         0|
|  5| 1.0| 0.6| [1.0,0.6]|         1|
|  6| 9.0|11.0|[9.0,11.0]|         0|
|  7| 8.0| 2.0| [8.0,2.0]|         2|
|  8|10.0| 2.0|[10.0,2.0]|         2|
|  9| 9.0| 3.0| [9.0,3.0]|         2|
+---+----+----+----------+----------+

[1.16666667 1.46666667]
predictions
+---+----+----+----------+----------+
| id|   x|   y|  features|prediction|
+---+----+----+----------+----------+
|  1| 1.0| 2.0| [1.0,2.0]|         1|
|  2| 1.5| 1.8| [1.5,1.8]|         1|
|  3| 5.0| 8.0| [5.0,8.0]|         0|
|  4| 8.0| 8.0| [8.0,8.0]|         0|
|  5| 1.0| 0.6| [1.0,0.6]|         1|
|  6| 9.0|11.0|[9.0,11.0]|         0|
|  7| 8.0| 2.0| [8.0,2.0]|         2|
|  8|10.0| 2.0|[10.0,2.0]|         2|