Write and implement the following problem in PySpark.

The youtube.csv file contains the following fields:

a. Source (News channel links)
b. Target (YouTube video links)
c. Link(Count)

A. Apply KMEANS clustering algorithm on above dataset

B. Use hyperparameters to improve the result of KMEANS

C. Find no of points in smallest cluster

D. Apply RandomForest algorithm on the above dataset

E. Use hyperparameters to improve the result of RandomForest


In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import ClusteringEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Initialize Spark Session
spark = SparkSession.builder.appName("KMeans_RandomForest_Analysis").getOrCreate()




In [2]:
# Load the data
df = spark.read.csv("youtube.csv", header=True, inferSchema=True).limit(1000)

# Show the schema and first few records to understand the data
df.printSchema()
df.show(5)


root
 |-- Source: string (nullable = true)
 |-- Target: string (nullable = true)
 |-- Link: integer (nullable = true)

+--------------------+--------------------+----+
|              Source|              Target|Link|
+--------------------+--------------------+----+
|https://www.youtu...|https://www.youtu...|   1|
|https://www.youtu...|https://www.youtu...|   1|
|https://www.youtu...|https://www.youtu...|   1|
|https://www.youtu...|https://www.youtu...|   1|
|https://www.youtu...|https://www.youtu...|   1|
+--------------------+--------------------+----+
only showing top 5 rows



# Task A

In [3]:
# Assemble features (just use 'Link' as the feature)
assembler = VectorAssembler(inputCols=["Link"], outputCol="features")
df_assembled = assembler.transform(df)

# Apply KMeans Clustering
kmeans = KMeans().setK(3).setSeed(1)  # Set K to 3 clusters (adjust as needed)
model = kmeans.fit(df_assembled)
predictions = model.transform(df_assembled)

# Show the clustering results
predictions.select("Source", "Target", "Link", "prediction").show(5)


+--------------------+--------------------+----+----------+
|              Source|              Target|Link|prediction|
+--------------------+--------------------+----+----------+
|https://www.youtu...|https://www.youtu...|   1|         0|
|https://www.youtu...|https://www.youtu...|   1|         0|
|https://www.youtu...|https://www.youtu...|   1|         0|
|https://www.youtu...|https://www.youtu...|   1|         0|
|https://www.youtu...|https://www.youtu...|   1|         0|
+--------------------+--------------------+----+----------+
only showing top 5 rows



# Task B

In [4]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Set up the hyperparameter grid
paramGrid = (ParamGridBuilder()
             .addGrid(kmeans.k, [2, 3, 4, 5])  # Try different values of K
             .addGrid(kmeans.maxIter, [10, 20, 30])  # Varying max iterations
             .build())

# Set up cross-validation
evaluator = ClusteringEvaluator(predictionCol="prediction")
crossval = CrossValidator(estimator=kmeans,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)  # 3-fold cross-validation

# Run cross-validation and choose the best model
cvModel = crossval.fit(df_assembled)

# Get the best model and print the results
bestModel = cvModel.bestModel
print(f"Best number of clusters (K): {bestModel.getK()}")


Py4JJavaError: An error occurred while calling o134.evaluate.
: java.lang.AssertionError: assertion failed: Number of clusters must be greater than one.
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette$.computeSilhouetteScore(ClusteringMetrics.scala:401)
	at org.apache.spark.ml.evaluation.ClusteringMetrics.silhouette(ClusteringMetrics.scala:55)
	at org.apache.spark.ml.evaluation.ClusteringEvaluator.evaluate(ClusteringEvaluator.scala:109)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:76)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)


In [5]:
# Apply KMeans with a new value of K
kmeans = KMeans(k=2, seed=1)  # Trying K=2 clusters
model = kmeans.fit(df_assembled)
predictions = model.transform(df_assembled)

# Check how many unique clusters are produced
cluster_counts = predictions.select("prediction").distinct().count()
print(f"Number of unique clusters: {cluster_counts}")

# Only evaluate if there is more than one cluster
if cluster_counts > 1:
    # Apply ClusteringEvaluator
    evaluator = ClusteringEvaluator(predictionCol="prediction")
    silhouette_score = evaluator.evaluate(predictions)
    print(f"Silhouette score: {silhouette_score}")
else:
    print("Clustering failed to produce multiple clusters.")


Number of unique clusters: 1
Clustering failed to produce multiple clusters.


# Task C

In [6]:
# Count the number of points in each cluster
cluster_sizes = predictions.groupBy("prediction").count().orderBy("count")

# Show the sizes of the clusters
cluster_sizes.show()

# Find the smallest cluster size
smallest_cluster_size = cluster_sizes.agg({"count": "min"}).collect()[0][0]
print(f"Smallest cluster has {smallest_cluster_size} points.")


+----------+-----+
|prediction|count|
+----------+-----+
|         0| 1000|
+----------+-----+

Smallest cluster has 1000 points.


# Task D

In [None]:
# RandomForest requires a label column, so we'll assume 'Link' as a label
df_rf = df.select("Link", "Source", "Target")

# Prepare the features by vectorizing 'Source' and 'Target'
# We need to convert the categorical columns to numerical form using StringIndexer

from pyspark.ml.feature import StringIndexer

source_indexer = StringIndexer(inputCol="Source", outputCol="SourceIndex")
target_indexer = StringIndexer(inputCol="Target", outputCol="TargetIndex")

df_rf = source_indexer.fit(df_rf).transform(df_rf)
df_rf = target_indexer.fit(df_rf).transform(df_rf)

# Assemble features from SourceIndex, TargetIndex
assembler_rf = VectorAssembler(inputCols=["SourceIndex", "TargetIndex"], outputCol="features")
df_rf_assembled = assembler_rf.transform(df_rf)

# Now, apply RandomForestClassifier
rf = RandomForestClassifier(labelCol="Link", featuresCol="features", maxBins=500)
rf_model = rf.fit(df_rf_assembled)

# Make predictions
rf_predictions = rf_model.transform(df_rf_assembled)
rf_predictions.select("Link", "prediction").show(5)


# Task E

In [None]:
# Set up hyperparameter grid for RandomForest
paramGrid_rf = (ParamGridBuilder()
                .addGrid(rf.numTrees, [10, 20, 30])  # Number of trees
                .addGrid(rf.maxDepth, [5, 10, 15])   # Depth of the trees
                .build())

# Set up cross-validation for RandomForest
evaluator_rf = MulticlassClassificationEvaluator(labelCol="Link", predictionCol="prediction")
crossval_rf = CrossValidator(estimator=rf,
                             estimatorParamMaps=paramGrid_rf,
                             evaluator=evaluator_rf,
                             numFolds=3)  # 3-fold cross-validation

# Run cross-validation and choose the best model
cvModel_rf = crossval_rf.fit(df_rf_assembled)

# Get the best model and print the results
best_rf_model = cvModel_rf.bestModel
print(f"Best number of trees: {best_rf_model.getNumTrees}")
print(f"Best tree depth: {best_rf_model.getMaxDepth}")
