<a href="https://colab.research.google.com/github/vineeth-10/BDA_02/blob/main/BDA_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Build a Classification Model with Spark with a dataset of your choice**

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Start Spark Session
spark = SparkSession.builder.appName("IrisClassification").getOrCreate()

# Step 2: Load Dataset
df = spark.read.csv("/content/iris.csv", header=True, inferSchema=True)

# Step 3: Convert label (species) to numeric
indexer = StringIndexer(inputCol="species", outputCol="label")
df = indexer.fit(df).transform(df)

# Step 4: Assemble features into a vector
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features"
)
data = assembler.transform(df).select("features", "label")

# Step 5: Split dataset into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Step 6: Train Logistic Regression Model
lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(train_data)

# Step 7: Make Predictions
predictions = model.transform(test_data)

# Step 8: Evaluate Model Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy = {accuracy:.2f}")

# Stop Spark Session
spark.stop()


Test Accuracy = 1.00


**2. Build a Clustering Model with Spark with a dataset of your choice**

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Step 1: Create Spark session
spark = SparkSession.builder.appName("IrisClustering").getOrCreate()

# Step 2: Load dataset
df = spark.read.csv("/content/iris.csv", header=True, inferSchema=True)

# Step 3: Drop the label column (species) for unsupervised learning
df = df.drop("species")
df = df.toDF(*[c.strip() for c in df.columns])  # Remove any extra spaces

# Step 4: Assemble features
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features"
)
feature_data = assembler.transform(df).select("features")

# Step 5: Train KMeans model
kmeans = KMeans(k=3, seed=1)  # 3 clusters (one for each Iris species)
model = kmeans.fit(feature_data)

# Step 6: Make predictions
predictions = model.transform(feature_data)

# Step 7: Evaluate clustering using Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

print(f"Silhouette Score = {silhouette:.2f}")
print("Cluster Centers:")
for center in model.clusterCenters():
    print(center)

# Stop Spark
spark.stop()


Silhouette Score = 0.59
Cluster Centers:
[4.9        3.23333333 1.36666667 0.2       ]
[6.  2.2 5.  1.5]
[6.4 2.8 5.6 2.1]


**3. Build a Recommendation Engine with Spark with a dataset of your
choice**


In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Step 1: Start Spark Session
spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

# Step 2: Load or create a ratings dataset
# Sample data: userId, movieId, rating
data = [
    (0, 0, 4.0), (0, 1, 2.0), (0, 2, 5.0),
    (1, 0, 5.0), (1, 2, 3.0),
    (2, 1, 4.0), (2, 2, 2.0)
]
columns = ["userId", "movieId", "rating"]
df = spark.createDataFrame(data, columns)

# Step 3: Split the data
(training, test) = df.randomSplit([0.8, 0.2])

# Step 4: Build ALS recommendation model
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)
model = als.fit(training)

# Step 5: Evaluate the model
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.2f}")

# Step 6: Generate top 3 movie recommendations for each user
user_recs = model.recommendForAllUsers(3)
user_recs.show(truncate=False)

# Step 7: Generate top 3 user recommendations for each movie
movie_recs = model.recommendForAllItems(3)
movie_recs.show(truncate=False)

# Stop Spark
spark.stop()


Root-mean-square error = 3.05
+------+------------------------------------------------+
|userId|recommendations                                 |
+------+------------------------------------------------+
|0     |[{0, 3.9031894}, {1, 2.0078704}, {2, 1.0150895}]|
|1     |[{0, 4.941399}, {1, 2.6577547}, {2, 1.3459307}] |
|2     |[{1, 3.8469234}, {0, 3.6589017}, {2, 1.983662}] |
+------+------------------------------------------------+

+-------+------------------------------------------------+
|movieId|recommendations                                 |
+-------+------------------------------------------------+
|0      |[{1, 4.941399}, {0, 3.9031894}, {2, 3.6589017}] |
|1      |[{2, 3.8469234}, {1, 2.6577547}, {0, 2.0078704}]|
|2      |[{2, 1.983662}, {1, 1.3459307}, {0, 1.0150895}] |
+-------+------------------------------------------------+

