In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Start (or reuse) the Spark session used by the Iris cells
spark = (
    SparkSession.builder
    .appName("MLP_SVM_Iris")
    .getOrCreate()
)

# Transformers: pack the four measurements into a single "features" vector,
# and map the "species" string column to a numeric "label" column
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
label_indexer = StringIndexer(inputCol="species", outputCol="label")

# Read the CSV (first row is the header, column types are inferred)
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("IRIS.csv")
)

# Apply the transformers in order: features first, then label
data = assembler.transform(data)
data = label_indexer.fit(data).transform(data)

# 70/30 train/test split; the fixed seed keeps the split repeatable
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)




In [None]:
# Display the prepared DataFrame; its repr lists each column name with its type
data

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string, features: vector, label: double]

In [None]:
# Train a multilayer perceptron on the training split.
# `layers` lists the layer sizes from input to output: the first entry must equal
# the number of assembled feature columns (4) and the last entry must equal the
# number of distinct label classes (3 here); 10 and 5 are hidden-layer sizes.
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=[4, 10, 5, 3],  # Adjust layers as needed
    seed=42,  # pin weight initialisation so the reported accuracy is reproducible
)
mlp_model = mlp.fit(train_data)

# Evaluator that scores a predictions DataFrame by overall accuracy
# (compares the "prediction" column against the "label" column)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")



In [None]:
# Score the held-out split with the trained MLP and report accuracy
mlp_predictions = mlp_model.transform(test_data)
mlp_accuracy = evaluator.evaluate(mlp_predictions)
print(f"MLP Accuracy: {mlp_accuracy}")

# Shut down the Spark session; DataFrames created from it are unusable afterwards
spark.stop()

MLP Accuracy: 0.9565217391304348


In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the generated dataset is identical on every run
np.random.seed(42)

# Dataset dimensions
num_samples = 1000  # number of rows
num_features = 2    # number of feature columns

# Feature matrix of shape (num_samples, num_features), uniform on [0, 1)
X = np.random.rand(num_samples, num_features)

# Binary labels (0 or 1).
# NOTE: these are drawn independently of X, so the features carry no information
# about the label and a classifier trained on this data should score near chance.
y = np.random.randint(0, 2, size=num_samples)

# Column headers: feature0 .. feature{num_features-1}, then label
column_names = [f'feature{i}' for i in range(num_features)]
column_names.append('label')

# Append the labels as a final column (they are promoted to float alongside X)
# and write the table to disk without the index column
data = np.hstack((X, y.reshape(-1, 1)))
df = pd.DataFrame(data, columns=column_names)
df.to_csv('synthetic_dataset.csv', index=False)


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Start (or reuse) a Spark session for the LinearSVC example
spark = (
    SparkSession.builder
    .appName("LinearSVCExample")
    .getOrCreate()
)

# Assembler that packs the two feature columns into a single "features" vector
feature_columns = ["feature0", "feature1"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Read the synthetic CSV (header row present, column types inferred) and add the vector column
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("synthetic_dataset.csv")
)
data = assembler.transform(data)

# 70/30 train/test split; the fixed seed keeps the split repeatable
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Fit a linear SVM on the training split, then predict on the held-out rows
svm = LinearSVC(featuresCol="features", labelCol="label")
svm_model = svm.fit(train_data)
svm_predictions = svm_model.transform(test_data)

# Score the predictions by overall accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(svm_predictions)
print(f"Accuracy: {accuracy}")

# Shut down the Spark session now that the example is finished
spark.stop()


Accuracy: 0.5703125
