In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [21]:
# Load the Titanic dataset
spark = SparkSession.builder.appName("TitanicSVM").getOrCreate()



In [22]:
titanic_data = spark.read.csv("titanic.csv", header=True, inferSchema=True)


In [23]:
# Select relevant features and label
selected_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Survived"]
data = titanic_data.select(*selected_columns).na.drop()

In [24]:
# Convert categorical features to numerical using OneHotEncoding
from pyspark.ml.feature import StringIndexer, OneHotEncoder
sex_indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
sex_encoder = OneHotEncoder(inputCol="SexIndex", outputCol="SexVec")
assembler = VectorAssembler(inputCols=["Pclass", "Age", "SibSp", "Parch", "Fare", "SexVec"],
                            outputCol="features")

In [25]:
# Create an SVM model
svm = LinearSVC(featuresCol="features", labelCol="Survived", maxIter=10)

In [26]:
# Create a pipeline
pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler, svm])


In [28]:
# Split data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=123)

In [29]:
# Train the SVM model
svm_model = pipeline.fit(train_data)


In [30]:
# Make predictions on the test data
predictions = svm_model.transform(test_data)

In [31]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="Survived")
accuracy = evaluator.evaluate(predictions)

In [32]:
print(f"Accuracy: {accuracy}")


Accuracy: 0.8492000790045426


In [33]:
# Stop the Spark session
spark.stop()