In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain a SparkSession for this notebook: reuse one if it already exists,
# otherwise start a new one against the local master.
spark = (
    SparkSession.builder
    .appName("MLTrainingDemo")
    .master("local[*]")
    .getOrCreate()
)


In [3]:
# Load the Titanic CSV: the first line supplies the column names and the
# column types are inferred from the data rather than read as strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("titanic_downloaded.csv")

# Quick sanity check of what was loaded: size, schema, and a few rows.
row_count = df.count()
print("Data count:", row_count)
df.printSchema()
df.show(5)


Data count: 891
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0

In [4]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [5]:
# For simplicity, drop rows that are missing a value in any of the columns
# listed below. Nulls in other columns (e.g. Cabin) do NOT cause a row to
# be dropped.
required_columns = ["Age", "Sex", "Pclass", "Survived"]
df_clean = df.dropna(how="any", subset=required_columns)


In [6]:
# Turn the string column "Sex" into a numeric column "SexIndexed" so it can
# be placed in a feature vector. The indexer first learns the label -> index
# mapping from the cleaned data, then applies it to that same data.
gender_indexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIndexed",
)
gender_indexer_model = gender_indexer.fit(df_clean)
df_indexed = gender_indexer_model.transform(df_clean)


In [7]:
# Combine the individual numeric input columns into one vector column named
# "features"; the vector keeps the order of the columns listed here.
feature_columns = ["Pclass", "Age", "SexIndexed"]
feature_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_features = feature_assembler.transform(df_indexed)


In [9]:
# Expose the target under the column name "label" (Survived -> label), then
# preview the two columns the model will actually be trained on.
df_final = df_features.withColumnRenamed("Survived", "label")
model_input_preview = df_final.select("features", "label")
model_input_preview.show(5)


+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,22.0,0.0]|    0|
|[1.0,38.0,1.0]|    1|
|[3.0,26.0,1.0]|    1|
|[1.0,35.0,1.0]|    1|
|[3.0,35.0,0.0]|    0|
+--------------+-----+
only showing top 5 rows



In [10]:
# Randomly split the rows into training and test sets with 0.7 / 0.3
# weights; the fixed seed makes the split repeatable between runs.
train_df, test_df = df_final.randomSplit(weights=[0.7, 0.3], seed=42)

# Report the size of each split (train is counted first, then test).
train_count = train_df.count()
test_count = test_df.count()
print("Train count:", train_count, "Test count:", test_count)


Train count: 526 Test count: 188


In [11]:
# Configure a logistic-regression estimator on the assembled feature vector
# and the renamed target column, then fit it on the training split only.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)
model = lr.fit(train_df)


In [12]:
# Score the held-out test rows with the fitted model.
predictions = model.transform(test_df)

# Measure ranking quality on the test set as area under the ROC curve.
# The metric is passed as a per-call parameter override, so the evaluator
# object itself is left with its constructor settings.
evaluator = BinaryClassificationEvaluator(labelCol="label")
roc_metric = {evaluator.metricName: "areaUnderROC"}
auc = evaluator.evaluate(predictions, roc_metric)
print("Test AUC:", auc)


Test AUC: 0.8296586059743952


In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Complement AUC with plain accuracy on the same test-set predictions.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    metricName="accuracy",
)
accuracy = acc_evaluator.evaluate(predictions)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.7925531914893617


In [None]:
# Shut down the Spark session now that the analysis is finished. Cells above
# that use `spark` or its DataFrames will need the session recreated (re-run
# the first cell) before they can be executed again.
spark.stop()