In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName("ChurnPrediction").getOrCreate()

data = [
    (30, 0.5, 5, 0.1, 0), (60, 1.2, 20, 0.3, 1), (50, 1.0, 15, 0.2, 0),
    (40, 1.5, 30, 0.5, 0), (80, 2.0, 25, 0.4, 1), (55, 1.2, 10, 0.1, 0),
    (45, 0.8, 40, 0.2, 0), (90, 2.5, 35, 0.3, 1), (20, 0.5, 50, 0.5, 1),
    (35, 0.2, 5, 0.1, 0), (70, 0.1, 10, 0.2, 0), (65, 0.5, 20, 0.1, 1),
    (30, 0.3, 15, 0.3, 0), (85, 0.4, 25, 0.4, 1), (75, 0.8, 30, 0.5, 1),
    (95, 0.7, 35, 0.2, 0), (100, 1.5, 40, 0.3, 0), (25, 1.8, 45, 0.4, 1),
    (15, 1.0, 50, 0.5, 1), (10, 0.9, 5, 0.1, 0),
    (50, 0.6, 10, 0.3, 0), (60, 0.4, 20, 0.2, 0), (55, 0.2, 30, 0.3, 1),
    (45, 0.3, 40, 0.1, 0), (90, 0.5, 10, 0.4, 1), (95, 0.4, 15, 0.2, 0),
    (40, 0.5, 20, 0.3, 1), (30, 0.7, 25, 0.4, 1), (20, 0.3, 30, 0.5, 0),
    (10, 0.2, 35, 0.4, 1), (0, 0.1, 5, 0.1, 0), (10, 0.5, 20, 0.2, 1),
    (20, 0.6, 15, 0.3, 0), (30, 0.7, 10, 0.1, 1), (40, 0.9, 25, 0.4, 0),
    (50, 0.8, 35, 0.5, 1), (40, 0.4, 20, 0.3, 0), (35, 0.5, 30, 0.4, 1),
    (30, 0.6, 40, 0.5, 0), (25, 0.7, 15, 0.1, 1), (20, 0.3, 5, 0.2, 0),
    (15, 0.2, 10, 0.3, 0), (10, 0.1, 5, 0.1, 1), (5, 0.0, 0, 0.0, 0),
    (100, 0.6, 10, 0.2, 0), (95, 0.7, 20, 0.3, 1), (90, 0.8, 15, 0.2, 0),
    (85, 0.9, 25, 0.4, 1), (80, 0.8, 30, 0.5, 1), (75, 0.7, 35, 0.4, 0),
    (70, 0.6, 40, 0.3, 0), (60, 0.5, 5, 0.1, 0), (50, 0.4, 20, 0.2, 1),
    (40, 0.5, 15, 0.3, 0), (30, 0.6, 10, 0.1, 0), (20, 0.2, 25, 0.4, 1),
    (10, 0.3, 30, 0.5, 1), (0, 0.1, 5, 0.2, 0), (10, 0.5, 20, 0.3, 0),
    (20, 0.6, 15, 0.4, 1), (30, 0.7, 25, 0.5, 1), (40, 0.4, 10, 0.1, 0),
    (50, 0.3, 5, 0.2, 0), (60, 0.2, 20, 0.3, 1), (70, 0.5, 15, 0.4, 1)
]

columns = ['Network Quality: Latency', 'Network Quality: Dropped Calls', 'Internet Usage (GB)', 'Network Quality: Data Drops', 'Churn']
df = spark.createDataFrame(data, columns)

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

assembler = VectorAssembler(
    inputCols=['Network Quality: Latency', 'Network Quality: Dropped Calls', 'Internet Usage (GB)', 'Network Quality: Data Drops'],
    outputCol="features"
)

scaler = StandardScaler(
    inputCol="features", outputCol="scaled_features", withStd=True, withMean=True
)

dt = DecisionTreeClassifier(
    featuresCol="scaled_features", labelCol="Churn", maxDepth=5
)

pipeline = Pipeline(stages=[assembler, scaler, dt])

model = pipeline.fit(train_df)

predictions = model.transform(test_df)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="Churn", predictionCol="prediction", metricName="precisionByLabel"
)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="Churn", predictionCol="prediction", metricName="recallByLabel"
)
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="Churn", predictionCol="prediction", metricName="f1"
)

precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)
f1 = f1_evaluator.evaluate(predictions)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

accuracy_evaluator = BinaryClassificationEvaluator(
    labelCol="Churn", rawPredictionCol="prediction"
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f'Accuracy: {accuracy:.2f}')


Precision: 0.70
Recall: 0.78
F1 Score: 0.63
Accuracy: 0.59
