<a href="https://colab.research.google.com/github/usshaa/SMBDA/blob/main/C-5.10%3A%20Predicting_Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Building Machine Learning Pipelines in PySpark

Machine learning pipelines in PySpark streamline the process of building and deploying machine learning models by chaining multiple stages, including data preprocessing, feature engineering, model training, and evaluation, into a cohesive workflow. Here's a detailed guide on building a machine learning pipeline using PySpark.

### Example: Predicting Customer Churn

We'll build a machine learning pipeline for a customer churn prediction task using a hypothetical dataset with columns: `User Id`, `First Name`, `Last Name`, `Sex`, `Email`, `Phone`, `Date of birth`, `Job Title`, and a binary label `Churn`.

In [None]:
# PySpark imports stay first: faker is not installed on every runtime, and a
# failed import aborts the remaining lines of this cell.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, current_date, floor, months_between
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from faker import Faker
from random import choice
import pandas as pd

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-4125139553725403>:7[0m
[1;32m      5[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01mml[39;00m [38;5;28;01mimport[39;00m Pipeline
[1;32m      6[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01mml[39;00m[38;5;21;01m.[39;00m[38;5;21;01mevaluation[39;00m [38;5;28;01mimport[39;00m BinaryClassificationEvaluator
[0;32m----> 7[0m [38;5;28;01mfrom[39;00m [38;5;21;01mfaker[39;00m [38;5;28;01mimport[39;00m Faker
[1;32m      8[0m [38;5;28;01mfrom[39;00m [38;5;21;01mrandom[39;00m [38;5;28;01mimport[39;00m choice
[1;32m      9[0m [38;5;28;01mimport[39;00m [38;5;21;01mpandas[39;00m [38;5;28;01mas[39;00m [38;5;21;01mpd[39;00m

File [0;32m/databricks/python_shell/dbruntime/PythonPack

In [None]:
# Step 1: Initialize Spark Session
# getOrCreate() returns the session that is already running when there is one,
# rather than starting a second session.
session_builder = SparkSession.builder.appName("ML Pipeline Example")
spark = session_builder.getOrCreate()

In [None]:
# Step 2: Load Data
# The first CSV row supplies the column names; column types are inferred from
# the values rather than read as plain strings.
CUSTOMER_CSV_PATH = "/FileStore/tables/customer_data.csv"
data = spark.read.options(header=True, inferSchema=True).csv(CUSTOMER_CSV_PATH)

In [None]:
# Step 3: Data Preprocessing
# Map the categorical string columns to numeric indices. Both indexers use
# handleInvalid="keep", so a null or a category that was absent from the
# training split is assigned an extra index at transform time instead of
# raising an error.
indexer_sex = StringIndexer(inputCol="Sex", outputCol="SexIndex", handleInvalid="keep")
indexer_job = StringIndexer(inputCol="Job Title", outputCol="JobTitleIndex", handleInvalid="keep")

In [None]:
# Calculate Age from Date of Birth
# Completed years = whole months elapsed / 12, rounded down. Dividing a day
# count by 365 ignores leap days and can report a birthday a few days early.
# NOTE: the result depends on the day the notebook is run (current_date()).
data = data.withColumn(
    "Age",
    floor(months_between(current_date(), col("Date of birth")) / 12).cast("int"),
)

In [None]:
# Select relevant features for the model
# The three numeric inputs are packed into a single vector column.
feature_columns = ["Age", "SexIndex", "JobTitleIndex"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# withStd / withMean are written out at their default values: scale each
# feature to unit standard deviation without centering it.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

In [None]:
# Step 4: Build Machine Learning Pipeline
# The classifier reads the scaled feature vector and learns the "Churn" label.
lr = LogisticRegression(labelCol="Churn", featuresCol="scaledFeatures")

# Stages run in list order: index the categoricals, assemble, scale, classify.
pipeline_stages = [indexer_sex, indexer_job, assembler, scaler, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Step 5: Train and Evaluate the Model
# 80% of the rows go to training and 20% are held out; the seed is fixed so
# the split can be repeated.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(weights=split_weights, seed=42)

In [None]:
# Fit every pipeline stage on the training split, then score the held-out split.
model = pipeline.fit(dataset=train_data)
predictions = model.transform(dataset=test_data)

In [None]:
# Scores predictions against the "Churn" label. The metric and the
# raw-prediction column are spelled out; both equal the library defaults.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Churn",
    metricName="areaUnderROC",
)

In [None]:
# The BinaryClassificationEvaluator used here measures area under the ROC
# curve (its default metric), not accuracy, so the value is named and printed
# as such.
area_under_roc = evaluator.evaluate(predictions)
print(f"Area under ROC: {area_under_roc:.2f}")

Model Accuracy: 0.54


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# A single evaluator is re-pointed at each metric in turn; setMetricName()
# returns the evaluator itself, so the calls can be chained.
evaluator = MulticlassClassificationEvaluator(labelCol="Churn", predictionCol="prediction")

precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
f1_score = evaluator.setMetricName("f1").evaluate(predictions)

# Print the evaluation metrics (precision and recall are the label-weighted variants)
for metric_label, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1_score),
):
    print(f"{metric_label}: {metric_value:.2f}")

Precision: 0.58
Recall: 0.56
F1-score: 0.53


In [None]:
from pyspark.sql import functions as F
import pandas as pd

# Cast the true and predicted labels to double so both axes of the matrix
# carry the same label values (0.0 / 1.0).
predictions = (
    predictions
    .withColumn("prediction", F.col("prediction").cast("double"))
    .withColumn("Churn", F.col("Churn").cast("double"))
)

# Count rows for every (true label, predicted label) pair
conf_matrix = predictions.groupBy("Churn", "prediction").count()

# The grouped result has one row per label pair, so collecting it is cheap
conf_matrix_pd = conf_matrix.toPandas()

# Pivot into a matrix: rows are the true label, columns the predicted label.
# Reindexing on both binary labels keeps the matrix 2x2 even when a class
# never occurs or is never predicted; the missing cells become 0 and the
# counts stay integers.
CLASS_LABELS = [0.0, 1.0]
conf_matrix_pd = (
    conf_matrix_pd
    .pivot(index="Churn", columns="prediction", values="count")
    .reindex(index=CLASS_LABELS, columns=CLASS_LABELS)
    .fillna(0)
    .astype(int)
)

# Display confusion matrix
print("Confusion Matrix:")
conf_matrix_pd



Confusion Matrix:


prediction,0.0,1.0
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,26,56
1.0,16,64


In [None]:
# Step 6: Save and Load Model (Optional)
# write().overwrite() replaces an existing "pipeline_model" directory. A plain
# save() raises once the path exists, which breaks re-running the notebook.
model.write().overwrite().save("pipeline_model")

# Read the fitted pipeline back from the same path so it can be reused
# without retraining.
loaded_model = PipelineModel.load("pipeline_model")