# Big Data Machine Learning Classification with Spark

With this project, we will create a machine learning model to predict customers' behavior using spark and evaluate the performance of this model.

### Install Spark

In [1]:
#pip install pyspark

### Import Libraries

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand

In [3]:
# Step 1: Initialize Spark
spark = SparkSession.builder.appName("Customers'BehaviorPrediction").getOrCreate()

In [4]:
# Step 2: Load the dataset
data = spark.read.csv("churn.csv", inferSchema=True, header=True)

In [5]:
data

DataFrame[_c0: int, Names: string, Age: double, Total_Purchase: double, Account_Manager: int, Years: double, Num_Sites: double, Churn: int]

In [6]:
data.count()   #number of rows

900

In [7]:
len(data.columns)   #number of cols

8

In [8]:
data.show(5)

+---+----------------+----+--------------+---------------+-----+---------+-----+
|_c0|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|
+---+----------------+----+--------------+---------------+-----+---------+-----+
|  0|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|  1|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|  2|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|  3|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
|  4|  Cynthia Norton|37.0|       9191.58|              0| 5.56|      9.0|    1|
+---+----------------+----+--------------+---------------+-----+---------+-----+
only showing top 5 rows



In [9]:
# Step 3: Prepare the data for training
feature_columns = data.columns[:-1]
# Remove the "Names" column
if "Names" in feature_columns:
    feature_columns.remove("Names")
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data).select("features", "Churn")

In [10]:
data_random = data.orderBy(rand()).limit(10)
data_random.show()

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[826.0,33.0,12249...|    0|
|[619.0,45.0,8871....|    0|
|[501.0,40.0,10762...|    0|
|[653.0,25.0,9672....|    0|
|[886.0,46.0,13547...|    0|
|[846.0,44.0,5002....|    0|
|[894.0,45.0,4863....|    0|
|[9.0,40.0,8066.94...|    1|
|[442.0,40.0,10780...|    0|
|[694.0,41.0,8907....|    0|
+--------------------+-----+



In [11]:
# Step 4: Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [12]:
train_data.show(3)

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[0.0,42.0,11066.8...|    1|
|[1.0,41.0,11916.2...|    1|
|[3.0,42.0,8010.76...|    1|
+--------------------+-----+
only showing top 3 rows



In [13]:
test_data.show(3)

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[2.0,38.0,12884.7...|    1|
|[6.0,44.0,11331.5...|    1|
|[8.0,43.0,14062.6...|    1|
+--------------------+-----+
only showing top 3 rows



In [14]:
# Step 5: Train a logistic regression model
gbt = GBTClassifier(labelCol="Churn", featuresCol="features")
model = gbt.fit(train_data)

In [15]:
# Step 6: Make predictions on the test data
predictions = model.transform(test_data)

In [16]:
predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[2.0,38.0,12884.7...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[6.0,44.0,11331.5...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[8.0,43.0,14062.6...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[9.0,40.0,8066.94...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[13.0,40.0,8283.3...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[14.0,41.0,6569.8...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[15.0,38.0,10494....|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[19.0,46.0,8046.4...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[21.0,56.0,12217....|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|[23.0,41.0,1047

In [17]:
# Step 7: Evaluate the model
evaluator1 = BinaryClassificationEvaluator(labelCol="Churn")
accuracy = evaluator1.evaluate(predictions)

evaluator2 = MulticlassClassificationEvaluator(labelCol="Churn", metricName="f1")
f1_score = evaluator2.evaluate(predictions)

print("Accuracy:", accuracy, "F1 Score:", f1_score)

Accuracy: 0.9973262032085561 F1 Score: 0.9957254758004819
