### Read the data file

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName('churn_log_reg')
    .getOrCreate()
)

# Load the customer data; the first row holds column names and the column
# types are inferred from the file contents.
df = spark.read.csv(path='customer_churn.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [2]:
# Show the column names of the loaded DataFrame as a plain Python list.
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

### Summary Statistics

In [5]:
# Summary statistics (count, mean, stddev, min, max) pulled into pandas and
# flipped so that each DataFrame column becomes one row of the table.
df.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Names,900,,,Aaron King,Zachary Walsh
Age,900,41.81666666666667,6.127560416916251,22.0,65.0
Total_Purchase,900,10062.82403333334,2408.644531858096,100.0,18026.01
Account_Manager,900,0.4811111111111111,0.4999208935073339,0,1
Years,900,5.27315555555555,1.274449013194616,1.0,9.15
Num_Sites,900,8.587777777777777,1.7648355920350969,3.0,14.0
Location,900,,,"00103 Jeffrey Crest Apt. 205 Padillaville, IA ...",Unit 9800 Box 2878 DPO AA 75157
Company,900,,,Abbott-Thompson,"Zuniga, Clark and Shaffer"
Churn,900,0.16666666666666666,0.3728852122772358,0,1


### Build the Pipeline

In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# Columns fed to the model as predictors.
feature_cols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites']

# Stage 1: pack the predictor columns into a single 'features' vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Stage 2: logistic regression on that vector, with 'Churn' as the label.
log_reg = LogisticRegression(featuresCol='features', labelCol='Churn', maxIter=10)

# Chain both stages so they are fitted and applied together.
pipeline = Pipeline(stages=[assembler, log_reg])

In [12]:
# Hold out roughly 30% of the rows for evaluation. The seed pins the random
# split so that Restart & Run All reproduces the same train/test rows and,
# therefore, the same fitted model and metrics.
train, test = df.randomSplit([0.7, 0.3], seed=42)

# Fit the assembler + logistic-regression pipeline on the training rows only,
# then score the held-out rows.
lrModel = pipeline.fit(train)
predictions = lrModel.transform(test)

In [13]:
# Inspect the columns of the scored test set, including those added by the
# fitted pipeline.
predictions.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### Make Predictions

In [16]:
# Compare the true label with the model output for the first few test rows.
prediction_cols = ['Churn', 'prediction', 'probability', 'rawPrediction']
predictions.select(*prediction_cols).show(n=4)

+-----+----------+--------------------+--------------------+
|Churn|prediction|         probability|       rawPrediction|
+-----+----------+--------------------+--------------------+
|    0|       0.0|[0.92746093189776...|[2.54832538455355...|
|    0|       0.0|[0.99571902458018...|[5.44928422849747...|
|    0|       0.0|[0.87982491526622...|[1.99077320888602...|
|    0|       0.0|[0.97001076460616...|[3.47646867190283...|
+-----+----------+--------------------+--------------------+
only showing top 4 rows



### Performance Evaluation

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Named `evaluator` rather than `eval` so the Python built-in eval() is not
# shadowed for the rest of the kernel session.
evaluator = BinaryClassificationEvaluator(
    labelCol='Churn',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',  # the evaluator's default metric, spelled out
)

# Area under the ROC curve on the held-out test predictions.
evaluator.evaluate(predictions)

0.8076314401266057

### Predict on the new data

In [21]:
# Load the unlabelled customers to score; read with the same CSV options as
# the training file (header row present, column types inferred).
new_data = spark.read.csv(path='new_customers.csv', header=True, inferSchema=True)
new_data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [22]:
# Refit the same pipeline on the full labelled dataset (`df`), not just the
# training split, before scoring the new customers.
lrModel_new = pipeline.fit(df)

In [23]:
# Score the new customers with the pipeline fitted on the full dataset.
predictions_new = lrModel_new.transform(new_data)

In [26]:
# Inspect the columns of the scored new-customer data.
predictions_new.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [28]:
# Show the predicted class for each new customer alongside name and company.
predictions_new.select('Names', 'Company', 'prediction').show()

+--------------+----------------+----------+
|         Names|         Company|prediction|
+--------------+----------------+----------+
| Andrew Mccall|        King Ltd|       0.0|
|Michele Wright|   Cannon-Benson|       1.0|
|  Jeremy Chang|Barron-Robertson|       1.0|
|Megan Ferguson|   Sexton-Golden|       1.0|
|  Taylor Young|        Wood LLC|       0.0|
| Jessica Drake|   Parks-Robbins|       1.0|
+--------------+----------------+----------+

