In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [6]:
#Libraries and class from StaticModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# import pickle
import time

#Script Class
from PreProcess import *
from PostProcess import *

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [7]:
spark = SparkSession.builder \
        .appName("Churn EDA PySpark") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "4g") \
        .getOrCreate()

In [9]:


schema = StructType([
    StructField("CustomerId", IntegerType(), True),
    StructField("Churn", IntegerType(), True),
    StructField("Tenure", FloatType(), True),
    StructField("PreferredLoginDevice", StringType(), True),
    StructField("CityTier", IntegerType(), True),
    StructField("WarehouseToHome", FloatType(), True),
    StructField("PreferredPaymentMode", StringType(), True),
    StructField("Gender", StringType(), True),
    StructField("HourSpendOnApp", FloatType(), True),
    StructField("NumberOfDeviceRegistered", IntegerType(), True),
    StructField("PreferedOrderCat", StringType(), True),
    StructField("SatisfactionScore", IntegerType(), True),
    StructField("MaritalStatus", StringType(), True),
    StructField("NumberOfAddress", IntegerType(), True),
    StructField("Complain", IntegerType(), True),
    StructField("OrderAmountHikeFromlastYear", FloatType(), True),
    StructField("CouponUsed", FloatType(), True),
    StructField("OrderCount", FloatType(), True),
    StructField("DaySinceLastOrder", FloatType(), True),
    StructField("CashbackAmount", FloatType(), True)
])

In [18]:
# Load the data stored in e-commerce folder format as a DataFrame.
data = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema) \
    .load("/home/jovyan/code/churn/e-commerce/e-commerce-dataset.csv")


In [19]:
data.count()

5630

In [20]:
data.select(F.countDistinct("CustomerId")).show()
categorical_cols = ['PreferredLoginDevice',
  'PreferredPaymentMode',
  'Gender',
  'PreferedOrderCat',
  'MaritalStatus']
for c in categorical_cols:
    data.select(F.countDistinct(c)).show()

+--------------------------+
|count(DISTINCT CustomerId)|
+--------------------------+
|                      5630|
+--------------------------+

+------------------------------------+
|count(DISTINCT PreferredLoginDevice)|
+------------------------------------+
|                                   3|
+------------------------------------+

+------------------------------------+
|count(DISTINCT PreferredPaymentMode)|
+------------------------------------+
|                                   7|
+------------------------------------+

+----------------------+
|count(DISTINCT Gender)|
+----------------------+
|                     2|
+----------------------+

+--------------------------------+
|count(DISTINCT PreferedOrderCat)|
+--------------------------------+
|                               6|
+--------------------------------+

+-----------------------------+
|count(DISTINCT MaritalStatus)|
+-----------------------------+
|                            3|
+-----------------------------+


AttributeError: 'DataFrame' object has no attribute 'GroupedData'

In [50]:
data.printSchema()


root
 |-- CustomerId: integer (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- Tenure: float (nullable = true)
 |-- PreferredLoginDevice: string (nullable = true)
 |-- CityTier: integer (nullable = true)
 |-- WarehouseToHome: float (nullable = true)
 |-- PreferredPaymentMode: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourSpendOnApp: float (nullable = true)
 |-- NumberOfDeviceRegistered: integer (nullable = true)
 |-- PreferedOrderCat: string (nullable = true)
 |-- SatisfactionScore: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- NumberOfAddress: integer (nullable = true)
 |-- Complain: integer (nullable = true)
 |-- OrderAmountHikeFromlastYear: float (nullable = true)
 |-- CouponUsed: float (nullable = true)
 |-- OrderCount: float (nullable = true)
 |-- DaySinceLastOrder: float (nullable = true)
 |-- CashbackAmount: float (nullable = true)



In [73]:
data.show(50)

+----------+-----+------+--------------------+--------+---------------+--------------------+------+--------------+------------------------+------------------+-----------------+-------------+---------------+--------+---------------------------+----------+----------+-----------------+--------------+
|CustomerId|Churn|Tenure|PreferredLoginDevice|CityTier|WarehouseToHome|PreferredPaymentMode|Gender|HourSpendOnApp|NumberOfDeviceRegistered|  PreferedOrderCat|SatisfactionScore|MaritalStatus|NumberOfAddress|Complain|OrderAmountHikeFromlastYear|CouponUsed|OrderCount|DaySinceLastOrder|CashbackAmount|
+----------+-----+------+--------------------+--------+---------------+--------------------+------+--------------+------------------------+------------------+-----------------+-------------+---------------+--------+---------------------------+----------+----------+-----------------+--------------+
|     50001|    1|   4.0|        Mobile Phone|       3|            6.0|          Debit Card|Female|    

In [51]:
pre_process_instance = PreProcess(data)

In [52]:
pre_process_instance.columns_types(data)

{'numeric_cols': ['CustomerId',
  'Churn',
  'Tenure',
  'CityTier',
  'WarehouseToHome',
  'HourSpendOnApp',
  'NumberOfDeviceRegistered',
  'SatisfactionScore',
  'NumberOfAddress',
  'Complain',
  'OrderAmountHikeFromlastYear',
  'CouponUsed',
  'OrderCount',
  'DaySinceLastOrder',
  'CashbackAmount'],
 'categorical_cols': ['PreferredLoginDevice',
  'PreferredPaymentMode',
  'Gender',
  'PreferedOrderCat',
  'MaritalStatus'],
 'boolean_cols': [],
 'array_cols': [],
 'struct_cols': [],
 'unknown_cols': []}

In [11]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="Churn", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.

In [12]:
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

IllegalArgumentException: features does not exist. Available: customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn

In [6]:



# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[2]
# summary only
print(treeModel)

Py4JJavaError: An error occurred while calling o32.fit.
: org.apache.spark.SparkException: Input column label does not exist.
	at org.apache.spark.ml.feature.StringIndexerBase.$anonfun$validateAndTransformSchema$2(StringIndexer.scala:128)
	at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
	at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
	at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema(StringIndexer.scala:123)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema$(StringIndexer.scala:115)
	at org.apache.spark.ml.feature.StringIndexer.validateAndTransformSchema(StringIndexer.scala:145)
	at org.apache.spark.ml.feature.StringIndexer.transformSchema(StringIndexer.scala:252)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:71)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:237)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
