In [3]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new
# session registered under the application name 'DT'.
spark = (
    SparkSession.builder
    .appName('DT')
    .getOrCreate()
)

In [6]:
# Load the customer CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../Data/Ecommerce_Customers.csv")
)

# Preview the first ten rows as a pandas DataFrame for rich display.
df.limit(10).toPandas()

Unnamed: 0,Email,Address,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank TunnelWrightmouth, MI 82180-9605",34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer CommonDiazchester, CA 06566-8576",31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582Cobbborough, DC ...",33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David ThroughwayPort Jason, OH 22070-1220",34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez PassagePort Jacobville, PR 372...",33.330673,12.795189,37.536653,4.446308,599.406092
5,alvareznancy@lucas.biz,"645 Martha Park Apt. 611Jeffreychester, MN 672...",33.871038,12.026925,34.476878,5.493507,637.102448
6,katherine20@yahoo.com,"68388 Reyes Lights Suite 692Josephbury, WV 922...",32.021595,11.366348,36.683776,4.685017,521.572175
7,awatkins@yahoo.com,Unit 6538 Box 8980DPO AP 09026-4941,32.739143,12.351959,37.373359,4.434273,549.904146
8,vchurch@walter-martinez.com,"860 Lee KeyWest Debra, SD 97450-0495",33.987773,13.386235,37.534497,3.273434,570.200409
9,bonnie69@lin.biz,"PSC 2734, Box 5255APO AA 98456-7482",31.936549,11.814128,37.145168,3.202806,427.199385


In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Pack the four numeric behaviour columns into one vector column, which is
# the input format the regressor's featuresCol expects.
vectorassembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="Independent Features",
)

In [9]:
# Apply the assembler: returns df with the extra "Independent Features"
# vector column appended; the original columns are kept.
vector_df = vectorassembler.transform(df)

In [13]:
# Preview the first ten rows to confirm the assembled feature vector column.
vector_df.limit(10).toPandas()

Unnamed: 0,Email,Address,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent,Independent Features
0,mstephenson@fernandez.com,"835 Frank TunnelWrightmouth, MI 82180-9605",34.497268,12.655651,39.577668,4.082621,587.951054,"[34.49726773, 12.65565115, 39.57766802, 4.0826..."
1,hduke@hotmail.com,"4547 Archer CommonDiazchester, CA 06566-8576",31.926272,11.109461,37.268959,2.664034,392.204933,"[31.92627203, 11.10946073, 37.26895887, 2.6640..."
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582Cobbborough, DC ...",33.000915,11.330278,37.110597,4.104543,487.547505,"[33.00091476, 11.33027806, 37.11059744, 4.1045..."
3,riverarebecca@gmail.com,"1414 David ThroughwayPort Jason, OH 22070-1220",34.305557,13.717514,36.721283,3.120179,581.852344,"[34.30555663, 13.71751367, 36.72128268, 3.1201..."
4,mstephens@davidson-herman.com,"14023 Rodriguez PassagePort Jacobville, PR 372...",33.330673,12.795189,37.536653,4.446308,599.406092,"[33.33067252, 12.79518855, 37.5366533, 4.44630..."
5,alvareznancy@lucas.biz,"645 Martha Park Apt. 611Jeffreychester, MN 672...",33.871038,12.026925,34.476878,5.493507,637.102448,"[33.87103788, 12.02692534, 34.47687763, 5.4935..."
6,katherine20@yahoo.com,"68388 Reyes Lights Suite 692Josephbury, WV 922...",32.021595,11.366348,36.683776,4.685017,521.572175,"[32.0215955, 11.36634831, 36.68377615, 4.68501..."
7,awatkins@yahoo.com,Unit 6538 Box 8980DPO AP 09026-4941,32.739143,12.351959,37.373359,4.434273,549.904146,"[32.73914294, 12.35195897, 37.37335886, 4.4342..."
8,vchurch@walter-martinez.com,"860 Lee KeyWest Debra, SD 97450-0495",33.987773,13.386235,37.534497,3.273434,570.200409,"[33.9877729, 13.38623528, 37.53449734, 3.27343..."
9,bonnie69@lin.biz,"PSC 2734, Box 5255APO AA 98456-7482",31.936549,11.814128,37.145168,3.202806,427.199385,"[31.93654862, 11.81412829, 37.14516822, 3.2028..."


In [15]:
# Decision-tree regressor configuration: learn 'Yearly Amount Spent' from the
# assembled feature vector. Nothing is trained until .fit() is called.
dr = DecisionTreeRegressor(
    featuresCol='Independent Features',
    labelCol='Yearly Amount Spent',
)

In [19]:
# Hold out roughly 25% of the rows for evaluation. randomSplit samples rows
# independently, so the split sizes are only approximately 75/25.
# A fixed seed keeps the partition -- and therefore the fitted tree and the
# reported RMSE -- repeatable across "Restart & Run All" for the same input.
train, test = vector_df.randomSplit([0.75, 0.25], seed=42)

In [20]:
# Check how many rows landed in each split (train, test).
train.count(), test.count()

(392, 108)

In [22]:
# Peek at the first five training rows as a pandas DataFrame.
train.limit(5).toPandas()

Unnamed: 0,Email,Address,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent,Independent Features
0,aaron04@yahoo.com,"16338 Scott Corner Suite 727West Alexandra, AR...",33.705113,10.163179,37.763041,4.778974,521.24078,"[33.7051128, 10.16317906, 37.76304108, 4.77897..."
1,aaron22@gmail.com,"38678 Sean Drive Suite 293Karentown, IA 78306-...",33.452295,12.005916,36.534096,4.712234,576.477607,"[33.45229528, 12.00591637, 36.53409567, 4.7122..."
2,aaron89@gmail.com,"0128 Sampson Loop Suite 943Hoffmanton, MO 02122",31.447446,10.101632,38.043453,4.238296,418.602742,"[31.44744649, 10.1016322, 38.04345265, 4.23829..."
3,acampbell@sanchez-velasquez.info,"5791 Jessica CoveMckinneyborough, OK 64460-7536",32.425697,11.448902,37.58019,2.586968,420.737673,"[32.42569728, 11.44890154, 37.58019043, 2.5869..."
4,acontreras@hotmail.com,"88995 Edwards Row Suite 456North Jo, DE 02062-...",33.547748,10.735363,37.458375,3.863425,476.191413,"[33.54774794, 10.73536292, 37.45837473, 3.8634..."


In [23]:
# Fit the decision-tree regressor on the training split only.
dr_model = dr.fit(train)

In [25]:
# Score the held-out test split; the result is the test frame plus a
# 'prediction' column produced by the fitted model.
dr_predictions = dr_model.transform(test)

In [28]:
# List the scored frame's columns to confirm 'prediction' was added.
dr_predictions.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Features',
 'prediction']

In [29]:
# Compare actual spend against the model's prediction for the first ten
# test rows, alongside the feature vector that produced each prediction.
(
    dr_predictions
    .select("Independent Features", "Yearly Amount Spent", "prediction")
    .show(10)
)

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[32.44952156,13.4...|        503.9783791|483.46935307105264|
|[32.84879283,10.9...|        404.8245289| 435.4501710944445|
|[33.87103788,12.0...|        637.1024479|       744.2218671|
|[33.9252966,11.58...|         483.673308| 435.4501710944445|
|[32.13386241,11.6...|        443.4418601| 435.4501710944445|
|[33.78015676,11.9...|        518.7864831| 528.8476896161291|
|[32.4914466,12.53...|        449.0703194|439.35942210666667|
|[33.56647439,12.2...|        466.4211988| 494.3016591357144|
|[33.61601855,13.5...|        611.0000251|  599.443741071429|
|[33.81173341,11.1...|        535.3216101| 568.0802217055555|
+--------------------+-------------------+------------------+
only showing top 10 rows



In [30]:
# Measure test-set error as root mean squared error (RMSE), expressed in the
# same units as the 'Yearly Amount Spent' label.
dr_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Yearly Amount Spent",
    predictionCol="prediction",
)
rmse = dr_evaluator.evaluate(dr_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 40.4622
