In [25]:
from pyspark.sql import SparkSession, functions
# Create the Spark session named 'HW5', or reuse one that already exists;
# every later cell reads from `spark`.
spark = (
    SparkSession.builder
    .appName('HW5')
    .getOrCreate()
)

In [31]:
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [2]:
import sys

# Print the interpreter version string of the running kernel so the
# environment used for this notebook is recorded next to the results.
print(sys.version)

3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


In [5]:
%%time
df = spark.read.csv('eCommerce.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Unnamed: 0: integer (nullable = true)
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

CPU times: user 4.36 ms, sys: 3.2 ms, total: 7.56 ms
Wall time: 17 s


In [8]:
%%time
# Step 2: Prepare data for classification.

# Create a new column which just uses first word before ‘.’ in category_code.
split_col = functions.split(df['category_code'], '\.')
df_new = df.withColumn('category', split_col.getItem(0))

CPU times: user 1.4 ms, sys: 513 µs, total: 1.91 ms
Wall time: 191 ms


In [17]:
%%time
# Step 2: Prepare data for classification.

# Use category_code, brand, and price as input variables.
# Use event type as target variable.
dat = df_new[['category', 'brand', 'price', 'event_type']]

CPU times: user 4.63 ms, sys: 2.97 ms, total: 7.61 ms
Wall time: 58.4 ms


In [18]:
# Preview the first 20 rows of the modelling frame (long cell values truncated).
dat.show(n=20, truncate=True)

+-----------+---------+-------+----------+
|   category|    brand|  price|event_type|
+-----------+---------+-------+----------+
|electronics|   huawei| 270.42|      view|
|electronics|    yasin| 359.08|      view|
| appliances|  almacom| 180.16|      view|
|    apparel|  respect|  44.79|      view|
|electronics|  samsung| 150.95|      view|
|electronics|   xiaomi|  98.51|      cart|
|electronics|    apple| 180.13|      view|
|electronics|    casio|   27.0|      view|
|electronics|  samsung| 385.85|      view|
|electronics|  samsung| 396.15|      view|
|  computers|     asus| 357.25|      view|
|  computers|     acer| 437.57|      view|
|electronics|    artel| 231.64|      view|
|       kids|   chicco| 141.55|      view|
|  furniture|      brw| 387.14|      view|
| appliances|    artel|  93.62|      view|
|electronics|  samsung| 130.99|      view|
|electronics|     oris|1891.94|      view|
|electronics|changhong| 359.37|      view|
| appliances|  gorenje| 373.21|      view|
+----------

In [13]:
%%time

# Step 2: Prepare data for classification.

# Change categorical variables to numerical variables if necessary by using one-hot encoding. 
categoricalColumns = ['category', 'brand']
stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol = 'event_type', outputCol = 'label')
stages += [label_stringIdx]

numericCols = ['price'] # Numerical Features
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]


CPU times: user 6.61 ms, sys: 5.99 ms, total: 12.6 ms
Wall time: 66 ms


In [19]:
%%time
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(dat)
dat_pipe = pipelineModel.transform(dat)
selectedCols = ['label', 'features'] + dat.columns
dat_select = dat_pipe.select(selectedCols)
dat_select.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- event_type: string (nullable = true)

CPU times: user 73.5 ms, sys: 21.4 ms, total: 94.9 ms
Wall time: 42.3 s


In [34]:
%%time
train, test = dat_select.randomSplit([0.7, 0.3], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 6137234
Test Dataset Count: 2627771
CPU times: user 20.4 ms, sys: 22.1 ms, total: 42.5 ms
Wall time: 2min 14s


In [35]:
%%time

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

CPU times: user 32.4 ms, sys: 23.6 ms, total: 56 ms
Wall time: 1min 57s


In [37]:
# Score the test split with the fitted logistic regression and preview the
# first 10 rows of inputs, label, and model outputs.
predictions = lrModel.transform(test)
(
    predictions
    .select('category', 'brand', 'price', 'label', 'rawPrediction', 'prediction', 'probability')
    .show(10)
)

+-----------+-------+-----+-----+--------------------+----------+--------------------+
|   category|  brand|price|label|       rawPrediction|prediction|         probability|
+-----------+-------+-----+-----+--------------------+----------+--------------------+
|electronics|samsung| 5.64|  0.0|[1.81796790662471...|       0.0|[0.87965783234372...|
|electronics|samsung| 5.64|  0.0|[1.81796790662471...|       0.0|[0.87965783234372...|
|electronics|samsung| 6.16|  0.0|[1.81824177641175...|       0.0|[0.87970283194473...|
|electronics|samsung| 6.22|  0.0|[1.81827337677180...|       0.0|[0.87970802325653...|
|electronics|samsung| 6.67|  0.0|[1.81851037947213...|       0.0|[0.87974695183188...|
|electronics|samsung| 6.67|  0.0|[1.81851037947213...|       0.0|[0.87974695183188...|
|electronics|samsung| 6.67|  0.0|[1.81851037947213...|       0.0|[0.87974695183188...|
|electronics|samsung| 6.67|  0.0|[1.81851037947213...|       0.0|[0.87974695183188...|
|electronics|samsung| 6.67|  0.0|[1.8185103

In [44]:
%%time
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.6959880962948346
CPU times: user 12.4 ms, sys: 10.1 ms, total: 22.5 ms
Wall time: 56.3 s


In [None]:
%%time
# Step 3: Use Logistic Regression to perform classification on event type (view, cart, purchase).

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

predictions = lrModel.transform(test)
predictions.select('category', 'brand', 'price', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))

In [45]:
%%time
# Step 3: Use decision tree to perform classification on event type (view, cart, purchase).

from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)
predictions_dt = dtModel.transform(test)
predictions_dt.select('category', 'brand', 'price', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

evaluator_dt = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "areaUnderROC"})))

+-----------+-------+-----+-----+--------------------+----------+--------------------+
|   category|  brand|price|label|       rawPrediction|prediction|         probability|
+-----------+-------+-----+-----+--------------------+----------+--------------------+
|electronics|samsung| 5.64|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 5.64|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.16|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.22|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.67|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.67|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.67|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.67|  0.0|[5823074.0,186861...|       0.0|[0.94881081607773...|
|electronics|samsung| 6.67|  0.0|[5823074.0

In [46]:
%%time
# Step 3: Use random forest to perform classification on event type (view, cart, purchase). 

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions_rf = rfModel.transform(test)
predictions_rf.select('category', 'brand', 'price', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

evaluator_rf = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator_rf.evaluate(predictions, {evaluator_rf.metricName: "areaUnderROC"})))

+-----------+-------+-----+-----+--------------------+----------+--------------------+
|   category|  brand|price|label|       rawPrediction|prediction|         probability|
+-----------+-------+-----+-----+--------------------+----------+--------------------+
|electronics|samsung| 5.64|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 5.64|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.16|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.22|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.67|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.67|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.67|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.67|  0.0|[18.9588981235038...|       0.0|[0.94794490617519...|
|electronics|samsung| 6.67|  0.0|[18.958898