Predict whether a user who views or adds a product to the cart will proceed to purchase it.

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
import pyspark.sql.functions as F
import os
from pyspark.sql.functions import col, count, isnan
from pyspark.sql.functions import when

from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment before
# any cell reads os.environ.
load_dotenv()

# Show which Spark installation PySpark resolves to.
spark_home = _find_spark_home()
print(spark_home)

C:\Users\anama\anaconda3\envs\Bigdata\Lib\site-packages\pyspark


In [8]:
# Runtime settings come from the environment (populated by load_dotenv above).
# Each lookup yields None when the variable is not set.
python_path = os.getenv('PYTHON_PATH')
app_name = os.getenv('APP_NAME')
hadoop_path_dec = os.getenv('HADOOP_DEC_DATASET_PATH')
hadoop_path_nov = os.getenv('HADOOP_NOV_DATASET_PATH')

In [9]:
# Spark settings for a single-machine session that uses every local core.
# The chain is parenthesised so no line depends on a trailing backslash.
conf = (
    SparkConf()
    .setMaster('local[*]')
    # Scratch space for shuffle and spill files. The property is spelled
    # `spark.local.dir`; the earlier `spark-local-dir` spelling is not a Spark
    # property name, so the directory was never applied.
    # NOTE(review): this is a Windows-style path - confirm it suits the host.
    .set('spark.local.dir', "C:\\spark-temp")
    .set('spark.driver.memory', '4g')
    .set('spark.executor.memory', '4g')
    .set('spark.driver.maxResultSize', '2g')
    # Use the same interpreter for the driver and the workers.
    .set('spark.pyspark.python', python_path)
    .set('spark.pyspark.driver.python', python_path)
    .set("spark.network.timeout", "800s")
    # NOTE(review): dynamic allocation and the external shuffle service are
    # presumably inert under a local[*] master - confirm before relying on them.
    .set("spark.dynamicAllocation.enabled", "true")
    .set("spark.shuffle.service.enabled", "true")
    .set("spark.dynamicAllocation.minExecutors", "1")
    .set("spark.dynamicAllocation.maxExecutors", "10")
    .set("spark.dynamicAllocation.executorIdleTimeout", "60s")
)

# `app_name` is the value read from APP_NAME in the environment cell. The
# previous `app_name_dec` is not defined anywhere in this notebook and raised
# NameError on a fresh kernel.
spark = SparkSession.builder.appName(app_name).config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the effective configuration so the settings in force are visible.
for item in sc.getConf().getAll():
    print(item)

('spark.dynamicAllocation.minExecutors', '1')
('spark.driver.memory', '4g')
('spark.dynamicAllocation.maxExecutors', '10')
('spark.driver.host', 'BOOK-G6MUSCB12M.local')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHan

In [4]:
# Read the pre-selected event data as CSV: the first row is the header and
# column types are inferred from the values.
# NOTE(review): presumably the December dataset, going by the variable name.
selected_data_path = hadoop_path_dec+'/selected_Data/'
df = spark.read.csv(selected_data_path, header=True, inferSchema=True)
df.show()

+------+----------+--------+-------+---------+------------+--------------------+---+
|    id|event_type|   brand|  price|  user_id|    category|             product|day|
+------+----------+--------+-------+---------+------------+--------------------+---+
|664791|      cart|   apple|1436.33|513035930|construction|         tools.light|  1|
|376610|      view|   midea| 252.23|512511396|  appliances|kitchen.refrigera...|  1|
|546957|      view|   apple| 532.83|560488744|construction|         tools.light|  1|
|383524|      view|    akom|   79.8|514686387|   furniture|         bedroom.bed|  1|
|276229|      view|    sony|  30.09|514479293|       sport|             bicycle|  1|
|897219|      cart|   apple|1621.41|522181460|construction|         tools.light|  1|
|390469|      view|    beko| 180.16|570967288|  appliances|      kitchen.washer|  1|
|508720|      view|      hp| 567.58|548461791|   computers|             desktop|  1|
|519760|      view|starline| 145.18|513621867|     apparel|      

In [61]:
# Binary label: 1 for rows whose event_type is "purchase", 0 for every other row.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
df = df.withColumn("is_purchased", purchase_flag)

In [62]:
# Feature pipeline stages, in the order they are applied.

# Map each category string to a numeric index. handleInvalid="keep" sends
# categories that were not present when the indexer was fitted (or are null)
# to an extra bucket. The default "error" setting would instead abort
# model.transform() if the test split contains such a category.
indexer = StringIndexer(
    inputCol="category",
    outputCol="categoryIndex",
    handleInvalid="keep",
)

# One-hot encode the category index.
encoder = OneHotEncoder(inputCols=["categoryIndex"], outputCols=["categoryVec"])

# Combine the encoded category with the price and day columns into one vector.
assembler = VectorAssembler(inputCols=["categoryVec", "price", "day"], outputCol="features")

# Rescale the assembled features before they reach the classifier.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Logistic regression on the scaled features, with is_purchased as the label.
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="is_purchased")

In [63]:
# Stages run in list order: index -> one-hot encode -> assemble -> scale -> classify.
pipeline_stages = [indexer, encoder, assembler, scaler, lr]
pipeline = Pipeline(stages=pipeline_stages)


In [64]:
# 80/20 train/test split. The fixed seed makes the split repeatable, so the
# fitted model and its evaluation do not change from one run to the next.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [65]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=train)

In [68]:
# Run the fitted pipeline over the held-out split and display the resulting
# columns without truncating their contents.
transformed_data = model.transform(dataset=test)
transformed_data.show(truncate=False)


+----+----------+--------------+-------+---------+------------+---------------------+---+------------+-------------+---------------+--------------------------------+----------------------------------------------------------------------------+----------------------------------------+-----------------------------------------+----------+
|id  |event_type|brand         |price  |user_id  |category    |product              |day|is_purchased|categoryIndex|categoryVec    |features                        |scaledFeatures                                                              |rawPrediction                           |probability                              |prediction|
+----+----------+--------------+-------+---------+------------+---------------------+---+------------+-------------+---------------+--------------------------------+----------------------------------------------------------------------------+----------------------------------------+-----------------------------------------+-

In [66]:
# Predictions on the held-out split, used by the evaluator.
predictions = model.transform(dataset=test)

In [67]:
# BinaryClassificationEvaluator reports area under the ROC curve by default,
# not accuracy. The metric is now named explicitly and the printed label
# matches what is actually measured.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_purchased",
    metricName="areaUnderROC",
)
auc_roc = evaluator.evaluate(predictions)

# Legacy name, kept so any later cell that still reads `accuracy` keeps
# working. The value is an AUC, not an accuracy.
accuracy = auc_roc

print("Area under ROC: ", auc_roc)

Accuracy:  0.6159275035417607


 Predict the likelihood of a user preferring a specific brand

In [70]:
# Number of event rows per (user_id, brand) pair, whatever the event type.
brand_interaction_count = (
    df.groupBy("user_id", "brand")
    .agg(F.count("*").alias("interaction_count"))
)
brand_interaction_count.show()

+---------+---------+-----------------+
|  user_id|    brand|interaction_count|
+---------+---------+-----------------+
|580199883|  samsung|                3|
|513104299|      jbl|                1|
|562615967|    vitek|                5|
|516418740|panasonic|                5|
|520237416|    lider|                1|
|526036450|     gree|                4|
|542361644|   xiaomi|               13|
|560140328|    meizu|               15|
|513240682|    bosch|                7|
|513575107|   dewalt|              112|
|537927007|   xiaomi|                6|
|523118037|  samsung|               14|
|531612957|       jb|                1|
|547697192|  philips|                8|
|525761373|    apple|                4|
|568020125|   birusa|               18|
|545610907|    artel|               17|
|539398143|    braun|                2|
|570380344|  samsung|                7|
|516676895| scarlett|                6|
+---------+---------+-----------------+
only showing top 20 rows



In [71]:
# Number of purchase rows per (user_id, brand) pair. Pairs with no purchase
# do not appear in this frame at all.
brand_purchase_count = (
    df.where(F.col("event_type") == "purchase")
    .groupBy("user_id", "brand")
    .agg(F.count("*").alias("purchase_count"))
)
brand_purchase_count.show()

+---------+--------+--------------+
|  user_id|   brand|purchase_count|
+---------+--------+--------------+
|526511588| samsung|             2|
|580240973|    oppo|             3|
|580108945|   apple|             2|
|544821740|  xiaomi|             2|
|560479699|  huawei|             3|
|538718612|  xiaomi|             5|
|513829359| carhome|             1|
|516228299|   apple|             2|
|573944006| samsung|            71|
|514032415|    sven|             1|
|513195085|defender|             1|
|539298497| lucente|             3|
|531505750|   apple|             1|
|559715715| brateck|             2|
|513721454| indesit|             5|
|520619833|  xiaomi|             1|
|578061599|  rainbo|             4|
|535788598|   apple|             3|
|522490218| samsung|             1|
|581213870|    acer|             3|
+---------+--------+--------------+
only showing top 20 rows



In [72]:
# Join interaction count back to the original DataFrame
df = df.join(brand_interaction_count, ["user_id", "brand"], "left")

# Join purchase count back to the original DataFrame
df = df.join(brand_purchase_count, ["user_id", "brand"], "left")

df.show()

+---------+--------+------------+----------+-------+------------+--------------------+---+------------+-----------------+--------------+
|  user_id|   brand|          id|event_type|  price|    category|             product|day|is_purchased|interaction_count|purchase_count|
+---------+--------+------------+----------+-------+------------+--------------------+---+------------+-----------------+--------------+
|512542830| samsung|171798739294|      view| 1158.0|construction|         tools.light| 12|           0|               47|          null|
|513238350|  lenovo|171799261463|      view|  252.0| electronics|     audio.headphone| 12|           0|                2|          null|
|513439286| neoline|      278641|      view| 411.83| accessories|              wallet|  1|           0|                1|          null|
|513751947|   apple|523986081652|      view| 463.07| electronics|              clocks| 28|           0|                9|          null|
|513860462| samsung|      625362|      vi

In [73]:
# (user_id, brand) pairs with no purchase came out of the left join as null;
# record them as zero purchases.
df = df.fillna(0, subset=["purchase_count"])
