Predict whether a user who views or adds a product to the cart will proceed to purchase it.

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
import pyspark.sql.functions as F
import os
from pyspark.sql.functions import col, count, isnan, avg, max, row_number
from pyspark.sql.window import Window
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import when

from dotenv import load_dotenv
load_dotenv()

print(_find_spark_home())

C:\Users\anama\anaconda3\envs\Bigdata\Lib\site-packages\pyspark


In [2]:
python_path = os.environ.get('PYTHON_PATH')
app_name = os.environ.get('APP_NAME_DEC')
hadoop_path_dec = os.environ.get('HADOOP_DEC_DATASET_PATH')
hadoop_path_nov = os.environ.get('HADOOP_NOV_DATASET_PATH')
hadoop_path = os.environ.get('HADOOP_DATASET_PATH')

In [3]:
conf = SparkConf()\
    .setMaster('local[*]')\
    .set('spark-local-dir', "C:\\spark-temp")\
    .set('spark.driver.memory', '4g')\
    .set('spark.executor.memory', '4g')\
    .set('spark.driver.maxResultSize', '2g')\
    .set('spark.pyspark.python', python_path)\
    .set('spark.pyspark.driver.python', python_path)\
    .set("spark.network.timeout","800s")\
    .set("spark.dynamicAllocation.enabled", "true")\
    .set("spark.shuffle.service.enabled", "true")\
    .set("spark.dynamicAllocation.minExecutors", "1")\
    .set("spark.dynamicAllocation.maxExecutors", "10")\
    .set("spark.dynamicAllocation.executorIdleTimeout", "60s")\

spark = SparkSession.builder.appName(app_name).config(conf=conf).getOrCreate()
sc = spark.sparkContext

for item in sc.getConf().getAll(): print(item)

('spark.executor.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false')
('spark.dynamicAllocation.minExecutors', '1')
('spark.shuffle.service.enabled', 'true')
('spark.driver.memory', '4g')
('spark.executor.memory', '4g')
('spar

In [None]:
df_dec = spark.read.csv(hadoop_path_dec+'/selected_Data/', header = True, inferSchema = True)
df_dec.show()

In [None]:
df_nov = spark.read.csv(hadoop_path_nov + '/selected_Data_Nov/', header = True, inferSchema = True)
df_nov.show()

In [None]:
df_combined = df_nov.union(df_dec)
df_combined.show()

In [None]:
save_path = hadoop_path + '/combined_Data/' 

df_combined.repartition(8).write.mode('overwrite').option("header", "true").csv(save_path)


In [4]:
df = spark.read.csv(hadoop_path + '/combined_Data/', header = True, inferSchema = True)
df.show()

+------------+----------+----------+--------+-------+---------+------------+------------------+---+
|          id|event_type|product_id|   brand|  price|  user_id|    category|           product|day|
+------------+----------+----------+--------+-------+---------+------------+------------------+---+
|257698236188|      view|   3600287| indesit| 205.15|546669693|  appliances|    kitchen.washer| 15|
|240518268935|      view|   4803585|  huawei| 115.55|514179359| electronics|   audio.headphone| 15|
|549756563931|      view|   4600542|    beko| 357.72|570790007|  appliances|kitchen.dishwasher| 29|
|412317698870|      view|  12301593|    zubr|  21.56|516315517|construction|       tools.drill| 19|
|257698481187|      view|   1005135|   apple|1644.08|571381066| electronics|        smartphone| 15|
| 94489540344|      view|   1002100| samsung| 370.64|560874417| electronics|        smartphone|  7|
|      629235|      view|  17000006|      sv| 157.79|534873586|   computers|           desktop|  1|


In [5]:
df = df.withColumn("is_purchased", when(df["event_type"] == "purchase", 1).otherwise(0))

In [6]:
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
encoder = OneHotEncoder(inputCols=["categoryIndex"], outputCols=["categoryVec"])
assembler = VectorAssembler(inputCols=["categoryVec", "price", "day"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="is_purchased")

In [7]:
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler, lr])

In [8]:
(train, test) = df.randomSplit([0.8, 0.2])

In [9]:
model = pipeline.fit(train)

In [10]:
transformed_data = model.transform(test)

transformed_data.show(truncate=False)


+----+----------+----------+---------+-------+---------+------------+----------------------+---+------------+-------------+--------------+--------------------------------+---------------------------------------------------------------------------+----------------------------------------+-----------------------------------------+----------+
|id  |event_type|product_id|brand    |price  |user_id  |category    |product               |day|is_purchased|categoryIndex|categoryVec   |features                        |scaledFeatures                                                             |rawPrediction                           |probability                              |prediction|
+----+----------+----------+---------+-------+---------+------------+----------------------+---+------------+-------------+--------------+--------------------------------+---------------------------------------------------------------------------+----------------------------------------+----------------------------

In [11]:
predictions = model.transform(test)

In [12]:
evaluator = BinaryClassificationEvaluator(labelCol="is_purchased")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: ", accuracy)

Accuracy:  0.6088109561002291


 Predict the likelihood of a user preferring a specific brand

In [5]:
brand_interaction_count = df.groupBy("user_id", "brand").count().withColumnRenamed("count", "interaction_count")
brand_interaction_count.show()

+---------+-------+-----------------+
|  user_id|  brand|interaction_count|
+---------+-------+-----------------+
|559155330|samsung|               45|
|547386228|samsung|                4|
|514936996| xiaomi|               20|
|516073762|samsung|               94|
|516791031| xiaomi|               47|
|571948641|   acer|                4|
|529471190|  yasin|                4|
|546784852|   oppo|                8|
|536583301|    bts|                5|
|563628907|samsung|               22|
|568791517|   alis|                7|
|541186183|karcher|               26|
|535122525|samsung|                1|
|575936496| armani|                4|
|571734181|respect|               18|
|549049574|samsung|               79|
|566806442|  apple|                4|
|512943410|  hansa|               28|
|537601990|respect|               44|
|512403653|samsung|                4|
+---------+-------+-----------------+
only showing top 20 rows



In [6]:
brand_purchase_count = df.filter(col("event_type") == "purchase").groupBy("user_id", "brand").count().withColumnRenamed("count", "purchase_count")
brand_purchase_count.show()

+---------+---------+--------------+
|  user_id|    brand|purchase_count|
+---------+---------+--------------+
|575936496|   armani|             1|
|556849764|   xiaomi|             6|
|514784102|  neoline|             1|
|514550893|    apple|             4|
|547985627|  samsung|             5|
|568953750|  samsung|             1|
|553678747|  redmond|             1|
|555187597|    apple|             1|
|571692512|  samsung|             1|
|565961604|  samsung|            15|
|512445573|   xiaomi|             1|
|552207644|    apple|            22|
|556703732|   huawei|             3|
|571135032|  samsung|             7|
|513387648|  samsung|             5|
|512791089|prestigio|             1|
|513031081|    apple|            13|
|558952612|     acer|             2|
|516462649|    braun|             1|
|563885719|   xiaomi|             9|
+---------+---------+--------------+
only showing top 20 rows



In [7]:
# Join interaction count back to the original DataFrame
df = df.join(brand_interaction_count, ["user_id", "brand"], "left")

# Join purchase count back to the original DataFrame
df = df.join(brand_purchase_count, ["user_id", "brand"], "left")

df.show()

+---------+--------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+
|  user_id|   brand|          id|event_type|product_id|  price|    category|             product|day|interaction_count|purchase_count|
+---------+--------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+
|512424032|  huawei|223338761477|      cart|   1004708| 154.18| electronics|          smartphone| 15|               57|          null|
|512518406|logitech|377958041462|      view|  11600352| 685.97|   computers|             desktop| 17|                5|          null|
|512596849|   meizu|412317751484|      view|   1005278| 205.67| electronics|          smartphone| 19|                7|          null|
|512609289|   apple| 42950045881|      view|   1003316|1027.05|construction|         tools.light|  4|              119|             3|
|512831993|   simax|309237818886|      view|  16400233|

In [8]:
df = df.na.fill({'purchase_count': 0})

In [9]:
avg_price_interaction = df.groupBy("user_id", "brand").agg(avg("price").alias("avg_price_per_brand"))
df = df.join(avg_price_interaction, ["user_id", "brand"], "left")
df.show()

+---------+----------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+-------------------+
|  user_id|     brand|          id|event_type|product_id|  price|    category|             product|day|interaction_count|purchase_count|avg_price_per_brand|
+---------+----------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+-------------------+
|512596849|     meizu|412317751484|      view|   1005278| 205.67| electronics|          smartphone| 19|                7|             0| 227.28285714285715|
|512998624|  kaemingk|369367870544|      view|  10600284| 296.53| electronics|              clocks| 21|                1|             0|             296.53|
|514639724|     apple|352188041912|      view|   1004249| 766.82|construction|         tools.light| 20|                5|             0|  916.9320000000001|
|516428845|        lg|515396954940|      view|   3700127| 

In [10]:
recency_interaction = df.groupBy("user_id", "brand").agg(max("day").alias("last_interaction_day"))
df = df.join(recency_interaction, ["user_id", "brand"], "left")
df.show()

+---------+----------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+-------------------+--------------------+
|  user_id|     brand|          id|event_type|product_id|  price|    category|             product|day|interaction_count|purchase_count|avg_price_per_brand|last_interaction_day|
+---------+----------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+-------------------+--------------------+
|512596849|     meizu|412317751484|      view|   1005278| 205.67| electronics|          smartphone| 19|                7|             0| 227.28285714285715|                  19|
|512998624|  kaemingk|369367870544|      view|  10600284| 296.53| electronics|              clocks| 21|                1|             0|             296.53|                  21|
|514639724|     apple|352188041912|      view|   1004249| 766.82|construction|         tools.light| 20|       

In [11]:
df = df.na.fill({
    'purchase_count': 0,
    'interaction_count': 0,
    'avg_price_per_brand': 0,
    'last_interaction_day': 0  
})
df.show()

+---------+----------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+-------------------+--------------------+
|  user_id|     brand|          id|event_type|product_id|  price|    category|             product|day|interaction_count|purchase_count|avg_price_per_brand|last_interaction_day|
+---------+----------+------------+----------+----------+-------+------------+--------------------+---+-----------------+--------------+-------------------+--------------------+
|512596849|     meizu|412317751484|      view|   1005278| 205.67| electronics|          smartphone| 19|                7|             0| 227.28285714285715|                  19|
|512998624|  kaemingk|369367870544|      view|  10600284| 296.53| electronics|              clocks| 21|                1|             0|             296.53|                  21|
|514639724|     apple|352188041912|      view|   1004249| 766.82|construction|         tools.light| 20|       

In [15]:
# Encode categorical variables
brand_indexer = StringIndexer(inputCol="brand", outputCol="brandIndex")
encoder = OneHotEncoder(inputCols=["brandIndex"], outputCols=["brandVec"])
df = encoder.fit(df).transform(df)
df.show()

+---------+---------+------------+----------+----------+------+------------+------------------+---+-----------------+--------------+-------------------+--------------------+----------+------------------+
|  user_id|    brand|          id|event_type|product_id| price|    category|           product|day|interaction_count|purchase_count|avg_price_per_brand|last_interaction_day|brandIndex|          brandVec|
+---------+---------+------------+----------+----------+------+------------+------------------+---+-----------------+--------------+-------------------+--------------------+----------+------------------+
| 86517859|    tefal| 77310130237|      view|  15900313|  8.98|construction|   tools.generator|  6|                1|             0|               8.98|                   6|      20.0| (4289,[20],[1.0])|
|170313967|moldabela|326417765410|      view|  28100834| 89.84|       sport|           bicycle| 19|                3|             0| 112.48666666666666|                  19|     702.0|

In [17]:
save_path = hadoop_path_dec + '/SpecificBrand_Data/' 

df.repartition(8).write.mode('overwrite').option("header", "true").parquet(save_path)


In [18]:
# Assemble all features
assembler = VectorAssembler(
    inputCols=[
        "interaction_count",
        "purchase_count",
        "avg_price_per_brand",
        "brandVec",
        "last_interaction_day",  # Include the day as a feature
        # Add other features here as you create them
    ],
    outputCol="features"
)
df = assembler.transform(df)

In [19]:
windowSpec = Window.partitionBy("user_id").orderBy(col("purchase_count").desc())

# Add a row number for each user based on purchase count (highest purchase count gets row_number = 1)
df = df.withColumn("preference_rank", row_number().over(windowSpec))

# Define the target variable as 1 for the most preferred brand, 0 otherwise
df = df.withColumn("label", when(col("preference_rank") == 1, 1).otherwise(0))

In [None]:
df.show()

In [20]:
train, test = df.randomSplit([0.8, 0.2], seed=42)


In [21]:

rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=10)

model = rf.fit(train)


In [25]:
predictions = model.transform(test)

In [26]:
# Evaluate model using Area Under ROC
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)
print("Area Under ROC: ", roc_auc)

Area Under ROC:  0.5
