In [1]:
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
from pyspark.sql.functions import col, split, expr
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
import os

from dotenv import load_dotenv
load_dotenv()

print(_find_spark_home())

C:\Users\liisu\anaconda3\envs\bigData\Lib\site-packages\pyspark


In [3]:
python_path = os.environ.get('PYTHON_PATH')
app_name_dec = os.environ.get('APP_NAME_DEC')
hadoop_path_dec = os.environ.get('HADOOP_DEC_DATASET_PATH')

In [4]:
conf = SparkConf()\
    .setMaster('local[*]')\
    .set('spark-local-dir', "C:\\spark-temp")\
    .set('spark.driver.memory', '4g')\
    .set('spark.executor.memory', '4g')\
    .set('spark.driver.maxResultSize', '2g')\
    .set('spark.pyspark.python', python_path)\
    .set('spark.pyspark.driver.python', python_path)\
    .set("spark.network.timeout","800s")\
    .set("spark.dynamicAllocation.enabled", "true")\
    .set("spark.shuffle.service.enabled", "true")\
    .set("spark.dynamicAllocation.minExecutors", "1")\
    .set("spark.dynamicAllocation.maxExecutors", "10")\
    .set("spark.dynamicAllocation.executorIdleTimeout", "60s")\

spark = SparkSession.builder.appName(app_name_dec).config(conf=conf).getOrCreate()
sc = spark.sparkContext

for item in sc.getConf().getAll(): print(item)

('spark.driver.host', 'host.docker.internal')
('spark.dynamicAllocation.minExecutors', '1')
('spark.shuffle.service.enabled', 'true')
('spark.app.submitTime', '1703861345037')
('spark.driver.memory', '4g')
('spark.executor.memory', '4g')
('spark.app.startTime', '1703861345350')
('spark.dynamicAllocation.maxExecutors', '10')
('spark.executor.id', 'driver')
('spark.pyspark.driver.python', 'C:/Users/liisu/anaconda3/envs/bigData/python.exe')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.

In [5]:
df = spark.read.csv(hadoop_path_dec, header = True, inferSchema = True)
df.show()

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|2019-12-01 01:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|2019-12-01 01:00:00|      view|  22700068|2232732091643068746|                null|  force| 102.96|577702456|de33debe-c7bf-44e...|
|2019-12-01 01:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|2019-12-01 01:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   null| 132.31|535135317|61792a26-672f-4e6...|
|2019-12-01 01:00:02|      view|  20100164|2232732110089618156|    apparel.t

In [6]:
df_index = df.select('*').withColumn('id', F.monotonically_increasing_id())
column_names = ['id'] + [col for col in df.columns]

df_index_first = df_index.select(column_names)
df_index_first.show()

+---+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
| id|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+---+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|  0|2019-12-01 01:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|  1|2019-12-01 01:00:00|      view|  22700068|2232732091643068746|                null|  force| 102.96|577702456|de33debe-c7bf-44e...|
|  2|2019-12-01 01:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|  3|2019-12-01 01:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   null| 132.31|535135317|61792a26-672f-4e6...|
|  4|2019-12-01 01:00:02|      view|  20100164|2

In [7]:
df_new = df_index_first.drop('user_session', 'category_id')
df_new.show()

+---+-------------------+----------+----------+--------------------+-------+-------+---------+
| id|         event_time|event_type|product_id|       category_code|  brand|  price|  user_id|
+---+-------------------+----------+----------+--------------------+-------+-------+---------+
|  0|2019-12-01 01:00:00|      view|   1005105|construction.tool...|  apple|1302.48|556695836|
|  1|2019-12-01 01:00:00|      view|  22700068|                null|  force| 102.96|577702456|
|  2|2019-12-01 01:00:01|      view|   2402273|appliances.person...|  bosch| 313.52|539453785|
|  3|2019-12-01 01:00:02|  purchase|  26400248|computers.periphe...|   null| 132.31|535135317|
|  4|2019-12-01 01:00:02|      view|  20100164|    apparel.trousers|   nika| 101.68|517987650|
|  5|2019-12-01 01:00:02|      view| 100008256|accessories.umbrella|   ikea| 163.56|542860793|
|  6|2019-12-01 01:00:02|      view|  21400264|  electronics.clocks|   null|  88.81|538021416|
|  7|2019-12-01 01:00:03|      view|   1005239|con

In [8]:
df_new = df_new.withColumn("split_code", split(col("category_code"), "\."))

df_new = df_new.withColumn("category", col("split_code")[0])

df_new = df_new.withColumn("product", expr("substring(category_code, length(category) + 2)"))

df_new =df_new.drop("split_code")

df_new.show(truncate=False)

+---+-------------------+----------+----------+---------------------------------+-------+-------+---------+------------+-----------------------+
|id |event_time         |event_type|product_id|category_code                    |brand  |price  |user_id  |category    |product                |
+---+-------------------+----------+----------+---------------------------------+-------+-------+---------+------------+-----------------------+
|0  |2019-12-01 01:00:00|view      |1005105   |construction.tools.light         |apple  |1302.48|556695836|construction|tools.light            |
|1  |2019-12-01 01:00:00|view      |22700068  |null                             |force  |102.96 |577702456|null        |null                   |
|2  |2019-12-01 01:00:01|view      |2402273   |appliances.personal.massager     |bosch  |313.52 |539453785|appliances  |personal.massager      |
|3  |2019-12-01 01:00:02|purchase  |26400248  |computers.peripherals.printer    |null   |132.31 |535135317|computers   |peripheral

In [9]:
df_new =df_new.drop("category_code")
df_new.show(truncate=False)

+---+-------------------+----------+----------+-------+-------+---------+------------+-----------------------+
|id |event_time         |event_type|product_id|brand  |price  |user_id  |category    |product                |
+---+-------------------+----------+----------+-------+-------+---------+------------+-----------------------+
|0  |2019-12-01 01:00:00|view      |1005105   |apple  |1302.48|556695836|construction|tools.light            |
|1  |2019-12-01 01:00:00|view      |22700068  |force  |102.96 |577702456|null        |null                   |
|2  |2019-12-01 01:00:01|view      |2402273   |bosch  |313.52 |539453785|appliances  |personal.massager      |
|3  |2019-12-01 01:00:02|purchase  |26400248  |null   |132.31 |535135317|computers   |peripherals.printer    |
|4  |2019-12-01 01:00:02|view      |20100164  |nika   |101.68 |517987650|apparel     |trousers               |
|5  |2019-12-01 01:00:02|view      |100008256 |ikea   |163.56 |542860793|accessories |umbrella               |
|

In [10]:
df_new = df_new.withColumn("date", split(col("event_time"), " ")[0])
df_new = df_new.withColumn("day", split(col("date"), "-")[2].cast(IntegerType()))
df_new = df_new.drop("event_time")

df_new.show()

+---+----------+----------+-------+-------+---------+------------+--------------------+----------+---+
| id|event_type|product_id|  brand|  price|  user_id|    category|             product|      date|day|
+---+----------+----------+-------+-------+---------+------------+--------------------+----------+---+
|  0|      view|   1005105|  apple|1302.48|556695836|construction|         tools.light|2019-12-01|  1|
|  1|      view|  22700068|  force| 102.96|577702456|        null|                null|2019-12-01|  1|
|  2|      view|   2402273|  bosch| 313.52|539453785|  appliances|   personal.massager|2019-12-01|  1|
|  3|  purchase|  26400248|   null| 132.31|535135317|   computers| peripherals.printer|2019-12-01|  1|
|  4|      view|  20100164|   nika| 101.68|517987650|     apparel|            trousers|2019-12-01|  1|
|  5|      view| 100008256|   ikea| 163.56|542860793| accessories|            umbrella|2019-12-01|  1|
|  6|      view|  21400264|   null|  88.81|538021416| electronics|       

In [11]:
df_new = df_new.drop("date")
df_new.show()

+---+----------+----------+-------+-------+---------+------------+--------------------+---+
| id|event_type|product_id|  brand|  price|  user_id|    category|             product|day|
+---+----------+----------+-------+-------+---------+------------+--------------------+---+
|  0|      view|   1005105|  apple|1302.48|556695836|construction|         tools.light|  1|
|  1|      view|  22700068|  force| 102.96|577702456|        null|                null|  1|
|  2|      view|   2402273|  bosch| 313.52|539453785|  appliances|   personal.massager|  1|
|  3|  purchase|  26400248|   null| 132.31|535135317|   computers| peripherals.printer|  1|
|  4|      view|  20100164|   nika| 101.68|517987650|     apparel|            trousers|  1|
|  5|      view| 100008256|   ikea| 163.56|542860793| accessories|            umbrella|  1|
|  6|      view|  21400264|   null|  88.81|538021416| electronics|              clocks|  1|
|  7|      view|   1005239| xiaomi| 256.38|525740700|construction|         tools

In [12]:
df_new = df_new.dropDuplicates()

In [13]:
df_new = df_new.dropna()
df_new.show()

+----+----------+----------+-------+-------+---------+------------+--------------------+---+
|  id|event_type|product_id|  brand|  price|  user_id|    category|             product|day|
+----+----------+----------+-------+-------+---------+------------+--------------------+---+
|  21|      view|   1005115|  apple|  912.5|553704027|construction|         tools.light|  1|
|  50|      view|   7203227|   lego|  25.48|560748284|   furniture|   living_room.chair|  1|
| 216|      view|   1005119|  apple| 994.88|579969045|construction|         tools.light|  1|
| 249|      view|   6400336|  intel| 376.59|568818544|     apparel|      shoes.step_ins|  1|
| 467|      view|  48600038|  bosch| 191.25|513069091| electronics|              clocks|  1|
| 968|      view|  28719425|  baden|  62.81|545223467|     apparel|               shoes|  1|
| 981|      view|  10701101|     ea|  48.37|518013061|     apparel|               scarf|  1|
|1572|      view|  28714289|   nike| 144.12|514778642|     apparel|   

In [14]:
print("Null values present in:")
for c in ["category","product","day", "brand"]:
    print(c +':', df_new.where(F.col(c).isNull()).count())

Null values present in:
category: 0
product: 0
day: 0
brand: 0


In [15]:
count = df_new.count()
print(f"Total number of rows: {count:,d}")

Total number of rows: 53,612,307


In [16]:
df_new.describe().show()

+-------+--------------------+----------+--------------------+--------+------------------+--------------------+-----------+-----------------+------------------+
|summary|                  id|event_type|          product_id|   brand|             price|             user_id|   category|          product|               day|
+-------+--------------------+----------+--------------------+--------+------------------+--------------------+-----------+-----------------+------------------+
|  count|            53612307|  53612307|            53612307|53612307|          53612307|            53612307|   53612307|         53612307|          53612307|
|   mean|2.968796283566076E11|      null|1.5950927373301918E7|     NaN|300.63984015997363| 5.483450026974219E8|       null|             null|17.271609986863652|
| stddev|1.734894542916763E11|      null|2.7910823423054628E7|     NaN|362.00239914181367|2.8581691692769635E7|       null|             null| 8.578129789960611|
|    min|                   0|    

In [17]:
save_path = hadoop_path_dec + '/selected_Data/' 

df_new.repartition(8).write.mode('overwrite').option("header", "true").csv(save_path)
spark.stop()