In [1]:
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
import pyspark.sql.functions as F
from pyspark.sql.functions import col, count
import os

from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment
# so the os.environ lookups in later cells can find them.
load_dotenv()

# Show which Spark installation this PySpark environment resolves to.
spark_home = _find_spark_home()
print(spark_home)

C:\Users\anama\anaconda3\envs\Bigdata\Lib\site-packages\pyspark


In [2]:
# Runtime settings come from the environment (populated via load_dotenv()).
# Each lookup yields None when the variable is not defined.
python_path = os.getenv('PYTHON_PATH')
app_name_dec = os.getenv('APP_NAME_DEC')
hadoop_path_dec = os.getenv('HADOOP_DEC_DATASET_PATH')

In [3]:
# Build the Spark configuration for a single-machine session that uses all
# local cores (master 'local[*]').
conf = (
    SparkConf()
    .setMaster('local[*]')
    # Scratch directory for Spark's temporary files. The property name is
    # 'spark.local.dir'; the previous key 'spark-local-dir' is not a property
    # Spark recognises, so the directory setting never took effect.
    .set('spark.local.dir', "C:\\spark-temp")
    .set('spark.driver.memory', '4g')
    .set('spark.executor.memory', '4g')
    .set('spark.driver.maxResultSize', '2g')
    .set("spark.network.timeout", "800s")
    # NOTE(review): with a local[*] master there is no cluster manager to
    # scale executors, so the dynamic-allocation / shuffle-service settings
    # below are presumably no-ops here -- confirm before relying on them.
    .set("spark.dynamicAllocation.enabled", "true")
    .set("spark.shuffle.service.enabled", "true")
    .set("spark.dynamicAllocation.minExecutors", "1")
    .set("spark.dynamicAllocation.maxExecutors", "10")
    .set("spark.dynamicAllocation.executorIdleTimeout", "60s")
)

# Only override the Python interpreter when PYTHON_PATH is actually defined,
# so that a missing variable (None) is never handed to Spark as an
# interpreter path.
if python_path:
    conf = (
        conf
        .set('spark.pyspark.python', python_path)
        .set('spark.pyspark.driver.python', python_path)
    )

# Create (or reuse) the session and keep a handle on its SparkContext.
spark = SparkSession.builder.appName(app_name_dec).config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the effective configuration as (key, value) pairs for inspection.
for item in sc.getConf().getAll():
    print(item)

('spark.executor.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false')
('spark.dynamicAllocation.minExecutors', '1')
('spark.driver.host', 'BOOK-G6MUSCB12M')
('spark.shuffle.service.enabled', 'true')
('spark.driver.memory', '4g

In [4]:
# Read every CSV file under <dataset root>/selected_Data/ into one DataFrame.
# The first line of each file is the header; column types are inferred from
# the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(hadoop_path_dec + '/selected_Data/')
)

# Preview the first rows (20 by default, long values truncated).
df.show(n=20, truncate=True)

+------+----------+--------+-------+---------+------------+--------------------+---+
|    id|event_type|   brand|  price|  user_id|    category|             product|day|
+------+----------+--------+-------+---------+------------+--------------------+---+
|664791|      cart|   apple|1436.33|513035930|construction|         tools.light|  1|
|376610|      view|   midea| 252.23|512511396|  appliances|kitchen.refrigera...|  1|
|546957|      view|   apple| 532.83|560488744|construction|         tools.light|  1|
|383524|      view|    akom|   79.8|514686387|   furniture|         bedroom.bed|  1|
|276229|      view|    sony|  30.09|514479293|       sport|             bicycle|  1|
|897219|      cart|   apple|1621.41|522181460|construction|         tools.light|  1|
|390469|      view|    beko| 180.16|570967288|  appliances|      kitchen.washer|  1|
|508720|      view|      hp| 567.58|548461791|   computers|             desktop|  1|
|519760|      view|starline| 145.18|513621867|     apparel|      

In [6]:

# Top 10 categories by number of purchase events.

# Keep only the rows that represent a purchase.
df_sales = df.where(F.col("event_type") == "purchase")

# Count purchase rows (non-null ids) per category.
df_grouped = (
    df_sales
    .groupBy("category")
    .agg(F.count("id").alias("sales_count"))
)

# Largest counts first, then keep the first ten.
df_top10 = df_grouped.sort(F.desc("sales_count")).limit(10)

df_top10.show()

+------------+-----------+
|    category|sales_count|
+------------+-----------+
|construction|     513806|
|  appliances|     163635|
|     apparel|      91649|
| electronics|      84729|
|       sport|      67901|
|   furniture|      31466|
|   computers|      26039|
|        kids|      14424|
| accessories|       6276|
|        auto|       5824|
+------------+-----------+



In [7]:
# Bring the (at most ten) result rows to the driver and print each Row object.
top10_rows = df_top10.collect()
for row in top10_rows:
    print(row)

Row(category='construction', sales_count=513806)
Row(category='appliances', sales_count=163635)
Row(category='apparel', sales_count=91649)
Row(category='electronics', sales_count=84729)
Row(category='sport', sales_count=67901)
Row(category='furniture', sales_count=31466)
Row(category='computers', sales_count=26039)
Row(category='kids', sales_count=14424)
Row(category='accessories', sales_count=6276)
Row(category='auto', sales_count=5824)


In [8]:
# Top 10 categories by number of view events.
# This cell counts "view" rows, not purchases, so the intermediate frame and
# the result column are named for views instead of reusing the "sales" names,
# which mislabelled the output table.
df_views = df.filter(col("event_type") == "view")

# Count view rows (non-null ids) per category.
df_grouped = df_views.groupBy("category").agg(count("id").alias("view_count"))

# Largest counts first, then keep the first ten.
df_top10 = df_grouped.orderBy(col("view_count").desc()).limit(10)

df_top10.show()

+------------+-----------+
|    category|sales_count|
+------------+-----------+
|construction|   15873789|
|  appliances|   10321246|
|     apparel|    6824523|
| electronics|    6237862|
|       sport|    2995544|
|   furniture|    2399403|
|   computers|    2388011|
|        kids|    1345026|
| accessories|     728651|
|        auto|     503282|
+------------+-----------+



In [10]:
# Top 10 brands by number of purchase events.

# Keep only the rows that represent a purchase.
df_sales = df.where(F.col("event_type") == "purchase")

# Count purchase rows (non-null ids) per brand.
df_grouped = (
    df_sales
    .groupBy("brand")
    .agg(F.count("id").alias("sales_count_brand"))
)

# Largest counts first, then keep the first ten.
df_top10 = df_grouped.sort(F.desc("sales_count_brand")).limit(10)

df_top10.show()

+-------+-----------------+
|  brand|sales_count_brand|
+-------+-----------------+
|samsung|           265773|
|  apple|           209511|
| xiaomi|           101500|
| huawei|            45118|
|lucente|            21010|
|   oppo|            19843|
|   sony|            16526|
|     lg|            16191|
|  artel|             9211|
| lenovo|             8711|
+-------+-----------------+



In [11]:
# Top 10 brands by number of view events.
# This cell counts "view" rows, not purchases, so the intermediate frame and
# the result column are named for views instead of reusing the "sales" names,
# which mislabelled the output table.
df_views = df.filter(col("event_type") == "view")

# Count view rows (non-null ids) per brand.
df_grouped = df_views.groupBy("brand").agg(count("id").alias("view_count_brand"))

# Largest counts first, then keep the first ten.
df_top10 = df_grouped.orderBy(col("view_count_brand").desc()).limit(10)

df_top10.show()

+-------+-----------------+
|  brand|sales_count_brand|
+-------+-----------------+
|samsung|          7563547|
|  apple|          4880173|
| xiaomi|          4738632|
| huawei|          1736478|
|lucente|          1159695|
|     lg|           878712|
|   sony|           844867|
|   oppo|           703464|
| lenovo|           601244|
|  bosch|           533426|
+-------+-----------------+

