# Analysis of eCommerce data in a Multi-Category Store
## December

In [1]:
import os
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
import pyspark.sql.functions as F
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so that
# later `os.environ.get(...)` lookups can see them. The call's return value is
# what this cell displays.
load_dotenv()

True

In [2]:
# Report which Spark installation directory PySpark resolves in this environment.
spark_home = _find_spark_home()
print(spark_home)

C:\Users\liisu\anaconda3\envs\bigData\Lib\site-packages\pyspark


In [3]:
# Machine-specific settings come from environment variables; each name is
# None when the corresponding variable is not set.
python_path, app_name_dec, hadoop_path_dec = (
    os.environ.get(variable)
    for variable in ('PYTHON_PATH', 'APP_NAME_DEC', 'HADOOP_DEC_DATASET_PATH')
)

In [4]:
# Build the Spark configuration for a single-machine ("local") session.
conf = (
    SparkConf()
    # Run Spark locally, with as many worker threads as there are cores.
    .setMaster('local[*]')
    # Scratch space for shuffle/spill files. The property is named
    # `spark.local.dir`; the earlier key `spark-local-dir` is not a Spark
    # property name, so Spark would not pick the directory up.
    .set('spark.local.dir', "C:\\spark-temp")
    .set('spark.driver.memory', '4g')
    .set('spark.executor.memory', '4g')
    # Cap on the total size of results collected back to the driver.
    .set('spark.driver.maxResultSize', '2g')
    # Python interpreter for workers and driver, taken from PYTHON_PATH.
    .set('spark.pyspark.python', python_path)
    .set('spark.pyspark.driver.python', python_path)
    .set('spark.network.timeout', '600s')
)

# Reuse an existing session if one is already running, otherwise start one.
spark = SparkSession.builder.appName(app_name_dec).config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the effective configuration so the settings above can be checked.
for item in sc.getConf().getAll():
    print(item)

('spark.driver.host', 'host.docker.internal')
('spark.app.id', 'local-1703268736794')
('spark.network.timeout', '600s')
('spark.driver.memory', '4g')
('spark.executor.memory', '4g')
('spark.driver.port', '63853')
('spark.executor.id', 'driver')
('spark.pyspark.driver.python', 'C:/Users/liisu/anaconda3/envs/bigData/python.exe')
('spark.app.submitTime', '1703268735009')
('spark.app.startTime', '1703268735163')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --

## Check the original dataset

In [5]:
# Load the raw events CSV: the first line is treated as the header and column
# types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv(hadoop_path_dec)
df.show()

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|2019-12-01 01:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|2019-12-01 01:00:00|      view|  22700068|2232732091643068746|                null|  force| 102.96|577702456|de33debe-c7bf-44e...|
|2019-12-01 01:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|2019-12-01 01:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   null| 132.31|535135317|61792a26-672f-4e6...|
|2019-12-01 01:00:02|      view|  20100164|2232732110089618156|    apparel.t

In [8]:
# Count every row in the dataset and print the total with thousands separators.
count = df.count()
summary_line = f"Total number of rows: {count:,d}"
print(summary_line)

Total number of rows: 67,542,878


In [6]:
# Show each column's name, type and nullability; the types here are the ones
# Spark inferred when the CSV was read with inferSchema=True.
df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [9]:
# Keep the column names around for the per-column checks that follow.
cols = df.columns
print("Columns in out dataset:", cols, sep="\n")

Columns in out dataset:
['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']


In [9]:
# Count NULLs per column with ONE aggregation instead of one Spark job per
# column: `when(..., 1)` yields NULL for non-matching rows and `count` skips
# NULLs, so each aggregate is the number of NULL entries in that column.
print("Null values present in:")
null_counts = df.select(
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in cols]
).first()  # a global aggregation always produces exactly one row

for c in cols:
    print(c + ':', null_counts[c])

Null values present in:
event_time: 0
event_type: 0
product_id: 0
category_id: 0
category_code: 7088848
brand: 8115813
price: 0
user_id: 0
user_session: 21


In [10]:
# Number of distinct values per column, one small table per column.
# Not showing: event_time, price, user_session
cols_count_unique = [
    'event_type',
    'product_id',
    'category_id',
    'category_code',
    'brand',
    'user_id',
]
print("Count unique values in:")

for column_name in cols_count_unique:
    distinct_count = df.select(F.count_distinct(column_name))
    distinct_count.show()

Count unique values in:
+--------------------------+
|count(DISTINCT event_type)|
+--------------------------+
|                         3|
+--------------------------+

+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                    205230|
+--------------------------+

+---------------------------+
|count(DISTINCT category_id)|
+---------------------------+
|                       1162|
+---------------------------+

+-----------------------------+
|count(DISTINCT category_code)|
+-----------------------------+
|                          135|
+-----------------------------+

+---------------------+
|count(DISTINCT brand)|
+---------------------+
|                 4638|
+---------------------+

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                4577232|
+-----------------------+



In [10]:
# Frequency of each value in a few categorical columns. `show()` prints only
# the first 20 groups, so high-cardinality columns are shown partially.
cols_show_unique = [
    'event_type',
    'brand',
    'category_code',
]
print("Show unique values in:")

for column_name in cols_show_unique:
    value_counts = df.groupBy(column_name).count()
    value_counts.show()

Show unique values in:
+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase| 1162048|
|      view|62986067|
|      cart| 3394763|
+----------+--------+

+--------------+-----+
|         brand|count|
+--------------+-----+
|      yokohama|29306|
|       edifier| 4385|
|houseofseasons|22177|
|         crest|  493|
|          tega| 2592|
|       tuffoni|  207|
|         welss| 3565|
|       serebro| 7303|
|          tmnt|  987|
|          dvin|  107|
|       blaster|  694|
|alpinapabliser| 1391|
|      norplast|  319|
|          hipp|  391|
|      nutricia| 2783|
|         sigma| 3570|
|       bombbar| 1058|
|         sonel| 9164|
|       keenway| 2327|
|         lotos| 2722|
+--------------+-----+
only showing top 20 rows

+--------------------+-------+
|       category_code|  count|
+--------------------+-------+
|apparel.shoes.sli...| 653631|
|    computers.ebooks| 105991|
|computers.periphe...|  12109|
|construction.tool...|  18304|
|electronics.video...| 2296