# Analysis of eCommerce data in Multi-Category Store:
## November

In [1]:
import os
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
print(_find_spark_home())

C:\Users\jasmi\.conda\envs\bigData\Lib\site-packages\pyspark


In [3]:
python_path = os.environ.get('PYTHON_PATH')
app_name_nov = os.environ.get('APP_NAME_NOV')
hadoop_path_nov = os.environ.get('HADOOP_NOV_DATASET_PATH')

In [4]:
conf = SparkConf()\
    .setMaster('local[*]')\
    .set('spark-local-dir', "C:\\spark-temp")\
    .set('spark.driver.memory', '4g')\
    .set('spark.executor.memory', '4g')\
    .set('spark.driver.maxResultSize', '2g')\
    .set('spark.pyspark.python', python_path)\
    .set('spark.pyspark.driver.python', python_path)\
    .set('spark.network.timeout', '600s')

spark = SparkSession.builder.appName(app_name_nov).config(conf=conf).getOrCreate()
sc = spark.sparkContext

for item in sc.getConf().getAll(): print(item)

('spark.driver.port', '58645')
('spark.network.timeout', '600s')
('spark.driver.memory', '4g')
('spark.executor.memory', '4g')
('spark.driver.host', '194.47.40.208')
('spark.pyspark.python', 'C:/Users/jasmi/.conda/envs/bigData/python.exe')
('spark.executor.id', 'driver')
('spark.app.id', 'local-1702902302640')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --a

## Check the original dataset

In [5]:
df = spark.read.csv(hadoop_path_nov, header=True, inferSchema=True)
df.show()

+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 01:00:01|      view|  17302664|2053013553853497655|                null|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 01:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 01:00:01|      view|   1004775|2053013555631882655|electronics.s

In [6]:
count = df.count()
print(f"Total number of rows: {count:,d}")

Total number of rows: 67,501,979


In [7]:
cols = df.columns
print("Columns in out dataset:")
print(cols)

Columns in out dataset:
['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']


In [8]:
print("Null values present in:")
for c in cols:
    print(c +':', df.where(F.col(c).isNull()).count())

Null values present in:
event_time: 0
event_type: 0
product_id: 0
category_id: 0
category_code: 21898171
brand: 9218235
price: 0
user_id: 0
user_session: 10


In [9]:
# Not showing: event_time, price, user_session
cols_count_unique = ['event_type', 'product_id', 'category_id', 'category_code', 'brand', 'user_id']
print("Count unique values in:")

for c in cols_count_unique:
    df.select(F.count_distinct(c)).show() 

Count unique values in:
+--------------------------+
|count(DISTINCT event_type)|
+--------------------------+
|                         3|
+--------------------------+

+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                    190662|
+--------------------------+

+---------------------------+
|count(DISTINCT category_id)|
+---------------------------+
|                        684|
+---------------------------+

+-----------------------------+
|count(DISTINCT category_code)|
+-----------------------------+
|                          129|
+-----------------------------+

+---------------------+
|count(DISTINCT brand)|
+---------------------+
|                 4201|
+---------------------+

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                3696117|
+-----------------------+



In [10]:
cols_show_unique = ['event_type', 'brand', 'category_code']
print("Show unique values in:")

for c in cols_count_unique:
    df.groupBy(c).count().show()

Show unique values in:
+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  916939|
|      view|63556110|
|      cart| 3028930|
+----------+--------+

+----------+-----+
|product_id|count|
+----------+-----+
|   1004739|79375|
|   2702332| 5386|
|  29100052| 1138|
|  16700154|   55|
|   1005158|64413|
|   3701000| 2186|
|   1004666| 4503|
|   1480743| 1807|
|   1801530|  698|
|  17200699| 2983|
|  28707967|   37|
|  15600022|  987|
|  13105408|   48|
|  28714713|   44|
|  17200615|  120|
|   9200512|  162|
|  12707478| 1237|
|  11700322|  793|
|  48300076|  621|
|  12704749|  784|
+----------+-----+
only showing top 20 rows

+-------------------+--------+
|        category_id|   count|
+-------------------+--------+
|2053013563944993659|  160932|
|2053013566209917945|  159193|
|2053013561554240247|   73429|
|2145727399348666855|    4069|
|2060237588744111062|    4038|
|2098563460336976001|    1619|
|2053013564968403895|   31380|
|2095518906859913319|   23491|