In [1]:
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
import pyspark.sql.functions as F
import os

from dotenv import load_dotenv
# Load settings from a .env file into the process environment so the
# os.environ lookups in the following cell can find them.
load_dotenv()

# Print the Spark installation directory that PySpark resolves to.
# NOTE(review): _find_spark_home is a private (underscore-prefixed) helper.
print(_find_spark_home())

C:\Users\jasmi\.conda\envs\bigData\Lib\site-packages\pyspark


In [2]:
# Runtime settings come from the environment; each value is None when the
# corresponding variable is not set.
python_path = os.getenv('PYTHON_PATH')                  # interpreter passed to Spark
app_name_nov = os.getenv('APP_NAME_NOV')                # Spark application name
hadoop_path_nov = os.getenv('HADOOP_NOV_DATASET_PATH')  # location of the input CSV

In [3]:
# Spark configuration for a session that runs on this machine and uses all
# local cores ('local[*]').
conf = (
    SparkConf()
    .setMaster('local[*]')
    # Scratch directory for shuffle and spill files. The Spark property is
    # named 'spark.local.dir'; the previous 'spark-local-dir' key is not a
    # Spark property, so Spark did not treat it as the scratch directory.
    .set('spark.local.dir', "C:\\spark-temp")
    .set('spark.driver.memory', '4g')
    .set('spark.executor.memory', '4g')
    .set('spark.driver.maxResultSize', '2g')
    # Python interpreter to use, taken from the PYTHON_PATH environment variable.
    .set('spark.pyspark.python', python_path)
    .set('spark.pyspark.driver.python', python_path)
    .set("spark.network.timeout", "800s")
    # NOTE(review): with a 'local[*]' master there are no separate executors,
    # so spark.executor.memory and the dynamic-allocation / shuffle-service
    # settings below probably have no effect -- confirm and remove if so.
    .set("spark.dynamicAllocation.enabled", "true")
    .set("spark.shuffle.service.enabled", "true")
    .set("spark.dynamicAllocation.minExecutors", "1")
    .set("spark.dynamicAllocation.maxExecutors", "10")
    .set("spark.dynamicAllocation.executorIdleTimeout", "60s")
)

# Create the session (or reuse one that already exists) and keep a handle
# on its SparkContext.
spark = SparkSession.builder.appName(app_name_nov).config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the effective configuration so the settings in use are visible.
for item in sc.getConf().getAll():
    print(item)

('spark.dynamicAllocation.minExecutors', '1')
('spark.app.id', 'local-1702903859227')
('spark.shuffle.service.enabled', 'true')
('spark.driver.memory', '4g')
('spark.executor.memory', '4g')
('spark.driver.host', '194.47.40.208')
('spark.dynamicAllocation.maxExecutors', '10')
('spark.pyspark.python', 'C:/Users/jasmi/.conda/envs/bigData/python.exe')
('spark.executor.id', 'driver')
('spark.app.startTime', '1702903857782')
('spark.driver.port', '61786')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add

In [4]:
# Read the CSV at hadoop_path_nov: the first line supplies the column names
# and column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(hadoop_path_nov)
df.show()

+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 01:00:01|      view|  17302664|2053013553853497655|                null|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 01:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 01:00:01|      view|   1004775|2053013555631882655|electronics.s

In [5]:
# Tag every row with an 'id' column. monotonically_increasing_id() yields
# unique, increasing 64-bit values, but they are NOT guaranteed to be
# consecutive across partitions, so 'id' is an identifier, not a row number.
df_index = df.withColumn('id', F.monotonically_increasing_id())

# Column order with 'id' first, followed by the original columns.
column_names = ['id'] + df.columns

df_index_first = df_index.select(column_names)
df_index_first.show()

+---+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
| id|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+---+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|  0|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|  1|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|  2|2019-11-01 01:00:01|      view|  17302664|2053013553853497655|                null|   creed| 28.31|561587266|755422e7-9040-477...|
|  3|2019-11-01 01:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|  4|2019-11-01 01:00:01|      view|   1004775|2

In [6]:
# Remove the columns that are not kept in the selected dataset.
unused_columns = ('user_session', 'product_id', 'category_id')
df_new = df_index_first.drop(*unused_columns)
df_new.show()

+---+-------------------+----------+--------------------+--------+------+---------+
| id|         event_time|event_type|       category_code|   brand| price|  user_id|
+---+-------------------+----------+--------------------+--------+------+---------+
|  0|2019-11-01 01:00:00|      view|electronics.smart...|  xiaomi|489.07|520088904|
|  1|2019-11-01 01:00:00|      view|appliances.sewing...|  janome|293.65|530496790|
|  2|2019-11-01 01:00:01|      view|                null|   creed| 28.31|561587266|
|  3|2019-11-01 01:00:01|      view|appliances.kitche...|      lg|712.87|518085591|
|  4|2019-11-01 01:00:01|      view|electronics.smart...|  xiaomi|183.27|558856683|
|  5|2019-11-01 01:00:01|      view|  computers.notebook|      hp|360.09|520772685|
|  6|2019-11-01 01:00:01|      view|  computers.notebook|      hp|514.56|514028527|
|  7|2019-11-01 01:00:02|      view|                null| rondell| 30.86|518574284|
|  8|2019-11-01 01:00:02|      view|                null|michelin| 72.72|532

In [7]:
# Discard every row that has a null in any of the remaining columns.
df_new = df_new.na.drop()
df_new.show()

+---+-------------------+----------+--------------------+-------+------+---------+
| id|         event_time|event_type|       category_code|  brand| price|  user_id|
+---+-------------------+----------+--------------------+-------+------+---------+
|  0|2019-11-01 01:00:00|      view|electronics.smart...| xiaomi|489.07|520088904|
|  1|2019-11-01 01:00:00|      view|appliances.sewing...| janome|293.65|530496790|
|  3|2019-11-01 01:00:01|      view|appliances.kitche...|     lg|712.87|518085591|
|  4|2019-11-01 01:00:01|      view|electronics.smart...| xiaomi|183.27|558856683|
|  5|2019-11-01 01:00:01|      view|  computers.notebook|     hp|360.09|520772685|
|  6|2019-11-01 01:00:01|      view|  computers.notebook|     hp|514.56|514028527|
|  9|2019-11-01 01:00:02|      view|electronics.smart...|  apple|732.07|532647354|
| 14|2019-11-01 01:00:05|      view|appliances.kitche...|samsung|411.83|526595547|
| 18|2019-11-01 01:00:07|      view|electronics.smart...| huawei|164.84|566265908|
| 19

In [8]:
# Sanity check after dropping nulls: count the remaining nulls per column.
# All columns are counted in a single aggregation so the data is scanned
# once, instead of running one full count() job per column.
null_check_columns = ["category_code", "brand"]

print("Null values present in:")
null_counts = df_new.select(
    # count() skips nulls and when() without otherwise() yields null for
    # non-matching rows, so this counts exactly the rows where c is null.
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in null_check_columns]
).first()

for c in null_check_columns:
    print(c + ':', null_counts[c])

Null values present in:
category_code: 0
brand: 0


In [9]:
# Count the rows left in df_new; this triggers a Spark job over the data.
count = df_new.count()
# Print the count with thousands separators.
print(f"Total number of rows: {count:,d}")

Total number of rows: 42,089,570


In [10]:
# Output location for the cleaned data.
# NOTE(review): this path is nested under the input path that was read
# above and is written with overwrite mode -- confirm this is the intended
# destination and that it cannot collide with the source data.
save_path = hadoop_path_nov + '/selected_Data/'

# Shuffle into 8 partitions (one part file per partition), then write CSV
# with a header row, replacing any previous output at save_path.
csv_writer = df_new.repartition(8).write
csv_writer.csv(save_path, mode='overwrite', header=True)

# Shut the session down now that the output has been written.
spark.stop()