In [1]:
import os
import sys

from dotenv import load_dotenv
from pyspark import SparkConf, StorageLevel
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Load settings from a local .env file so the os.environ lookups below can see them.
load_dotenv()

# Show which Spark installation this PySpark package resolves to.
print(_find_spark_home())

C:\Users\anama\anaconda3\envs\Bigdata\Lib\site-packages\pyspark


In [2]:
# Runtime settings are read from the environment.
# PYTHON_PATH selects the interpreter Spark uses for the driver and workers.
# When it is unset or empty, fall back to the interpreter running this kernel:
# passing None on to SparkConf.set() would be stringified and leave Spark
# pointing at a non-existent interpreter called "None".
python_path = os.environ.get('PYTHON_PATH') or sys.executable
# Application name shown for this Spark session.
app_name_dec = os.environ.get('APP_NAME_DEC')
# Location of the December events CSV dataset.
hadoop_path_dec = os.environ.get('HADOOP_DEC_DATASET_PATH')

In [3]:
# Spark configuration for a single-machine session that uses every local core.
# The chain is wrapped in parentheses: the previous backslash chain ended with
# a dangling continuation that only parsed because a blank line followed it.
conf = (
    SparkConf()
    .setMaster('local[*]')
    # Scratch space for shuffle and spill files. Spark reads the dotted key
    # 'spark.local.dir'; the former 'spark-local-dir' is not a Spark property,
    # so the directory was never applied.
    .set('spark.local.dir', "C:\\spark-temp")
    .set('spark.driver.memory', '4g')
    .set('spark.executor.memory', '4g')
    .set('spark.driver.maxResultSize', '2g')
    # Use the same Python interpreter for the driver and the workers.
    .set('spark.pyspark.python', python_path)
    .set('spark.pyspark.driver.python', python_path)
    .set("spark.network.timeout", "800s")
    # NOTE(review): dynamic allocation and the external shuffle service are
    # cluster-manager features; with a local[*] master they presumably have
    # no effect - confirm before relying on them.
    .set("spark.dynamicAllocation.enabled", "true")
    .set("spark.shuffle.service.enabled", "true")
    .set("spark.dynamicAllocation.minExecutors", "1")
    .set("spark.dynamicAllocation.maxExecutors", "10")
    .set("spark.dynamicAllocation.executorIdleTimeout", "60s")
)

# Create (or reuse) the session and keep a handle on its SparkContext.
spark = SparkSession.builder.appName(app_name_dec).config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the effective configuration so the settings above can be checked.
for item in sc.getConf().getAll():
    print(item)

('spark.driver.host', '194.47.46.114')
('spark.app.startTime', '1702902572661')
('spark.dynamicAllocation.minExecutors', '1')
('spark.executor.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false')
('spark.driver.port', '60496')

In [4]:
# Read the raw events CSV: the first line holds the column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv(hadoop_path_dec)
df.show()

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|2019-12-01 01:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|2019-12-01 01:00:00|      view|  22700068|2232732091643068746|                null|  force| 102.96|577702456|de33debe-c7bf-44e...|
|2019-12-01 01:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|2019-12-01 01:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   null| 132.31|535135317|61792a26-672f-4e6...|
|2019-12-01 01:00:02|      view|  20100164|2232732110089618156|    apparel.t

In [5]:
# Tag every row with a generated 'id' column, then move that column to the front.
# monotonically_increasing_id() yields unique, increasing values; they are not
# guaranteed to be consecutive.
df_index = df.withColumn('id', F.monotonically_increasing_id())
column_names = ['id', *df.columns]

df_index_first = df_index.select(*column_names)
df_index_first.show()

+---+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
| id|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+---+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|  0|2019-12-01 01:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|  1|2019-12-01 01:00:00|      view|  22700068|2232732091643068746|                null|  force| 102.96|577702456|de33debe-c7bf-44e...|
|  2|2019-12-01 01:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|  3|2019-12-01 01:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   null| 132.31|535135317|61792a26-672f-4e6...|
|  4|2019-12-01 01:00:02|      view|  20100164|2

In [6]:
# Remove the session, product and category identifier columns.
unused_columns = ('user_session', 'product_id', 'category_id')
df_new = df_index_first.drop(*unused_columns)
df_new.show()

+---+-------------------+----------+--------------------+-------+-------+---------+
| id|         event_time|event_type|       category_code|  brand|  price|  user_id|
+---+-------------------+----------+--------------------+-------+-------+---------+
|  0|2019-12-01 01:00:00|      view|construction.tool...|  apple|1302.48|556695836|
|  1|2019-12-01 01:00:00|      view|                null|  force| 102.96|577702456|
|  2|2019-12-01 01:00:01|      view|appliances.person...|  bosch| 313.52|539453785|
|  3|2019-12-01 01:00:02|  purchase|computers.periphe...|   null| 132.31|535135317|
|  4|2019-12-01 01:00:02|      view|    apparel.trousers|   nika| 101.68|517987650|
|  5|2019-12-01 01:00:02|      view|accessories.umbrella|   ikea| 163.56|542860793|
|  6|2019-12-01 01:00:02|      view|  electronics.clocks|   null|  88.81|538021416|
|  7|2019-12-01 01:00:03|      view|construction.tool...| xiaomi| 256.38|525740700|
|  8|2019-12-01 01:00:04|      view|  computers.notebook|    jet|  20.57|512

In [7]:
# Keep only rows in which every remaining column has a value.
# The name df_new is reused on purpose: later cells read it.
df_new = df_new.na.drop()
df_new.show()

+---+-------------------+----------+--------------------+---------+-------+---------+
| id|         event_time|event_type|       category_code|    brand|  price|  user_id|
+---+-------------------+----------+--------------------+---------+-------+---------+
|  0|2019-12-01 01:00:00|      view|construction.tool...|    apple|1302.48|556695836|
|  2|2019-12-01 01:00:01|      view|appliances.person...|    bosch| 313.52|539453785|
|  4|2019-12-01 01:00:02|      view|    apparel.trousers|     nika| 101.68|517987650|
|  5|2019-12-01 01:00:02|      view|accessories.umbrella|     ikea| 163.56|542860793|
|  7|2019-12-01 01:00:03|      view|construction.tool...|   xiaomi| 256.38|525740700|
|  8|2019-12-01 01:00:04|      view|  computers.notebook|      jet|  20.57|512509221|
| 10|2019-12-01 01:00:04|      view|  computers.notebook|    vegas|  49.94|554369617|
| 11|2019-12-01 01:00:04|      view|construction.tool...|    apple|1312.52|579969851|
| 13|2019-12-01 01:00:06|      view|construction.tool.

In [8]:
# Confirm that no nulls remain in the two columns checked below.
# Both counts come from a single aggregation, so the data is scanned once
# instead of launching one full count job per column.
null_check_columns = ["category_code", "brand"]
null_counts = df_new.select(
    [
        # when() yields 1 for null cells and null otherwise; count() skips nulls.
        F.count(F.when(F.col(c).isNull(), F.lit(1))).alias(c)
        for c in null_check_columns
    ]
).first()

print("Null values present in:")
for c in null_check_columns:
    print(c + ':', null_counts[c])

Null values present in:
category_code: 0
brand: 0


In [9]:
# Total rows left after cleaning, printed with thousands separators.
count = df_new.count()
print("Total number of rows:", format(count, ",d"))

Total number of rows: 53,612,307


In [10]:
# Write the cleaned rows as 8 CSV part files (with header), replacing any
# previous output, then shut the Spark session down.
# NOTE(review): the output directory is nested under the same path the input
# was read from - confirm this cannot interfere with re-reading the source data.
save_path = hadoop_path_dec + '/selected_Data/'

df_new.repartition(8).write.csv(save_path, mode='overwrite', header=True)
spark.stop()