Data manipulations

In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=721f8748cb3fdc7fe29d37ba47eee45b3c70fba09f059376917530b166f5a8e6
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Start (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("Advanced DataFrame Operations - Different Dataset")
    .getOrCreate()
)

# Column names shared by both sample product-sales tables.
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'SaleDate']

# First batch of sample product sales.
data1 = [
    (1, 'Product A', 'Electronics', 1200, '2022-05-10'),
    (2, 'Product B', 'Clothing', 500, '2022-07-15'),
    (3, 'Product C', 'Electronics', 1800, '2021-11-05'),
]

# Second batch of sample product sales.
data2 = [
    (4, 'Product D', 'Furniture', 3000, '2022-03-25'),
    (5, 'Product E', 'Clothing', 800, '2022-09-12'),
    (6, 'Product F', 'Electronics', 1500, '2021-10-19'),
]

# Build one DataFrame per batch; SaleDate is still a plain string here.
sales_df1, sales_df2 = (
    spark.createDataFrame(rows, columns) for rows in (data1, data2)
)

# Display both inputs so later results can be compared against them.
print("sales DataFrame 1:")
sales_df1.show()

print("sales DataFrame 2:")
sales_df2.show()


sales DataFrame 1:
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
+---------+-----------+-----------+-----+----------+

sales DataFrame 2:
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



Union of DataFrames (removing duplicates)


In [5]:
# Stack both sales tables (union matches columns by position), then drop
# rows that are exact duplicates across every column.
union_df = (
    sales_df1
    .union(sales_df2)
    .dropDuplicates()
)
union_df.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+



Union of DataFrames (including duplicates)

In [6]:
# Plain union: every row from both inputs is kept, duplicates included.
union_all_df = (
    sales_df1
    .union(sales_df2)
)
union_all_df.show()

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+



Rank products by price within their category

In [7]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
from pyspark.sql.functions import col

# Within each category, order products from most to least expensive.
window_spec = (
    Window
    .partitionBy("Category")
    .orderBy(F.col("Price").desc())
)

# rank() gives equal prices the same rank and leaves a gap after ties.
ranked_df = union_all_df.withColumn(
    "Rank",
    F.rank().over(window_spec),
)
ranked_df.show()

+---------+-----------+-----------+-----+----------+----+
|ProductID|ProductName|   Category|Price|  SaleDate|Rank|
+---------+-----------+-----------+-----+----------+----+
|        5|  Product E|   Clothing|  800|2022-09-12|   1|
|        2|  Product B|   Clothing|  500|2022-07-15|   2|
|        3|  Product C|Electronics| 1800|2021-11-05|   1|
|        6|  Product F|Electronics| 1500|2021-10-19|   2|
|        1|  Product A|Electronics| 1200|2022-05-10|   3|
|        4|  Product D|  Furniture| 3000|2022-03-25|   1|
+---------+-----------+-----------+-----+----------+----+



Calculate cumulative price per category

In [8]:
# Running total of Price inside each category, following window_spec from the
# ranking cell (partition by Category, Price descending). With an ordered
# window and no explicit frame, Spark sums from the start of the partition
# through the current row, so rows that tie on Price share one running total.
cumulative_price_df = union_df.withColumn(
    'CumulativePrice',
    F.sum('Price').over(window_spec),
)
cumulative_price_df.show()


+---------+-----------+-----------+-----+----------+---------------+
|ProductID|ProductName|   Category|Price|  SaleDate|CumulativePrice|
+---------+-----------+-----------+-----+----------+---------------+
|        5|  Product E|   Clothing|  800|2022-09-12|            800|
|        2|  Product B|   Clothing|  500|2022-07-15|           1300|
|        3|  Product C|Electronics| 1800|2021-11-05|           1800|
|        6|  Product F|Electronics| 1500|2021-10-19|           3300|
|        1|  Product A|Electronics| 1200|2022-05-10|           4500|
|        4|  Product D|  Furniture| 3000|2022-03-25|           3000|
+---------+-----------+-----------+-----+----------+---------------+



Convert SaleDate from string to date type

In [9]:

# Parse the 'yyyy-MM-dd' strings in SaleDate into a date column; the result
# keeps the same column name, replacing the string version.
date_converted_df = union_df.withColumn(
    'SaleDate',
    F.to_date(F.col('SaleDate'), 'yyyy-MM-dd'),
)
date_converted_df.show()


+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        6|  Product F|Electronics| 1500|2021-10-19|
|        5|  Product E|   Clothing|  800|2022-09-12|
+---------+-----------+-----------+-----+----------+



Calculate the number of days since each sale

In [10]:

# Days elapsed between each sale and today. The values depend on the date the
# cell is run, so they will differ between runs.
days_since_sale_df = date_converted_df.withColumn(
    'DaysSinceSale',
    F.datediff(F.current_date(), F.col('SaleDate')),
)
days_since_sale_df.show()


+---------+-----------+-----------+-----+----------+-------------+
|ProductID|ProductName|   Category|Price|  SaleDate|DaysSinceSale|
+---------+-----------+-----------+-----+----------+-------------+
|        1|  Product A|Electronics| 1200|2022-05-10|          848|
|        2|  Product B|   Clothing|  500|2022-07-15|          782|
|        3|  Product C|Electronics| 1800|2021-11-05|         1034|
|        4|  Product D|  Furniture| 3000|2022-03-25|          894|
|        6|  Product F|Electronics| 1500|2021-10-19|         1051|
|        5|  Product E|   Clothing|  800|2022-09-12|          723|
+---------+-----------+-----------+-----+----------+-------------+



Add a column for the next sale deadline

In [12]:
# Deadline for the next sale: 30 days after each SaleDate.
next_sale_deadline_df = date_converted_df.withColumn(
    'NextSaleDeadline',
    F.date_add(F.col('SaleDate'), 30),
)
next_sale_deadline_df.show()


+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|NextSaleDeadline|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|      2022-06-09|
|        2|  Product B|   Clothing|  500|2022-07-15|      2022-08-14|
|        3|  Product C|Electronics| 1800|2021-11-05|      2021-12-05|
|        4|  Product D|  Furniture| 3000|2022-03-25|      2022-04-24|
|        6|  Product F|Electronics| 1500|2021-10-19|      2021-11-18|
|        5|  Product E|   Clothing|  800|2022-09-12|      2022-10-12|
+---------+-----------+-----------+-----+----------+----------------+



Calculate total revenue and average price per category

In [13]:

# Per-category summary: summed Price as total revenue plus the mean Price.
revenue_avg_price_df = (
    union_df
    .groupBy('Category')
    .agg(
        F.sum(F.col('Price')).alias('TotalRevenue'),
        F.avg(F.col('Price')).alias('AveragePrice'),
    )
)
revenue_avg_price_df.show()


+-----------+------------+------------+
|   Category|TotalRevenue|AveragePrice|
+-----------+------------+------------+
|Electronics|        4500|      1500.0|
|   Clothing|        1300|       650.0|
|  Furniture|        3000|      3000.0|
+-----------+------------+------------+



Convert all product names to lowercase

In [14]:
# Add a lowercase copy of ProductName next to the original column.
# F.col is used instead of a bare `col` so this cell only needs the
# `functions as F` import from the setup cell, not an import made in a
# different, unrelated cell.
lower_name_df = union_df.withColumn(
    "ProductNameLower",
    F.lower(F.col("ProductName")),
)
lower_name_df.show()

+---------+-----------+-----------+-----+----------+----------------+
|ProductID|ProductName|   Category|Price|  SaleDate|ProductNameLower|
+---------+-----------+-----------+-----+----------+----------------+
|        1|  Product A|Electronics| 1200|2022-05-10|       product a|
|        2|  Product B|   Clothing|  500|2022-07-15|       product b|
|        3|  Product C|Electronics| 1800|2021-11-05|       product c|
|        4|  Product D|  Furniture| 3000|2022-03-25|       product d|
|        6|  Product F|Electronics| 1500|2021-10-19|       product f|
|        5|  Product E|   Clothing|  800|2022-09-12|       product e|
+---------+-----------+-----------+-----+----------+----------------+

