**Dataset Preparation**

In [1]:
import pandas as pd
from datetime import datetime

# Sample sales data
data = {
    "TransactionID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "CustomerID": [101, 102, 103, 101, 104, 102, 103, 104, 101, 105],
    "ProductID": [501, 502, 501, 503, 504, 502, 503, 504, 501, 505],
    "Quantity": [2, 1, 4, 3, 1, 2, 5, 1, 2, 1],
    "Price": [150.0, 250.0, 150.0, 300.0, 450.0, 250.0, 300.0, 450.0, 150.0, 550.0],
    "Date": [
        datetime(2024, 9, 1),
        datetime(2024, 9, 1),
        datetime(2024, 9, 2),
        datetime(2024, 9, 2),
        datetime(2024, 9, 3),
        datetime(2024, 9, 3),
        datetime(2024, 9, 4),
        datetime(2024, 9, 4),
        datetime(2024, 9, 5),
        datetime(2024, 9, 5)
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('sales_data.csv', index=False)

print("Sample sales dataset has been created and saved as 'sales_data.csv'.")


Sample sales dataset has been created and saved as 'sales_data.csv'.


In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f57e00357d3ac1e5bdb4ffc125e90983417b08155ecfca2b8c807f87164cc5e2
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


**Load the Dataset into PySpark**

In [4]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as pyspark_sum

# Initialize the SparkSession
spark = SparkSession.builder \
    .appName("Sales Dataset Analysis") \
    .getOrCreate()

# Load the CSV file into a DataFrame
df = spark.read.csv('sales_data.csv', header=True, inferSchema=True)

# Display the first few rows
df.show()


+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
|            6|       102|      502|       2|250.0|2024-09-03|
|            7|       103|      503|       5|300.0|2024-09-04|
|            8|       104|      504|       1|450.0|2024-09-04|
|            9|       101|      501|       2|150.0|2024-09-05|
|           10|       105|      505|       1|550.0|2024-09-05|
+-------------+----------+---------+--------+-----+----------+



**Explore the Data**


In [6]:
#Print the Schema:
df.printSchema()

#Show the First Few Rows:
df.show(5)

#Get Summary Statistics:
df.describe(["Quantity", "Price"]).show()


root
 |-- TransactionID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- Date: date (nullable = true)

+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
+-------------+----------+---------+--------+-----+----------+
only showing top 5 rows

+-------+-----------------+-----------------+
|summary|         Quantity|            Price|
+-------+-----------------+-----------------+
|  count|            

**Perform Data Transformations and Analysis**

In [7]:
# 1.Calculate the Total Sales Value for Each Transaction
df = df.withColumn("TotalSales", col("Quantity") * col("Price"))
df.show()


+-------------+----------+---------+--------+-----+----------+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|TotalSales|
+-------------+----------+---------+--------+-----+----------+----------+
|            1|       101|      501|       2|150.0|2024-09-01|     300.0|
|            2|       102|      502|       1|250.0|2024-09-01|     250.0|
|            3|       103|      501|       4|150.0|2024-09-02|     600.0|
|            4|       101|      503|       3|300.0|2024-09-02|     900.0|
|            5|       104|      504|       1|450.0|2024-09-03|     450.0|
|            6|       102|      502|       2|250.0|2024-09-03|     500.0|
|            7|       103|      503|       5|300.0|2024-09-04|    1500.0|
|            8|       104|      504|       1|450.0|2024-09-04|     450.0|
|            9|       101|      501|       2|150.0|2024-09-05|     300.0|
|           10|       105|      505|       1|550.0|2024-09-05|     550.0|
+-------------+----------+---------+--

In [8]:
#2.Group By ProductID and Calculate Total Sales Per Product:

sales_per_product = df.groupBy("ProductID").agg(pyspark_sum("TotalSales").alias("TotalSales"))
sales_per_product.show()


+---------+----------+
|ProductID|TotalSales|
+---------+----------+
|      501|    1200.0|
|      504|     900.0|
|      502|     750.0|
|      505|     550.0|
|      503|    2400.0|
+---------+----------+



In [9]:
#3. Identify the Top-Selling Product:

top_selling_product = sales_per_product.orderBy(col("TotalSales").desc()).first()
print("Top-Selling Product ID:", top_selling_product["ProductID"])
print("Total Sales for Top-Selling Product:", top_selling_product["TotalSales"])


Top-Selling Product ID: 503
Total Sales for Top-Selling Product: 2400.0


In [10]:
#4. Calculate the Total Sales by Date:

sales_by_date = df.groupBy("Date").agg(pyspark_sum("TotalSales").alias("TotalSales"))
sales_by_date.show()


+----------+----------+
|      Date|TotalSales|
+----------+----------+
|2024-09-03|     950.0|
|2024-09-01|     550.0|
|2024-09-02|    1500.0|
|2024-09-05|     850.0|
|2024-09-04|    1950.0|
+----------+----------+



In [11]:
#5. Filter High-Value Transactions:

high_value_transactions = df.filter(col("TotalSales") > 500)
high_value_transactions.show()


+-------------+----------+---------+--------+-----+----------+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|TotalSales|
+-------------+----------+---------+--------+-----+----------+----------+
|            3|       103|      501|       4|150.0|2024-09-02|     600.0|
|            4|       101|      503|       3|300.0|2024-09-02|     900.0|
|            7|       103|      503|       5|300.0|2024-09-04|    1500.0|
|           10|       105|      505|       1|550.0|2024-09-05|     550.0|
+-------------+----------+---------+--------+-----+----------+----------+



**Additional challenge**

In [12]:
#1.Identify repeat customers

repeat_customers = df.groupBy("CustomerID").count().filter(col("count") > 1)
repeat_customers.show()


+----------+-----+
|CustomerID|count|
+----------+-----+
|       101|    3|
|       103|    2|
|       102|    2|
|       104|    2|
+----------+-----+



In [14]:
#2.Calculate the Average Sale Price Per Product:

average_price_per_product = df.groupBy("ProductID").agg(
    (pyspark_sum("Price") / pyspark_sum("Quantity")).alias("AveragePricePerUnit")
)
average_price_per_product.show()


+---------+-------------------+
|ProductID|AveragePricePerUnit|
+---------+-------------------+
|      501|              56.25|
|      504|              450.0|
|      502| 166.66666666666666|
|      505|              550.0|
|      503|               75.0|
+---------+-------------------+

