In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=91987410ab516eb43718fe11aed7d60a14d06800e37bbd3a07c28e57505441c7
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


# E-commerce Transactions


In [14]:
import pyspark
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("E-commerce Analysis").getOrCreate()

In [15]:
# Load the data into a DataFrame
df = spark.read.csv("ecommerce_data.csv", header=True, inferSchema=True)

**1. Calculate the Total Revenue per Category**

In [None]:
from pyspark.sql.functions import col, sum, when

# Calculate the actual revenue considering discount
df = df.withColumn("actual_revenue", (col("price") * col("quantity")) * (1 - (col("discount_percentage") / 100)))

# Group by category and sum the actual revenue
total_revenue_by_category = df.groupBy("category").agg(sum("actual_revenue").alias("total_revenue"))

total_revenue_by_category.show()

**2. Filter Transactions with a Discount Greater Than 10%**

In [17]:
from pyspark.sql.functions import col, sum, when # Make sure to import the col function from pyspark.sql.functions

filtered_df = df.filter(col("discount_percentage") > 10)
filtered_df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+-------------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|         transacti |
+--------------+-----------+------------+--------------+-----+--------+-------------------+-------------------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|2023-08-03 00:00:00|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|2023-08-04 00:00:00|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|2023-08-05 00:00:00|
+--------------+-----------+------------+--------------+-----+--------+-------------------+-------------------+



**3. Find the Most Expensive Product Sold**

In [18]:
most_expensive_product = df.orderBy(col("price").desc()).select("product", "price").first()
print("Most expensive product:", most_expensive_product)

Most expensive product: Row(product='Laptop', price=1000)


**4. Calculate the Average Quantity of Products Sold per Category**

In [20]:
from pyspark.sql.functions import col, sum, when, avg # Import the avg function

average_quantity_by_category = df.groupBy("category").agg(avg("quantity").alias("average_quantity"))
average_quantity_by_category.show()

+--------------+----------------+
|      category|average_quantity|
+--------------+----------------+
|       Fashion|             2.0|
|   Electronics|            1.75|
|         Books|             4.0|
|Home Appliance|             1.0|
+--------------+----------------+



**5. Identify Customers Who Purchased More Than One Product**

In [22]:
from pyspark.sql.functions import col, sum, when, avg, count # Import the count function


customers_with_multiple_products = df.groupBy("customer_id").agg(count("product").alias("product_count")).filter(col("product_count") > 1)
customers_with_multiple_products.show()

+-----------+-------------+
|customer_id|product_count|
+-----------+-------------+
|        101|            2|
|        102|            2|
+-----------+-------------+



**6. Find the Top 3 Highest Revenue Transactions**

In [24]:
from pyspark.sql.functions import col, sum, when, avg, count

# Calculate revenue
df = df.withColumn("revenue", col("price") * col("quantity") * (1 - col("discount_percentage")))

# Order by revenue and get the top 3 transactions
top_revenue_transactions = df.orderBy(col("revenue").desc()).limit(3)
top_revenue_transactions.show()

+--------------+-----------+-------+--------------+-----+--------+-------------------+-------------------+-------+
|transaction_id|customer_id|product|      category|price|quantity|discount_percentage|         transacti |revenue|
+--------------+-----------+-------+--------------+-----+--------+-------------------+-------------------+-------+
|             3|        103|  Shirt|       Fashion|   40|       3|                  0|2023-08-02 00:00:00|    120|
|             8|        107|   Book|         Books|   20|       4|                  0|2023-08-05 00:00:00|     80|
|             9|        108|Toaster|Home Appliance|   30|       1|                  5|2023-08-06 00:00:00|   -120|
+--------------+-----------+-------+--------------+-----+--------+-------------------+-------------------+-------+



**7. Calculate the Total Number of Transactions per Day**

In [26]:
from pyspark.sql.functions import col, sum, when, avg, count

# Calculate transactions per day from the Spark DataFrame 'df'
transactions_per_day = df.groupBy("transaction_id").agg(count("*").alias("transaction_count"))

# Show the result
transactions_per_day.show()

+--------------+-----------------+
|transaction_id|transaction_count|
+--------------+-----------------+
|             1|                1|
|             6|                1|
|             3|                1|
|             5|                1|
|             9|                1|
|             4|                1|
|             8|                1|
|             7|                1|
|            10|                1|
|             2|                1|
+--------------+-----------------+



**8. Find the Customer Who Spent the Most Money**

In [28]:
customer_spending = df.groupBy("customer_id").agg(sum("revenue").alias("total_spent")) # Replace 'actual_revenue' with 'revenue'
top_spender = customer_spending.orderBy(col("total_spent").desc()).first()
print("Top spender:", top_spender)

Top spender: Row(customer_id=103, total_spent=120)


**9. Calculate the Average Discount Given per Product Category**

In [29]:
average_discount_by_category = df.groupBy("category").agg(avg("discount_percentage").alias("average_discount"))
average_discount_by_category.show()

+--------------+----------------+
|      category|average_discount|
+--------------+----------------+
|       Fashion|            10.0|
|   Electronics|            8.75|
|         Books|             0.0|
|Home Appliance|            15.0|
+--------------+----------------+



**10. Create a New Column for Final Price After Discount**

In [30]:
df = df.withColumn("final_price", col("price") - (col("price") * col("discount_percentage") / 100))
df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+-------------------+-------+-----------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|         transacti |revenue|final_price|
+--------------+-----------+------------+--------------+-----+--------+-------------------+-------------------+-------+-----------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|2023-08-01 00:00:00|  -9000|      900.0|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|2023-08-01 00:00:00|  -5600|      665.0|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|2023-08-02 00:00:00|    120|       40.0|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|2023-08-03 00:00:00|  -2100|      127.5|
|             5|        101|  Headphones|   Electronics|  100|       2|     

# Banking Transactions

In [31]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, when, count

In [33]:
df = spark.read.csv("banking_transactions.csv", header=True, inferSchema=True)
df.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             6|        205|      Withdrawal|   500|      2023-09-03|
|             7|        202|         Deposit|  2500|      2023-09-04|
|             8|        206|      Withdrawal|   700|      2023-09-04|
|             9|        203|         Deposit|  4000|      2023-09-05|
|            10|        204|      Withdrawal|  3000|     2023-09-05 |
+--------------+-----------+----------------+------+----------------+



**1. Calculate the Total Deposit and Withdrawal Amounts**

In [34]:
total_amounts = df.groupBy("transaction_type").agg(sum("amount").alias("total_amount"))
total_amounts.show()

+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+



**2. Filter Transactions Greater Than $3,000**

In [35]:
filtered_transactions = df.filter(col("amount") > 3000)
filtered_transactions.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



**3. Find the Largest Deposit Made**

In [36]:
largest_deposit = df.filter(col("transaction_type") == "Deposit").orderBy(col("amount").desc()).first()
print("Largest Deposit:", largest_deposit)


Largest Deposit: Row(transaction_id=5, customer_id=204, transaction_type='Deposit', amount=10000, transaction_date='2023-09-03')


**4. Calculate the Average Transaction Amount for Each Transaction Type**

In [37]:
average_amounts = df.groupBy("transaction_type").agg(avg("amount").alias("average_amount"))
average_amounts.show()

+----------------+--------------+
|transaction_type|average_amount|
+----------------+--------------+
|         Deposit|        4900.0|
|      Withdrawal|        1540.0|
+----------------+--------------+



**5. Find Customers Who Made Both Deposits and Withdrawals**

In [39]:
from pyspark.sql.functions import col, countDistinct, when
customers_with_both = df.groupBy("customer_id").agg(
    countDistinct(when(col("transaction_type") == "Deposit", 1)).alias("deposit_count"),
    countDistinct(when(col("transaction_type") == "Withdrawal", 1)).alias("withdrawal_count")
).filter((col("deposit_count") > 0) & (col("withdrawal_count") > 0))
customers_with_both.show()



+-----------+-------------+----------------+
|customer_id|deposit_count|withdrawal_count|
+-----------+-------------+----------------+
|        202|            1|               1|
|        204|            1|               1|
|        201|            1|               1|
+-----------+-------------+----------------+



**6. Calculate the Total Amount of Transactions per Day**

In [40]:
total_amount_per_day = df.groupBy("transaction_date").agg(sum("amount").alias("total_amount"))
total_amount_per_day.show()



+----------------+------------+
|transaction_date|total_amount|
+----------------+------------+
|      2023-09-01|        7000|
|      2023-09-02|        4500|
|      2023-09-05|        4000|
|     2023-09-05 |        3000|
|      2023-09-04|        3200|
|      2023-09-03|       10500|
+----------------+------------+



**7. Find the Customer with the Highest Total Withdrawal**

In [41]:

highest_withdrawal_customer = df.filter(col("transaction_type") == "Withdrawal").groupBy("customer_id").agg(sum("amount").alias("total_withdrawal")).orderBy(col("total_withdrawal").desc()).first()
print("Customer with Highest Total Withdrawal:", highest_withdrawal_customer)



Customer with Highest Total Withdrawal: Row(customer_id=204, total_withdrawal=3000)


**8. Calculate the Number of Transactions for Each Customer**

In [42]:

transaction_count_per_customer = df.groupBy("customer_id").agg(count("*").alias("transaction_count"))
transaction_count_per_customer.show()



+-----------+-----------------+
|customer_id|transaction_count|
+-----------+-----------------+
|        206|                1|
|        205|                1|
|        202|                2|
|        203|                2|
|        204|                2|
|        201|                2|
+-----------+-----------------+



**9. Find All Transactions That Occurred on the Same Day as a Withdrawal Greater Than $1,000**

In [44]:
withdrawal_greater_than_1000 = df.filter((col("transaction_type") == "Withdrawal") & (col("amount") > 1000))
joined_df = df.join(withdrawal_greater_than_1000, df.transaction_date == withdrawal_greater_than_1000.transaction_date, "inner")
joined_df.show()

+--------------+-----------+----------------+------+----------------+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|             4|        201|      Withdrawal|  1500|      2023-09-02|
|            10|        204|      

**10. Create a New Column to Classify Transactions as "High" or "Low" Value**

In [45]:
df = df.withColumn("transaction_value", when(col("amount") > 5000, "High").otherwise("Low"))
df.show()

+--------------+-----------+----------------+------+----------------+-----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|transaction_value|
+--------------+-----------+----------------+------+----------------+-----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|              Low|
|             2|        202|      Withdrawal|  2000|      2023-09-01|              Low|
|             3|        203|         Deposit|  3000|      2023-09-02|              Low|
|             4|        201|      Withdrawal|  1500|      2023-09-02|              Low|
|             5|        204|         Deposit| 10000|      2023-09-03|             High|
|             6|        205|      Withdrawal|   500|      2023-09-03|              Low|
|             7|        202|         Deposit|  2500|      2023-09-04|              Low|
|             8|        206|      Withdrawal|   700|      2023-09-04|              Low|
|             9|        203|    

# Health & Fitness Tracker Data

In [46]:
df = spark.read.csv("health_fitness_tracker_data.csv", header=True, inferSchema=True)
df.show()

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|           7.0|     Cardio |
+-------+----------+-----+---------------+--------------+------------+



**1. Find the Total Steps Taken by Each User**

In [47]:
total_steps_per_user = df.groupBy("user_id").agg(sum("steps").alias("total_steps"))
total_steps_per_user.show()

+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      35000|
|      3|      45000|
|      2|      29500|
+-------+-----------+



**2. Filter Days with More Than 10,000 Steps**

In [48]:
filtered_data = df.filter(col("steps") > 10000)
filtered_data.show()

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|           7.0|     Cardio |
+-------+----------+-----+---------------+--------------+------------+



**3. Calculate the Average Calories Burned by Workout Type**

In [49]:
average_calories_by_workout = df.groupBy("workout_type").agg(avg("calories_burned").alias("average_calories"))
average_calories_by_workout.show()

+------------+-----------------+
|workout_type| average_calories|
+------------+-----------------+
|    Strength|            500.0|
|        Yoga|573.3333333333334|
|      Cardio|483.3333333333333|
|     Cardio |            700.0|
+------------+-----------------+



**4. Identify the Day with the Most Steps for Each User**

In [60]:
average_calories_by_workout = df.groupBy("workout_type").agg(avg("calories_burned").alias("average_calories"))
average_calories_by_workout.show()

+------------+-----------------+
|workout_type| average_calories|
+------------+-----------------+
|    Strength|            500.0|
|        Yoga|573.3333333333334|
|      Cardio|483.3333333333333|
|     Cardio |            700.0|
+------------+-----------------+



**5. Find Users Who Burned More Than 600 Calories on Any Day**

In [54]:
users_burned_more_than_600 = df.filter(col("calories_burned") > 600).select("user_id").distinct()
users_burned_more_than_600.show()

+-------+
|user_id|
+-------+
|      3|
+-------+



**6. Calculate the Average Hours of Sleep per User**

In [55]:
average_sleep_per_user = df.groupBy("user_id").agg(avg("hours_of_sleep").alias("average_sleep"))
average_sleep_per_user.show()

+-------+-----------------+
|user_id|    average_sleep|
+-------+-----------------+
|      1|              7.0|
|      3|              7.5|
|      2|6.666666666666667|
+-------+-----------------+



**7. Find the Total Calories Burned per Day**

In [56]:
total_calories_per_day = df.groupBy("date").agg(sum("calories_burned").alias("total_calories"))
total_calories_per_day.show()

+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-09-03|          1770|
|2023-09-01|          1550|
|2023-09-02|          1550|
+----------+--------------+



**8. Identify Users Who Did Different Types of Workouts**

In [57]:
users_with_multiple_workouts = df.groupBy("user_id").agg(countDistinct("workout_type").alias("workout_count")).filter(col("workout_count") > 1)
users_with_multiple_workouts.show()

+-------+-------------+
|user_id|workout_count|
+-------+-------------+
|      1|            2|
|      3|            3|
|      2|            3|
+-------+-------------+



**9. Calculate the Total Number of Workouts per User**

In [58]:
workout_count_per_user = df.groupBy("user_id").agg(count("*").alias("workout_count"))
workout_count_per_user.show()

+-------+-------------+
|user_id|workout_count|
+-------+-------------+
|      1|            3|
|      3|            3|
|      2|            3|
+-------+-------------+



**10. Create a New Column for "Active" Days**

In [59]:
df = df.withColumn("active_day", when(col("steps") > 10000, "Active").otherwise("Inactive"))
df.show()

+-------+----------+-----+---------------+--------------+------------+----------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|active_day|
+-------+----------+-----+---------------+--------------+------------+----------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|    Active|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|  Inactive|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|    Active|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|  Inactive|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|  Inactive|
|      3|2023-09-02|14000|            600|           7.5|    Strength|    Active|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|    Active|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|    Active|
|      3|2023-09-03|16000|            700|           7.0|     Cardio |    Active|
+-------+-------

# Music Streaming Data

In [61]:
df = spark.read.csv("music_streaming_data.csv", header=True, inferSchema=True)
df.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|
|      3|      New Rules|  Dua Lipa|             230|2

**1. Calculate the Total Listening Time for Each User**

In [62]:
total_listening_time = df.groupBy("user_id").agg(sum("duration_seconds").alias("total_listening_time"))
total_listening_time.show()


+-------+--------------------+
|user_id|total_listening_time|
+-------+--------------------+
|      1|                 630|
|      3|                 610|
|      2|                 680|
+-------+--------------------+



**2. Filter Songs Streamed for More Than 200 Seconds**

In [63]:
filtered_songs = df.filter(col("duration_seconds") > 200)
filtered_songs.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+



**3. Find the Most Popular Artist (by Total Streams)**

In [64]:
most_popular_artist = df.groupBy("artist").agg(count("*").alias("stream_count")).orderBy(col("stream_count").desc()).first()
print("Most Popular Artist:", most_popular_artist)

Most Popular Artist: Row(artist='Dua Lipa', stream_count=3)


**4. Identify the Song with the Longest Duration**

In [65]:
longest_song = df.orderBy(col("duration_seconds").desc()).select("song_title", "artist", "duration_seconds").first()
print("Longest Song:", longest_song)

Longest Song: Row(song_title='Perfect', artist='Ed Sheeran', duration_seconds=250)


**5. Calculate the Average Song Duration by Artist**

In [66]:
average_duration_by_artist = df.groupBy("artist").agg(avg("duration_seconds").alias("average_duration"))
average_duration_by_artist.show()

+----------+------------------+
|    artist|  average_duration|
+----------+------------------+
|  Dua Lipa|203.33333333333334|
|Ed Sheeran|226.66666666666666|
|The Weeknd|             210.0|
+----------+------------------+



**6. Find the Top 3 Most Streamed Songs per User**

In [67]:
top_3_streamed_songs = df.groupBy("user_id").agg(count("*").alias("stream_count")).orderBy(col("stream_count").desc()).limit(3)
top_3_streamed_songs.show()

+-------+------------+
|user_id|stream_count|
+-------+------------+
|      1|           3|
|      3|           3|
|      2|           3|
+-------+------------+



**7. Calculate the Total Number of Streams per Day**

In [69]:
from pyspark.sql.functions import col, to_date
df = df.withColumn("streaming_date", to_date(col("streaming_time")))
total_streams_per_day = df.groupBy("streaming_date").agg(count("*").alias("total_streams"))
total_streams_per_day.show()

+--------------+-------------+
|streaming_date|total_streams|
+--------------+-------------+
|    2023-09-01|            5|
|    2023-09-02|            4|
+--------------+-------------+



**8. Identify Users Who Streamed Songs from More Than One Artist**

In [70]:
users_with_multiple_artists = df.groupBy("user_id").agg(countDistinct("artist").alias("artist_count")).filter(col("artist_count") > 1)
users_with_multiple_artists.show()

+-------+------------+
|user_id|artist_count|
+-------+------------+
+-------+------------+



**9. Calculate the Total Streams for Each Location**

In [71]:
total_streams_per_location = df.groupBy("location").agg(count("*").alias("total_streams"))
total_streams_per_location.show()

+-----------+-------------+
|   location|total_streams|
+-----------+-------------+
|Los Angeles|            3|
|     London|            3|
|   New York|            3|
+-----------+-------------+



**10. Create a New Column to Classify Long and Short Songs**

In [72]:
df = df.withColumn("song_length", when(col("duration_seconds") > 200, "Long").otherwise("Short"))
df.show()

+-------+---------------+----------+----------------+-------------------+-----------+--------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|streaming_date|song_length|
+-------+---------------+----------+----------------+-------------------+-----------+--------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|    2023-09-01|      Short|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|    2023-09-01|       Long|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|    2023-09-01|      Short|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|    2023-09-01|       Long|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|    2023-09-01|       Long|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|    2023-09

# Retail Store Sales Data


In [74]:
df = spark.read.csv("retail_store_sales_data.csv", header=True, inferSchema=True)
df.show()

+--------------+------------+-----------+-----+--------+----------+
|transaction_id|product_name|   category|price|quantity|sales_date|
+--------------+------------+-----------+-----+--------+----------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|
|             6|       Pants|   Clothing| 25.0|       3|2023-09-03|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|
|             8|         Pen| Stationery|  1.0|      10|2023-09-04|
|             9|      Orange|  Groceries|  0.6|       8|2023-09-05|
|            10|    Sneakers|   Clothing| 50.0|       1|2023-09-05|
+--------------+------------+-----------+-----+--------+----------+



**1. Calculate the Total Revenue per Category**

In [75]:
total_revenue_per_category = df.groupBy("category").agg(sum(col("price") * col("quantity")).alias("total_revenue"))
total_revenue_per_category.show()

+-----------+------------------+
|   category|     total_revenue|
+-----------+------------------+
| Stationery|              20.0|
|  Groceries|13.399999999999999|
|Electronics|            1000.0|
|   Clothing|             155.0|
+-----------+------------------+



**2. Filter Transactions Where the Total Sales Amount is Greater Than $100**

In [76]:
filtered_transactions = df.filter(col("price") * col("quantity") > 100)
filtered_transactions.show()

+--------------+------------+-----------+-----+--------+----------+
|transaction_id|product_name|   category|price|quantity|sales_date|
+--------------+------------+-----------+-----+--------+----------+
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|
+--------------+------------+-----------+-----+--------+----------+



**3. Find the Most Sold Product**

In [77]:
most_sold_product = df.groupBy("product_name").agg(sum("quantity").alias("total_quantity")).orderBy(col("total_quantity").desc()).first()
print("Most Sold Product:", most_sold_product)

Most Sold Product: Row(product_name='Banana', total_quantity=12)


**4. Calculate the Average Price per Product Category**

In [78]:
average_price_per_category = df.groupBy("category").agg(avg("price").alias("average_price"))
average_price_per_category.show()

+-----------+------------------+
|   category|     average_price|
+-----------+------------------+
| Stationery|               1.5|
|  Groceries|0.4666666666666666|
|Electronics|             450.0|
|   Clothing|              30.0|
+-----------+------------------+



**5. Find the Top 3 Highest Grossing Products**

In [79]:
top_3_grossing_products = df.groupBy("product_name").agg(sum(col("price") * col("quantity")).alias("total_revenue")).orderBy(col("total_revenue").desc()).limit(3)
top_3_grossing_products.show()

+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|      Laptop|        800.0|
|  Headphones|        200.0|
|       Pants|         75.0|
+------------+-------------+



**6. Calculate the Total Number of Items Sold per Day**

In [80]:
total_items_sold_per_day = df.groupBy("sales_date").agg(sum("quantity").alias("total_quantity"))
total_items_sold_per_day.show()


+----------+--------------+
|sales_date|total_quantity|
+----------+--------------+
|2023-09-03|             4|
|2023-09-01|            12|
|2023-09-05|             9|
|2023-09-02|            17|
|2023-09-04|            12|
+----------+--------------+



**7. Identify the Product with the Lowest Price in Each Category**

In [82]:
from pyspark.sql.functions import min, col
lowest_price_product = df.groupBy("category").agg(min("price").alias("lowest_price"))
lowest_price_product.show()


+-----------+------------+
|   category|lowest_price|
+-----------+------------+
| Stationery|         1.0|
|  Groceries|         0.3|
|Electronics|       100.0|
|   Clothing|        15.0|
+-----------+------------+



**8. Calculate the Total Revenue for Each Product**

In [83]:
total_revenue_per_product = df.groupBy("product_name").agg(sum(col("price") * col("quantity")).alias("total_revenue"))
total_revenue_per_product.show()

+------------+------------------+
|product_name|     total_revenue|
+------------+------------------+
|     T-shirt|              30.0|
|    Sneakers|              50.0|
|      Orange|               4.8|
|      Banana|3.5999999999999996|
|         Pen|              10.0|
|       Pants|              75.0|
|      Laptop|             800.0|
|    Notebook|              10.0|
|       Apple|               5.0|
|  Headphones|             200.0|
+------------+------------------+



**9. Find the Total Sales per Day for Each Category**

In [84]:
total_sales_per_day_per_category = df.groupBy("sales_date", "category").agg(sum(col("price") * col("quantity")).alias("total_sales"))
total_sales_per_day_per_category.show()

+----------+-----------+------------------+
|sales_date|   category|       total_sales|
+----------+-----------+------------------+
|2023-09-03|Electronics|             800.0|
|2023-09-01|  Groceries|               5.0|
|2023-09-01|   Clothing|              30.0|
|2023-09-02| Stationery|              10.0|
|2023-09-04| Stationery|              10.0|
|2023-09-02|  Groceries|3.5999999999999996|
|2023-09-05|  Groceries|               4.8|
|2023-09-05|   Clothing|              50.0|
|2023-09-03|   Clothing|              75.0|
|2023-09-04|Electronics|             200.0|
+----------+-----------+------------------+



**10. Create a New Column for Discounted Price**

In [85]:
df = df.withColumn("discounted_price", col("price") * 0.9)
df.show()

+--------------+------------+-----------+-----+--------+----------+----------------+
|transaction_id|product_name|   category|price|quantity|sales_date|discounted_price|
+--------------+------------+-----------+-----+--------+----------+----------------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|            0.45|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|            13.5|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|             1.8|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|            0.27|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|           720.0|
|             6|       Pants|   Clothing| 25.0|       3|2023-09-03|            22.5|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|            90.0|
|             8|         Pen| Stationery|  1.0|      10|2023-09-04|             0.9|
|             9|      Orange|  Groceries|  0.6|       8|2023-09-0