**Coding challenge**

In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=5ff3ba1e71c521e99c8f588f80d3de19f0d6cbd95d93acbc176de054e7b513d8
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession

# **E-commerce Transactions**





In [None]:
spark = SparkSession.builder.appName("EcommerceAnalysis").getOrCreate()
csv_file_path = "/content/sample_data/Ecommerce_Transaction.csv"

Ecommerce_df = spark.read.format("csv").option("header", "true").load(csv_file_path)
Ecommerce_df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             5|        101|  Headphones|   Electronics|  100|       2|                 10|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|   

In [None]:
# 1. Calculate the Total Revenue per Category
from pyspark.sql.functions import col, sum

Ecommerce_df = Ecommerce_df.withColumn("revenue", col("price") * col("quantity") * (1 - col("discount_percentage") / 100))
total_revenue_per_category = Ecommerce_df.groupBy("category").agg(sum("revenue").alias("total_revenue"))
total_revenue_per_category.show()


+--------------+-------------+
|      category|total_revenue|
+--------------+-------------+
|       Fashion|        168.0|
|   Electronics|       2950.0|
|         Books|         80.0|
|Home Appliance|        756.0|
+--------------+-------------+



In [None]:
2. # Filter Transactions with a Discount Greater Than 10%
discounted_transactions = Ecommerce_df.filter(col("discount_percentage") > 10)
discounted_transactions.show()


+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|revenue|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|  127.5|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|   48.0|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|  600.0|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+



In [None]:
# 3. Find the Most Expensive Product Sold
most_expensive_product = Ecommerce_df.orderBy(col("price").desc()).select("product", "price").limit(1)

most_expensive_product.show()


+------------+-----+
|     product|price|
+------------+-----+
|Refrigerator|  800|
+------------+-----+



In [None]:
# 4. Calculate the Average Quantity of Products Sold per Category
from pyspark.sql.functions import avg

avg_quantity_per_category = Ecommerce_df.groupBy("category").agg(avg("quantity").alias("avg_quantity"))
avg_quantity_per_category.show()


+--------------+------------+
|      category|avg_quantity|
+--------------+------------+
|       Fashion|         2.0|
|   Electronics|        1.75|
|         Books|         4.0|
|Home Appliance|         1.0|
+--------------+------------+



In [None]:
# 5. Identify Customers Who Purchased More Than One Product
multiple_products = Ecommerce_df.filter(col("quantity") > 1).select("customer_id", "transaction_id").distinct()
multiple_products.show()


+-----------+--------------+
|customer_id|transaction_id|
+-----------+--------------+
|        101|             5|
|        103|             3|
|        107|             8|
|        102|             2|
|        102|            10|
+-----------+--------------+



In [None]:
 # 6. Find the Top 3 Highest Revenue Transactions
top_transactions = Ecommerce_df.groupBy("transaction_id").agg(sum("revenue").alias("total_revenue"))
top_3_transactions = top_transactions.orderBy(col("total_revenue").desc()).limit(3)
top_3_transactions.show()


+--------------+-------------+
|transaction_id|total_revenue|
+--------------+-------------+
|             2|       1330.0|
|             1|        900.0|
|             7|        600.0|
+--------------+-------------+



In [None]:
# 7. Calculate the Total Number of Transactions per Day
from pyspark.sql.functions import count

transactions_per_day = Ecommerce_df.groupBy("transaction_date").agg(count("transaction_id").alias("total_transactions"))
transactions_per_day.show()


+----------------+------------------+
|transaction_date|total_transactions|
+----------------+------------------+
|      2023-08-06|                 2|
|      2023-08-04|                 1|
|      2023-08-05|                 2|
|      2023-08-01|                 2|
|      2023-08-02|                 1|
|      2023-08-03|                 2|
+----------------+------------------+



In [None]:
# 8. Find the Customer Who Spent the Most Money
total_spent_per_customer = Ecommerce_df.groupBy("customer_id").agg(sum("revenue").alias("total_spent"))
highest_spending_customer = total_spent_per_customer.orderBy("total_spent", ascending = False).limit(1)
highest_spending_customer.show()


+-----------+-----------+
|customer_id|total_spent|
+-----------+-----------+
|        102|     1870.0|
+-----------+-----------+



In [None]:
#9. Calculate the Average Discount Given per Product Category
avgdiscount_per_category = Ecommerce_df.groupBy("category").agg(avg("discount_percentage").alias("avg_discount"))
avgdiscount_per_category.show()


+--------------+------------+
|      category|avg_discount|
+--------------+------------+
|       Fashion|        10.0|
|   Electronics|        8.75|
|         Books|         0.0|
|Home Appliance|        15.0|
+--------------+------------+



In [None]:
# 10 . Create a New Column for Final Price After Discount
Ecommerce_df = Ecommerce_df.withColumn("final_price", col("price") * (1 - col("discount_percentage") / 100))

Ecommerce_df.show()


+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-----------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|revenue|final_price|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-----------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|  900.0|      900.0|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01| 1330.0|      665.0|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|  120.0|       40.0|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|  127.5|      127.5|
|             5|        101|  Headphones|   Electronics|  100|       2|                 10|      

# **Banking Transactions**

In [None]:
spark = SparkSession.builder.appName("BankingTransactionAnalysis").getOrCreate()
csv_file_path = "/content/sample_data/BankingTransaction.csv"

BT_df = spark.read.format("csv").option("header", "true").load(csv_file_path)
BT_df.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             6|        205|      Withdrawal|   500|      2023-09-03|
|             7|        202|         Deposit|  2500|      2023-09-04|
|             8|        206|      Withdrawal|   700|      2023-09-04|
|             9|        203|         Deposit|  4000|      2023-09-05|
|            10|        204|      Withdrawal|  3000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [None]:
# 1.Calculate the Total Deposit and Withdrawal Amounts
from pyspark.sql.functions import sum

total_amounts = BT_df.groupBy("transaction_type").agg(sum("amount").alias("total_amount"))
total_amounts.show()


+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|     24500.0|
|      Withdrawal|      7700.0|
+----------------+------------+



In [None]:
# 2.Filter Transactions Greater Than $3,000

greater_transactions = BT_df.filter(col("amount") > 3000)
greater_transactions.show()


+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [None]:
# 3. Find the Largest Deposit Made
largest_deposit = BT_df.filter(col("transaction_type") == "Deposit").orderBy(col("amount")).limit(1)
largest_deposit.show()


+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             5|        204|         Deposit| 10000|      2023-09-03|
+--------------+-----------+----------------+------+----------------+



In [None]:
# 4. Calculate the Average Transaction Amount for Each Transaction Type
avg_amount_pertype = BT_df.groupBy("transaction_type").agg(avg("amount").alias("average_amount"))
avg_amount_pertype.show()


+----------------+--------------+
|transaction_type|average_amount|
+----------------+--------------+
|         Deposit|        4900.0|
|      Withdrawal|        1540.0|
+----------------+--------------+



In [None]:
# 5. Find Customers Who Made Both Deposits and Withdrawals

deposit_customers = BT_df.filter(col("transaction_type") == "Deposit").select("customer_id").distinct()
withdrawal_customers = BT_df.filter(col("transaction_type") == "Withdrawal").select("customer_id").distinct()
both_deposit_withdrawal = deposit_customers.intersect(withdrawal_customers)
both_deposit_withdrawal.show()


+-----------+
|customer_id|
+-----------+
|        202|
|        201|
|        204|
+-----------+



In [None]:
# 6. Calculate the Total Amount of Transactions per Day
totalamount_perday = BT_df.groupBy("transaction_date").agg(sum("amount").alias("total_amount"))
totalamount_perday.show()



+----------------+------------+
|transaction_date|total_amount|
+----------------+------------+
|      2023-09-01|      7000.0|
|      2023-09-02|      4500.0|
|      2023-09-05|      7000.0|
|      2023-09-04|      3200.0|
|      2023-09-03|     10500.0|
+----------------+------------+



In [None]:
# 7. Find the Customer with the Highest Total Withdrawal
total_withdrawals_per_customer = BT_df.filter(col("transaction_type") == "Withdrawal").groupBy("customer_id").agg(sum("amount").alias("total_withdrawn")) \
                                           .orderBy(col("total_withdrawn").desc()).limit(1)
total_withdrawals_per_customer.show()


+-----------+---------------+
|customer_id|total_withdrawn|
+-----------+---------------+
|        204|         3000.0|
+-----------+---------------+



In [None]:
# 8. Calculate the Number of Transactions for Each Customer

transactions_per_customer = BT_df.groupBy("customer_id").agg(count("transaction_id").alias("transaction_count"))
transactions_per_customer.show()


+-----------+-----------------+
|customer_id|transaction_count|
+-----------+-----------------+
|        205|                1|
|        203|                2|
|        202|                2|
|        206|                1|
|        201|                2|
|        204|                2|
+-----------+-----------------+



In [None]:
# 9. Find All Transactions That Occurred on the Same Day as a Withdrawal Greater Than $1,000
withdrawals_above1000 = BT_df.filter((col("transaction_type") == "Withdrawal") & (col("amount") > 1000)).select("transaction_date").distinct()

transactions_on_sameday = BT_df.join(withdrawals_above1000, "transaction_date")
transactions_on_sameday.show()



+----------------+--------------+-----------+----------------+------+
|transaction_date|transaction_id|customer_id|transaction_type|amount|
+----------------+--------------+-----------+----------------+------+
|      2023-09-01|             1|        201|         Deposit|  5000|
|      2023-09-01|             2|        202|      Withdrawal|  2000|
|      2023-09-02|             3|        203|         Deposit|  3000|
|      2023-09-02|             4|        201|      Withdrawal|  1500|
|      2023-09-05|             9|        203|         Deposit|  4000|
|      2023-09-05|            10|        204|      Withdrawal|  3000|
+----------------+--------------+-----------+----------------+------+



In [None]:
# 10. Create a New Column to Classify Transactions as "High" or "Low" Value
from pyspark.sql.functions import when
BT_df = BT_df.withColumn("transaction_value", when(col("amount") > 5000, "High").otherwise("Low"))
BT_df.show()


+--------------+-----------+----------------+------+----------------+-----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|transaction_value|
+--------------+-----------+----------------+------+----------------+-----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|              Low|
|             2|        202|      Withdrawal|  2000|      2023-09-01|              Low|
|             3|        203|         Deposit|  3000|      2023-09-02|              Low|
|             4|        201|      Withdrawal|  1500|      2023-09-02|              Low|
|             5|        204|         Deposit| 10000|      2023-09-03|             High|
|             6|        205|      Withdrawal|   500|      2023-09-03|              Low|
|             7|        202|         Deposit|  2500|      2023-09-04|              Low|
|             8|        206|      Withdrawal|   700|      2023-09-04|              Low|
|             9|        203|    

# **Health & Fitness Tracker Data**

In [None]:
spark = SparkSession.builder.appName("HealthandFitnessAnalysis").getOrCreate()
csv_file_path = "/content/sample_data/HealthandFitnessTracker.csv"

Health_Fitness_df = spark.read.format("csv").option("header", "true").load(csv_file_path)

Health_Fitness_df.show()

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|             7|      Cardio|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|
|      3|2023-09-01|15000|            650|             8|        Yoga|
|      1|2023-09-02|10000|            450|             6|      Cardio|
|      2|2023-09-02| 9500|            500|             7|      Cardio|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|             8|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|             7|      Cardio|
+-------+----------+-----+---------------+--------------+------------+



In [None]:
# 1. Find the Total Steps Taken by Each User
total_steps_per_user = Health_Fitness_df.groupBy("user_id").agg(sum("steps").alias("total_steps"))
total_steps_per_user.show()


+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      3|    45000.0|
|      1|    35000.0|
|      2|    29500.0|
+-------+-----------+



In [None]:
# 2. Filter Days with More Than 10,000 Steps
more_steps_days = Health_Fitness_df.filter(col("steps") > 10000)
more_steps_days.show()


+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|             7|      Cardio|
|      3|2023-09-01|15000|            650|             8|        Yoga|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|             8|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|             7|      Cardio|
+-------+----------+-----+---------------+--------------+------------+



In [None]:
# 3. Calculate the Average Calories Burned by Workout Type
avg_calories_burned = Health_Fitness_df.groupBy("workout_type").agg(avg("calories_burned").alias("average_calories"))
avg_calories_burned.show()


+------------+-----------------+
|workout_type| average_calories|
+------------+-----------------+
|    Strength|            500.0|
|        Yoga|573.3333333333334|
|      Cardio|            537.5|
+------------+-----------------+



In [None]:
# 4. Identify the Day with the Most Steps for Each User
from pyspark.sql.functions import max
from pyspark.sql import Window
most_steps_day = Health_Fitness_df.withColumn("max_steps",max("steps").over(Window.partitionBy("user_id"))) .filter(col("steps") == col("max_steps"))

most_steps_day.show()




+-------+----------+-----+---------------+--------------+------------+---------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|max_steps|
+-------+----------+-----+---------------+--------------+------------+---------+
|      1|2023-09-03|13000|            550|             8|        Yoga|    13000|
|      2|2023-09-02| 9500|            500|             7|      Cardio|     9500|
|      3|2023-09-03|16000|            700|             7|      Cardio|    16000|
+-------+----------+-----+---------------+--------------+------------+---------+



In [None]:
# 5. Find Users Who Burned More Than 600 Calories on Any Day
high_calorie_burners = Health_Fitness_df.filter(col("calories_burned") > 600).select("user_id").distinct()
high_calorie_burners.show()


+-------+
|user_id|
+-------+
|      3|
+-------+



In [None]:
# 6. Calculate the Average Hours of Sleep per User
from pyspark.sql.functions import round
avg_sleep_peruser = Health_Fitness_df.groupBy("user_id").agg(round(avg("hours_of_sleep") , 2).alias("average_sleep"))
avg_sleep_peruser.show()


+-------+-------------+
|user_id|average_sleep|
+-------+-------------+
|      3|          7.5|
|      1|          7.0|
|      2|         6.67|
+-------+-------------+



In [None]:
# 7. Find the Total Calories Burned per Day

total_calories_perday = Health_Fitness_df.groupBy("date").agg(sum("calories_burned").alias("total_calories"))
total_calories_perday.show()



+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-09-01|        1550.0|
|2023-09-02|        1550.0|
|2023-09-03|        1770.0|
+----------+--------------+



In [None]:
# 8. Identify Users Who Did Different Types of Workouts
from pyspark.sql.functions import countDistinct
multiple_workouts = Health_Fitness_df.groupBy("user_id").agg(countDistinct("workout_type").alias("distinct_workouts")).filter(col("distinct_workouts") > 1)
multiple_workouts.show()


+-------+-----------------+
|user_id|distinct_workouts|
+-------+-----------------+
|      3|                3|
|      1|                2|
|      2|                3|
+-------+-----------------+



In [None]:
# 9. Calculate the Total Number of Workouts per User
workouts_peruser = Health_Fitness_df.groupBy("user_id").agg(count("workout_type").alias("total_workouts"))
workouts_peruser.show()


+-------+--------------+
|user_id|total_workouts|
+-------+--------------+
|      3|             3|
|      1|             3|
|      2|             3|
+-------+--------------+



In [None]:
# 10. Create a New Column for "Active" Days
from pyspark.sql.functions import when
Health_Fitness_df = Health_Fitness_df.withColumn("active_day", when(col("steps") > 10000, "Active").otherwise("Inactive"))
Health_Fitness_df.show()


+-------+----------+-----+---------------+--------------+------------+----------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|active_day|
+-------+----------+-----+---------------+--------------+------------+----------+
|      1|2023-09-01|12000|            500|             7|      Cardio|    Active|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|  Inactive|
|      3|2023-09-01|15000|            650|             8|        Yoga|    Active|
|      1|2023-09-02|10000|            450|             6|      Cardio|  Inactive|
|      2|2023-09-02| 9500|            500|             7|      Cardio|  Inactive|
|      3|2023-09-02|14000|            600|           7.5|    Strength|    Active|
|      1|2023-09-03|13000|            550|             8|        Yoga|    Active|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|    Active|
|      3|2023-09-03|16000|            700|             7|      Cardio|    Active|
+-------+-------

# **Music Streaming Data**

In [None]:
spark = SparkSession.builder.appName("MusicStreamingAnalysis").getOrCreate()
csv_file_path = "/content/sample_data/MusicStreaming.csv"
Music_df = spark.read.format("csv").option("header", "true").load(csv_file_path)

Music_df.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|
|      3|      New Rules|  Dua Lipa|             230|2

In [None]:
# 1. Calculate the Total Listening Time for Each User

total_listening_time = Music_df.groupBy("user_id").agg(sum("duration_seconds").alias("total_time_seconds"))

total_listening_time.show()


+-------+------------------+
|user_id|total_time_seconds|
+-------+------------------+
|      3|             610.0|
|      1|             630.0|
|      2|             680.0|
+-------+------------------+



In [None]:
# Filter Songs Streamed for More Than 200 Seconds
longest_streaming = Music_df.filter(col("duration_seconds") > 200)
longest_streaming.show()


+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+



In [None]:
# 3. Find the Most Popular Artist (by Total Streams)
popular_artist = Music_df.groupBy("artist").agg(count("song_title").alias("total_streams")).orderBy(col("total_streams").desc()).limit(1)

popular_artist.show()



+--------+-------------+
|  artist|total_streams|
+--------+-------------+
|Dua Lipa|            3|
+--------+-------------+



In [None]:
# 4. Identify the Song with the Longest Duration
longest_song = Music_df.orderBy(col("duration_seconds").desc()).limit(1)

longest_song.show()


+-------+----------+----------+----------------+-------------------+-----------+
|user_id|song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+----------+----------+----------------+-------------------+-----------+
|      2|   Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
+-------+----------+----------+----------------+-------------------+-----------+



In [None]:
# 5. Calculate the Average Song Duration by Artist
avgduration_by_artist = Music_df.groupBy("artist").agg(round(avg("duration_seconds"), 2).alias("average_duration"))

avgduration_by_artist.show()


+----------+----------------+
|    artist|average_duration|
+----------+----------------+
|  Dua Lipa|          203.33|
|Ed Sheeran|          226.67|
|The Weeknd|           210.0|
+----------+----------------+



In [None]:
# 6. Find the Top 3 Most Streamed Songs per User

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_spec = Window.partitionBy("user_id").orderBy(col("duration_seconds").desc())

top_3_songs_per_user = Music_df.withColumn("rank", row_number().over(window_spec)).filter(col("rank") <= 3).drop("rank")

top_3_songs_per_user.show()


+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|
|      3|     Levitating|  Dua Lipa|             180|2

In [None]:
# 7. Calculate the Total Number of Streams per Day
from pyspark.sql.functions import to_date

streams_per_day = Music_df.withColumn("date", to_date(col("streaming_time"))).groupBy("date").agg(count("song_title").alias("total_streams"))

streams_per_day.show()



+----------+-------------+
|      date|total_streams|
+----------+-------------+
|2023-09-01|            5|
|2023-09-02|            4|
+----------+-------------+



In [None]:
# 8. Identify Users Who Streamed Songs from More Than One Artist
distinct_artists_per_user = Music_df.groupBy("user_id").agg(countDistinct("artist").alias("distinct_artists")).filter(col("distinct_artists") > 1)

distinct_artists_per_user.show()


+-------+----------------+
|user_id|distinct_artists|
+-------+----------------+
+-------+----------------+



In [None]:
# 9. Calculate the Total Streams for Each Location
streams_per_location = Music_df.groupBy("location") .agg(count("song_title").alias("total_streams"))

streams_per_location.show()


+-----------+-------------+
|   location|total_streams|
+-----------+-------------+
|Los Angeles|            3|
|     London|            3|
|   New York|            3|
+-----------+-------------+



In [None]:
# 10. Create a New Column to Classify Long and Short Songs

Music_df = Music_df.withColumn("song_length",when(col("duration_seconds") > 200, "Long").otherwise("Short"))

Music_df.show()



+-------+---------------+----------+----------------+-------------------+-----------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|song_length|
+-------+---------------+----------+----------------+-------------------+-----------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|      Short|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|       Long|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|      Short|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|       Long|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|       Long|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|      Short|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|       Long|
|      2|    Galway 

# **Retail Store Sales Data**

In [70]:
spark = SparkSession.builder.appName("RetailStoreSalesAnalysis").getOrCreate()
csv_file_path = "/content/sample_data/RetailStore.csv"
RS_Sales_df = spark.read.format("csv").option("header", "true").load(csv_file_path)

RS_Sales_df.show()

+--------------+------------+-----------+------+--------+----------+
|transaction_id|product_name|   category| price|quantity|sales_date|
+--------------+------------+-----------+------+--------+----------+
|             1|       Apple|  Groceries|  0.50|      10|2023-09-01|
|             2|     T-shirt|   Clothing| 15.00|       2|2023-09-01|
|             3|    Notebook| Stationery|  2.00|       5|2023-09-02|
|             4|      Banana|  Groceries|  0.30|      12|2023-09-02|
|             5|      Laptop|Electronics|800.00|       1|2023-09-03|
|             6|       Pants|   Clothing| 25.00|       3|2023-09-03|
|             7|  Headphones|Electronics|100.00|       2|2023-09-04|
|             8|         Pen| Stationery|  1.00|      10|2023-09-04|
|             9|      Orange|  Groceries|  0.60|       8|2023-09-05|
|            10|    Sneakers|   Clothing| 50.00|       1|2023-09-05|
+--------------+------------+-----------+------+--------+----------+



In [72]:
# 1. Calculate the Total Revenue per Category


total_revenue_percategory = RS_Sales_df.withColumn("total_revenue", col("price") * col("quantity")).groupBy("category").agg(round(sum("total_revenue"), 2).alias("total_revenue"))
total_revenue_percategory.show()


+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
| Stationery|         20.0|
|  Groceries|         13.4|
|Electronics|       1000.0|
|   Clothing|        155.0|
+-----------+-------------+



In [73]:
# 2. Filter Transactions Where the Total Sales Amount is Greater Than $100

high_transactions = RS_Sales_df.withColumn("total_sales_amount", col("price") * col("quantity")).filter(col("total_sales_amount") > 100)
high_transactions.show()


+--------------+------------+-----------+------+--------+----------+------------------+
|transaction_id|product_name|   category| price|quantity|sales_date|total_sales_amount|
+--------------+------------+-----------+------+--------+----------+------------------+
|             5|      Laptop|Electronics|800.00|       1|2023-09-03|             800.0|
|             7|  Headphones|Electronics|100.00|       2|2023-09-04|             200.0|
+--------------+------------+-----------+------+--------+----------+------------------+



In [74]:
# 3. Find the Most Sold Product

most_sold_product = RS_Sales_df.groupBy("product_name").agg(sum(col("quantity")).alias("total_quantity_sold")).orderBy(col("total_quantity_sold").desc()).limit(1)
most_sold_product.show()


+------------+-------------------+
|product_name|total_quantity_sold|
+------------+-------------------+
|      Banana|               12.0|
+------------+-------------------+



In [76]:
# 4. Calculate the Average Price per Product Category
from pyspark.sql.functions import col, avg

avgprice_per_category = RS_Sales_df.groupBy("category").agg(round(avg("price"), 2).alias("average_price"))
avgprice_per_category.show()


+-----------+-------------+
|   category|average_price|
+-----------+-------------+
| Stationery|          1.5|
|  Groceries|         0.47|
|Electronics|        450.0|
|   Clothing|         30.0|
+-----------+-------------+



In [77]:
# 5. Find the Top 3 Highest Grossing Products

top_3_products = RS_Sales_df.withColumn("revenue", col("price") * col("quantity")).groupBy("product_name") \
                                      .agg(sum("revenue").alias("total_revenue")).orderBy(col("total_revenue").desc()).limit(3)
top_3_products.show()


+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|      Laptop|        800.0|
|  Headphones|        200.0|
|       Pants|         75.0|
+------------+-------------+



In [78]:
# 6. Calculate the Total Number of Items Sold per Day

items_sold_per_day = RS_Sales_df.groupBy("sales_date").agg(sum("quantity").alias("total_items_sold"))
items_sold_per_day.show()


+----------+----------------+
|sales_date|total_items_sold|
+----------+----------------+
|2023-09-01|            12.0|
|2023-09-02|            17.0|
|2023-09-05|             9.0|
|2023-09-04|            12.0|
|2023-09-03|             4.0|
+----------+----------------+



In [79]:
# 7. Identify the Product with the Lowest Price in Each Category
window_spec = Window.partitionBy("category").orderBy(col("price"))
lowest_price_product = RS_Sales_df.withColumn("rank", row_number().over(window_spec)).filter(col("rank") == 1).drop("rank")

lowest_price_product.show()


+--------------+------------+-----------+------+--------+----------+
|transaction_id|product_name|   category| price|quantity|sales_date|
+--------------+------------+-----------+------+--------+----------+
|             2|     T-shirt|   Clothing| 15.00|       2|2023-09-01|
|             7|  Headphones|Electronics|100.00|       2|2023-09-04|
|             4|      Banana|  Groceries|  0.30|      12|2023-09-02|
|             8|         Pen| Stationery|  1.00|      10|2023-09-04|
+--------------+------------+-----------+------+--------+----------+



In [81]:
# 8. Calculate the Total Revenue for Each Product

revenue_per_product = RS_Sales_df.withColumn("total_revenue", col("price") * col("quantity")).groupBy("product_name") .agg(round(sum("total_revenue"), 2).alias("total_revenue"))
revenue_per_product.show()


+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|     T-shirt|         30.0|
|    Sneakers|         50.0|
|      Orange|          4.8|
|      Banana|          3.6|
|         Pen|         10.0|
|       Pants|         75.0|
|      Laptop|        800.0|
|    Notebook|         10.0|
|       Apple|          5.0|
|  Headphones|        200.0|
+------------+-------------+



In [82]:
# 9. Find the Total Sales per Day for Each Category

sales_per_day_bycategory = RS_Sales_df.withColumn("total_sales_amount", col("price") * col("quantity")).groupBy("sales_date", "category") \
                                         .agg(round(sum("total_sales_amount"), 2).alias("total_sales"))
sales_per_day_bycategory.show()


+----------+-----------+-----------+
|sales_date|   category|total_sales|
+----------+-----------+-----------+
|2023-09-01|  Groceries|        5.0|
|2023-09-02|  Groceries|        3.6|
|2023-09-01|   Clothing|       30.0|
|2023-09-05|  Groceries|        4.8|
|2023-09-04| Stationery|       10.0|
|2023-09-02| Stationery|       10.0|
|2023-09-03|Electronics|      800.0|
|2023-09-04|Electronics|      200.0|
|2023-09-03|   Clothing|       75.0|
|2023-09-05|   Clothing|       50.0|
+----------+-----------+-----------+



In [83]:
# 10. Create a New Column for Discounted Price

RS_Sales_df = RS_Sales_df.withColumn("discounted_price", col("price") * 0.9)
RS_Sales_df.show()


+--------------+------------+-----------+------+--------+----------+----------------+
|transaction_id|product_name|   category| price|quantity|sales_date|discounted_price|
+--------------+------------+-----------+------+--------+----------+----------------+
|             1|       Apple|  Groceries|  0.50|      10|2023-09-01|            0.45|
|             2|     T-shirt|   Clothing| 15.00|       2|2023-09-01|            13.5|
|             3|    Notebook| Stationery|  2.00|       5|2023-09-02|             1.8|
|             4|      Banana|  Groceries|  0.30|      12|2023-09-02|            0.27|
|             5|      Laptop|Electronics|800.00|       1|2023-09-03|           720.0|
|             6|       Pants|   Clothing| 25.00|       3|2023-09-03|            22.5|
|             7|  Headphones|Electronics|100.00|       2|2023-09-04|            90.0|
|             8|         Pen| Stationery|  1.00|      10|2023-09-04|             0.9|
|             9|      Orange|  Groceries|  0.60|      