In [1]:
from pyspark.sql import SparkSession
import pandas as pd

In [4]:
from pyspark.sql.functions import sum,col,greatest,regexp_replace,expr

In [3]:
spark = SparkSession.builder.appName("skill").getOrCreate()

In [33]:
# Top-N Elements Using orderBy and limit
# Dataset: Sales data containing sale_id, product_name, and sales. (sale_id will be unique and product_name might be repeated for different sale_id’s)
# Task: Find the top 5 products with the highest sales.
# Visualization: Create a bar chart in Tableau showing the top 5 products and their sales.
data = [
    (1, "Titanium Cartridge", 2500),
    (2, "Hand Grenade", 1800),
    (3, "AK47", 1200),
    (4, "Timer", 900),
    (5, "RDX", 800),
    (6, "Battery", 600),
    (7, "AK47", 400),
    (8, "Insulated Wire", 1500),
    (9, "RDX", 700),
    (10, "Glock", 1300),
    (11, "Kevlar", 300),
    (12, "Mine", 1000),
    (13, "Helmet", 400),
    (14, "Gloves", 300),
    (15, "Paper", 1300),
    (16, "RDX", 900),
    (17, "Sulphur", 1000),
    (18, "Mine", 500)
]

cols = ["ssale_id", "product_name","sales"]
data_df = spark.createDataFrame(data,cols)
result = (
    data_df.groupBy("product_name")
    .sum("sales")
    .withColumnRenamed("sum(sales)","total_sales")
    .orderBy(col("total_sales").desc())
    .limit(5)
)

In [34]:
result.show()

+------------------+-----------+
|      product_name|total_sales|
+------------------+-----------+
|Titanium Cartridge|       2500|
|               RDX|       2400|
|      Hand Grenade|       1800|
|              AK47|       1600|
|              Mine|       1500|
+------------------+-----------+



In [41]:
# Using greatest to Find Maximum Value Across Columns
# Dataset: Employee performance with columns employee_id, quarter1, quarter2, quarter3, quarter4.
# Task: Add a column showing the maximum quarterly score for each employee using the greatest function.
# Visualization: Use Tableau to show the maximum score per employee in a heatmap.
import random
data = [(i, random.randint(50, 100), random.randint(50, 100), random.randint(50, 100), random.randint(50, 100)) for i in range(1, 21)]
columns = ["employee_id", "quarter1", "quarter2", "quarter3", "quarter4"]
data_df = spark.createDataFrame(data,columns)
result = data_df.withColumn("max_score",greatest("quarter1","quarter2","quarter3","quarter4"))
result.show()

+-----------+--------+--------+--------+--------+---------+
|employee_id|quarter1|quarter2|quarter3|quarter4|max_score|
+-----------+--------+--------+--------+--------+---------+
|          1|      98|      64|      98|      73|       98|
|          2|      64|      80|      66|      53|       80|
|          3|     100|      76|      85|      80|      100|
|          4|      98|      52|      65|      78|       98|
|          5|      80|      65|      55|      53|       80|
|          6|      67|      90|      76|      87|       90|
|          7|      76|      92|      84|      91|       92|
|          8|      66|      96|      90|      83|       96|
|          9|      65|      78|      50|      65|       78|
|         10|      94|      50|      92|      84|       94|
|         11|      87|      95|      66|      59|       95|
|         12|      78|      68|      69|      70|       78|
|         13|      89|      53|      99|      66|       99|
|         14|      62|      64|      84|

In [6]:
import random

In [10]:
data = [(i, random.randint(50, 500), f"{random.uniform(5, 30):.2f}%") for i in range(1, 21)]
columns = ["product_id", "price", "discount_percent"]

data_df = spark.createDataFrame(data,columns)

data_df = data_df.withColumn("discount_percent",regexp_replace("discount_percent","%",""))

result = data_df.withColumn("final_price",expr("price - (price*discount_percent/100)"))

result.show()

+----------+-----+----------------+------------------+
|product_id|price|discount_percent|       final_price|
+----------+-----+----------------+------------------+
|         1|  202|           22.25|           157.055|
|         2|  283|           17.30|           234.041|
|         3|  212|           16.48|          177.0624|
|         4|  402|            8.85|           366.423|
|         5|  330|           16.84|           274.428|
|         6|   66|           13.19|           57.2946|
|         7|  420|           14.77|           357.966|
|         8|  169|            6.67|          157.7277|
|         9|  426|           28.56|          304.3344|
|        10|  121|           16.64|          100.8656|
|        11|   51|            7.13|           47.3637|
|        12|  193|           16.42|161.30939999999998|
|        13|  240|           26.50|             176.4|
|        14|  103|           13.44|           89.1568|
|        15|  127|           23.42| 97.25659999999999|
|        1

In [11]:
data = [
    (f"Product_{random.randint(1, 10)}", f"Region_{random.randint(1, 5)}", i, random.randint(1, 5), f"Comment_{i}")
    for i in range(1, 101)
]
columns = ["product", "region", "feedback_id", "rating", "comments"]


data_df = spark.createDataFrame(data, columns)
result = data_df.filter(col("rating").isin(4,5))
scalar = result.count()
result.show()
print(scalar)

+----------+--------+-----------+------+----------+
|   product|  region|feedback_id|rating|  comments|
+----------+--------+-----------+------+----------+
| Product_5|Region_3|          2|     4| Comment_2|
| Product_7|Region_3|          5|     5| Comment_5|
|Product_10|Region_5|          7|     5| Comment_7|
| Product_1|Region_4|          8|     5| Comment_8|
| Product_7|Region_1|         11|     4|Comment_11|
| Product_6|Region_5|         12|     4|Comment_12|
| Product_7|Region_2|         13|     4|Comment_13|
| Product_2|Region_2|         14|     5|Comment_14|
| Product_4|Region_1|         18|     5|Comment_18|
| Product_9|Region_3|         22|     5|Comment_22|
| Product_2|Region_3|         23|     5|Comment_23|
| Product_1|Region_5|         31|     4|Comment_31|
| Product_6|Region_2|         32|     5|Comment_32|
| Product_5|Region_3|         34|     4|Comment_34|
| Product_2|Region_2|         37|     4|Comment_37|
|Product_10|Region_2|         38|     5|Comment_38|
| Product_8|

In [12]:
data = [
    (f"Product_{random.randint(1, 5)}", (datetime(2024, 1, 1) + timedelta(days=random.randint(0, 30))).strftime('%Y-%m-%d'), random.randint(100, 500))
    for _ in range(100)
]
columns = ["product_id", "date", "sales"]
data_df = spark.createDataFrame(data, columns)

window_spec = Window.partitionBy("product_id").orderBy("date").rowsBetween(Window.unboundedPreceding,Window.currentRow)

result = data_df.withColumn("running_total", sum("sales").over(window_spec))


result.show()

NameError: name 'datetime' is not defined

In [13]:
9

9

In [14]:
#10
data = [
    (1, "2024-12-01", 150.50),
    (2, "2024-12-02", 50.00),
    (1, "2024-12-03", 200.75),
    (3, "2024-12-04", 300.20),
    (2, "2024-12-05", 120.00),
    (3, "2024-12-06", 95.00)
]


columns = ["customer_id", "order_date", "amount"]


data_df = spark.createDataFrame(data, columns)


data_df.createOrReplaceTempView("customer_orders")

query = """
SELECT customer_id , AVG(amount) from  customer_orders group by customer_id having AVG(amount)>100
"""



In [16]:
# 11
# result = data_df.groupBy("store_id").agg(sum("sales").alias("total_sales"))

In [None]:
#12
data = [
    ("HR", "Manager", 1),
    ("HR", "Executive", 2),
    ("IT", "Manager", 3),
    ("IT", "Developer", 4),
    ("Finance", "Analyst", 5),
    ("Finance", "Manager", 6),
    ("IT", "Developer", 7),
    ("HR", "Executive", 8)
]


columns = ["department", "designation", "employee_id"]


data_df = spark.createDataFrame(data, columns)


result = data_df.groupBy("department", "designation").agg(count("employee_id").alias("employee_count"))


result.show()

In [None]:
#13
data = [
    (1, "North", 150.50),
    (2, "South", 200.75),
    (3, "North", 100.00),
    (4, "East", 250.30),
    (5, "West", 300.00),
    (6, "South", 180.20)
]


columns = ["customer_id", "region", "purchase_amount"]


data_df = spark.createDataFrame(data, columns)


result = data_df.groupBy("region").agg(avg("purchase_amount").alias("average_purchase_amount"))


result.show()

In [17]:
# 14

data_df = spark.createDataFrame(data, columns)


result = data_df.groupBy("category").agg(
    sum("sales").alias("total_sales"),
    avg("sales").alias("average_sales")
)

