<a href="https://colab.research.google.com/github/urmilapol/urmilapolprojects/blob/master/pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: [Apache Spark architecture (Chaos Genius blog)](https://www.chaosgenius.io/blog/apache-spark-architecture/)


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import upper

# Start (or reuse) the Spark session used to build the DataFrame below.
spark = (
    SparkSession.builder
    .appName("DataTransformation")
    .getOrCreate()
)

# Toy catalogue: one tuple per course, in the same order as `columns`.
columns = ["CourseName", "fee", "discount"]
data = [
    ("Java", 4000, 5),
    ("Python", 4600, 10),
    ("Scala", 4100, 15),
]
df = spark.createDataFrame(data, columns)
df.show(truncate=False)


+----------+----+--------+
|CourseName|fee |discount|
+----------+----+--------+
|Java      |4000|5       |
|Python    |4600|10      |
|Scala     |4100|15      |
+----------+----+--------+



In [2]:
def to_upper(df):
    """Return ``df`` with the ``CourseName`` column replaced by its upper-cased form."""
    upper_name = upper(df.CourseName)
    return df.withColumn("CourseName", upper_name)

def reduce_price(df, amount):
    """Return ``df`` with a ``new_fee`` column equal to ``fee`` minus ``amount``."""
    lowered_fee = df.fee - amount
    return df.withColumn("new_fee", lowered_fee)

def apply_discount(df, fee_col: str = "new_fee"):
    """Add a ``discounted_fee`` column: the fee reduced by the row's percentage discount.

    Args:
        df: DataFrame with a ``discount`` column and the column named by
            ``fee_col``. ``discount`` is treated as a percentage (it is
            divided by 100).
        fee_col: Name of the column to discount. Defaults to ``"new_fee"``,
            the column produced by ``reduce_price``, so existing
            ``df.transform(apply_discount)`` calls behave as before.

    Returns:
        The result of ``df.withColumn("discounted_fee", ...)``.
    """
    # Bracket lookup takes the column name as data, so any fee column can be
    # discounted instead of only the hard-coded ``new_fee``.
    discount_factor = 1 - df["discount"] / 100
    return df.withColumn("discounted_fee", df[fee_col] * discount_factor)

# Run the three steps in order; apply_discount reads the new_fee column that reduce_price adds.
result = (
    df.transform(to_upper)
    .transform(reduce_price, 1000)
    .transform(apply_discount)
)
result.select("CourseName", "discounted_fee").show()


+----------+--------------+
|CourseName|discounted_fee|
+----------+--------------+
|      JAVA|        2850.0|
|    PYTHON|        3240.0|
|     SCALA|        2635.0|
+----------+--------------+

