In [6]:

# Import the SparkSession entry point and the schema type classes

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


# Obtain the SparkSession used by the rest of the notebook.
# getOrCreate() hands back an already-active session when one exists,
# otherwise it starts a new one under the given application name.
spark = (
    SparkSession.builder
    .appName("PySparkDemo")
    .getOrCreate()
)

In [7]:
# Explicit schema for the demo DataFrame: two string columns and two
# integer columns, every one declared nullable (third StructField argument).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Gender", StringType()),
        ("Salary", IntegerType()),
    )
])




In [8]:
# Sample rows, one tuple per person, ordered to match the schema columns:
# (Name, Age, Gender, Salary).
data = [
    ("Alice", 28, "Female", 60000),
    ("Bob", 35, "Male", 75000),
    ("Charlie", 22, "Male", 50000),
    ("Diana", 31, "Female", 80000),
]

# Materialise the rows as a Spark DataFrame using the explicit schema.
df = spark.createDataFrame(data, schema=schema)

In [9]:
# Display the unmodified input rows.
print("Sample data:")
df.show()

# Keep only people older than 25 ...
filtered_data = df.where(df.Age > 25)

# ... then, per gender, compute the average salary and the maximum age.
# The dict form of agg() names the result columns "avg(Salary)" and "max(Age)".
grouped_data = (
    filtered_data
    .groupBy("Gender")
    .agg({"Salary": "avg", "Age": "max"})
)

# Display the aggregated result.
print("Transformed data:")
grouped_data.show()

Sample data:


                                                                                

+-------+---+------+------+
|   Name|Age|Gender|Salary|
+-------+---+------+------+
|  Alice| 28|Female| 60000|
|    Bob| 35|  Male| 75000|
|Charlie| 22|  Male| 50000|
|  Diana| 31|Female| 80000|
+-------+---+------+------+

Transformed data:




+------+-----------+--------+
|Gender|avg(Salary)|max(Age)|
+------+-----------+--------+
|Female|    70000.0|      31|
|  Male|    75000.0|      35|
+------+-----------+--------+



                                                                                