In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

In [3]:
# Obtain the notebook's SparkSession: getOrCreate() hands back an already
# active session if there is one, otherwise it builds a new one named "hello".
spark = (
    SparkSession.builder
    .appName("hello")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/10 18:47:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Sample records as plain Python dicts (one dict per person); they are turned
# into a Spark DataFrame in a later cell. Keys are inserted as name, age, city.
data = [
    dict(name=person, age=years, city=town)
    for person, years, town in (
        ("Alice", 25, "New York"),
        ("Bob", 17, "San Francisco"),
        ("Charlie", 35, "Los Angeles"),
        ("David", 15, "Chicago"),
        ("Eve", 29, "Miami"),
    )
]


In [5]:
# Build the base DataFrame from the list of dict records. No explicit schema
# is passed, so column names and types are derived from the records.
df = spark.createDataFrame(data=data)

# Print a heading and then the unfiltered rows
print("Original Data:")
df.show()

Original Data:
+---+-------------+-------+
|age|         city|   name|
+---+-------------+-------+
| 25|     New York|  Alice|
| 17|San Francisco|    Bob|
| 35|  Los Angeles|Charlie|
| 15|      Chicago|  David|
| 29|        Miami|    Eve|
+---+-------------+-------+



In [6]:
# Keep only the rows whose age is 18 or above (under-18 rows are dropped).
# `df` itself is left untouched; the result is a new DataFrame.
adults_df = df.where(df["age"] >= 18)

# Print a heading and then the remaining (adult) rows
print("Adults Data:")
adults_df.show()

Adults Data:
+---+-----------+-------+
|age|       city|   name|
+---+-----------+-------+
| 25|   New York|  Alice|
| 35|Los Angeles|Charlie|
| 29|      Miami|    Eve|
+---+-----------+-------+



In [7]:
# Mean of the "age" column over the adult rows. The aggregation produces a
# single row, so take that row and read its first (only) field.
average_age = (
    adults_df
    .agg(avg("age"))
    .head()[0]
)
print(f"Average Age of Adults: {average_age}")

Average Age of Adults: 29.666666666666668


In [8]:
# Reorder the columns to name, age, city and sort the adults from oldest to
# youngest (descending age).
result_df = (
    adults_df
    .select("name", "age", "city")
    .sort("age", ascending=False)
)


In [9]:
# Print a heading, then render result_df (the adult rows with columns
# name, age, city, sorted by age in descending order) as a text table
print("Transformed Data:")
result_df.show()


Transformed Data:
+-------+---+-----------+
|   name|age|       city|
+-------+---+-----------+
|Charlie| 35|Los Angeles|
|    Eve| 29|      Miami|
|  Alice| 25|   New York|
+-------+---+-----------+



In [10]:
# Stop the SparkSession created at the top of the notebook. Run this last:
# the DataFrames above were created from this session.
spark.stop()