In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum, avg

# Initialize SparkSession
spark = (
    SparkSession.builder
    .appName("Basic Aggregations Example")
    .getOrCreate()
)

# Sample data: one (name, department, salary) tuple per employee
data = [
    ("John Doe", "Sales", 1000),
    ("Jane Smith", "Marketing", 1500),
    ("Emily Davis", "Sales", 1200),
    ("Michael Brown", "HR", 1100),
    ("Linda Green", "Sales", 900)
]

# Column names, in the same order as the tuple fields above
columns = ["name", "department", "salary"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Show initial DataFrame
print("Initial DataFrame:")
df.show()

# Perform basic aggregations
# 1 & 2. Calculate total and average salary over the whole DataFrame.
# Both aggregates are requested in a single agg() call and fetched with one
# action, instead of running two separate agg(...).collect() actions.
overall_salary = df.agg(
    _sum("salary").alias("total_salary"),
    avg("salary").alias("average_salary"),
).first()
total_salary = overall_salary["total_salary"]
average_salary = overall_salary["average_salary"]
print(f"Total Salary: {total_salary}")
print(f"Average Salary: {average_salary}")

# 3. Calculate total and average salary by department
print("Total and Average Salary by Department:")
department_aggregations = df.groupBy("department").agg(
    _sum("salary").alias("total_salary"),
    avg("salary").alias("average_salary"),
)

department_aggregations.show()

# The SparkSession is not stopped in this cell; spark.stop() is called in a
# later cell of this notebook.





Initial DataFrame:
+-------------+----------+------+
|         name|department|salary|
+-------------+----------+------+
|     John Doe|     Sales|  1000|
|   Jane Smith| Marketing|  1500|
|  Emily Davis|     Sales|  1200|
|Michael Brown|        HR|  1100|
|  Linda Green|     Sales|   900|
+-------------+----------+------+

Total Salary: 5700
Average Salary: 1140.0
Total and Average Salary by Department:
+----------+------------+------------------+
|department|total_salary|    average_salary|
+----------+------------+------------------+
|     Sales|        3100|1033.3333333333333|
| Marketing|        1500|            1500.0|
|        HR|        1100|            1100.0|
+----------+------------+------------------+



In [4]:
from pyspark.sql import SparkSession

# Obtain a SparkSession through the builder API
spark = (
    SparkSession.builder
    .appName("Write DataFrame to CSV Example")
    .getOrCreate()
)

# Employee records: one (id, name, department, salary) tuple each
data = [
    (1, "John Doe", "Sales", 1000),
    (2, "Jane Smith", "Marketing", 1500),
    (3, "Emily Davis", "Sales", 1200),
    (4, "Michael Brown", "HR", 1100),
]

# Column names, in the same order as the tuple fields
columns = ["id", "name", "department", "salary"]

# Build the DataFrame from the records and column names
df = spark.createDataFrame(data, columns)

# Display what is about to be written
print("DataFrame to be written to CSV:")
df.show()

# Output location handed to the CSV writer
csv_path = "output/employees"

# Write in "overwrite" mode with the "header" option set to "true"
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv(csv_path)
)

print(f"DataFrame has been written to {csv_path}")

# Shut down the SparkSession now that the write has finished
spark.stop()


DataFrame to be written to CSV:
+---+-------------+----------+------+
| id|         name|department|salary|
+---+-------------+----------+------+
|  1|     John Doe|     Sales|  1000|
|  2|   Jane Smith| Marketing|  1500|
|  3|  Emily Davis|     Sales|  1200|
|  4|Michael Brown|        HR|  1100|
+---+-------------+----------+------+

DataFrame has been written to output/employees


In [5]:
# Stop the SparkSession bound to `spark`.
# NOTE(review): the preceding cell already ends with spark.stop(), so this
# call looks redundant — confirm and keep only one of them.
spark.stop()