# Lab 3: Simple PySpark Programs

In [2]:
import pyspark
import pandas as pd

1. Applying Transformations (Filter and withColumn) on a DataFrame

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for the transformations example
spark = (
    SparkSession.builder
    .appName("TransformationsExample")
    .getOrCreate()
)

# Small in-memory data set of (id, name, age) records
data = [
    (1, "Alice", 29),
    (2, "Bob", 35),
    (3, "Carol", 30),
    (4, "David", 25),
]
df = spark.createDataFrame(data, ["id", "name", "age"])

# Both transformations are chained into one expression:
#   1. filter     -> keep only the rows whose age is greater than 28
#   2. withColumn -> add 'age_plus_ten', computed as age + 10
age = col("age")
transformed_df = df.filter(age > 28).withColumn("age_plus_ten", age + 10)

# Print the transformed rows
transformed_df.show()

# Shut the session down now that the example is finished
spark.stop()


+---+-----+---+------------+
| id| name|age|age_plus_ten|
+---+-----+---+------------+
|  1|Alice| 29|          39|
|  2|  Bob| 35|          45|
|  3|Carol| 30|          40|
+---+-----+---+------------+



2. Performing Actions (count and show) on a DataFrame

In [4]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for the actions example
spark = (
    SparkSession.builder
    .appName("ActionsExample")
    .getOrCreate()
)

# Small in-memory data set of (id, name) records
columns = ["id", "name"]
data = [(1, "Alice"), (2, "Bob"), (3, "Carol")]
df = spark.createDataFrame(data, columns)

# Action 1: count() gives the number of rows in the DataFrame
count = df.count()
print(f"Number of rows: {count}")

# Action 2: show() prints the rows as a text table
df.show()

# Shut the session down now that the example is finished
spark.stop()


Number of rows: 3
+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
|  3|Carol|
+---+-----+



3. Performing Basic Aggregations (e.g., Sum, Average) on a DataFrame

In [5]:
from pyspark.sql import SparkSession
# Import the functions module under an alias instead of importing `sum` by
# name: `from pyspark.sql.functions import sum` rebinds Python's built-in
# `sum` for every cell that runs afterwards in the same kernel.
from pyspark.sql import functions as f

# Start (or reuse) the Spark session for the aggregations example
spark = SparkSession.builder.appName("AggregationsExample").getOrCreate()

# Small in-memory data set of (id, value) records
data = [(1, 10.0), (2, 20.0), (3, 30.0)]
df = spark.createDataFrame(data, ["id", "value"])

# Aggregate over the whole DataFrame (no grouping): the result is a
# DataFrame with the sum and the average of the 'value' column.
aggregations = df.agg(
    f.sum("value").alias("total_value"),
    f.avg("value").alias("average_value"),
)

# Print the aggregated values
aggregations.show()

# Shut the session down now that the example is finished
spark.stop()


+-----------+-------------+
|total_value|average_value|
+-----------+-------------+
|       60.0|         20.0|
+-----------+-------------+



4. Writing a PySpark DataFrame to a CSV File

In [8]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for the CSV-writing example
spark = (
    SparkSession.builder
    .appName("WriteSingleCSV")
    .getOrCreate()
)

# Small in-memory data set of (id, name, age) records
data = [(1, "Alice", 29), (2, "Bob", 35), (3, "Carol", 30)]
df = spark.createDataFrame(data, ["id", "name", "age"])

# Output location for the CSV data.
# NOTE(review): Spark's DataFrameWriter treats this path as an output
# directory holding part files, not as one flat file -- confirm that is
# what downstream readers of "l3q4.csv" expect.
csv_file_path = "l3q4.csv"

# Reduce the DataFrame to a single partition before writing
single_partition_df = df.coalesce(1)

# Write with a header row, replacing any output already at the path
(
    single_partition_df.write
    .mode("overwrite")
    .option("header", True)
    .csv(csv_file_path)
)

# Shut the session down now that the example is finished
spark.stop()



                                                                                

5. Word Count Program in PySpark

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

# Start (or reuse) the Spark session for the word-count example
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)

# Three short sentences, one per row, in a single 'text' column
data = [
    ("Hello world",),
    ("Hello PySpark",),
    ("Word count example",),
]
df = spark.createDataFrame(data, ["text"])

# Tokenise: split each sentence on single spaces, then explode the
# resulting array so that every word lands on its own row
tokens = split(col("text"), " ")
words_df = df.withColumn("word", explode(tokens))

# Count how many rows each distinct word occupies
word_counts = words_df.groupBy("word").count()

# Print the per-word totals
word_counts.show()

# Shut the session down now that the example is finished
spark.stop()


+-------+-----+
|   word|count|
+-------+-----+
|  Hello|    2|
|  world|    1|
|PySpark|    1|
|example|    1|
|  count|    1|
|   Word|    1|
+-------+-----+



Alternate Method(s): word count over a text file (`sample.txt`), first with the RDD API and then with the DataFrame API

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

In [11]:
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession running with the "local[*]" master
spark = (
    SparkSession.builder
    .appName("WordCount")
    .master("local[*]")
    .getOrCreate()
)

# The RDD API is reached through the session's SparkContext
sc = spark.sparkContext

# Load the text file as an RDD of lines
lines = sc.textFile("sample.txt")

# Word count:
#   flatMap     -> one element per word
#   map         -> pair each word with 1
#   reduceByKey -> add up the 1s for each distinct word
# str.split() with no argument splits on any run of whitespace and never
# returns empty strings, so blank lines and repeated spaces do not end up
# being counted as an empty "word" (which split(" ") would do).
counts = (
    lines.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

# Bring the (word, count) pairs back to the driver and print them
output = counts.collect()
for (word, count) in output:
    print(f"{word}:{count}")

# Stop the SparkSession
spark.stop()


24/08/10 16:14:39 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:76)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:53)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.command

banana:5000
cherry:5000
elderberry:5000
honeydew:5000
apple:5000
date:5000
fig:5000
grape:5000


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Create (or reuse) a SparkSession running with the "local[*]" master
spark = (
    SparkSession.builder
    .appName("WordCount")
    .master("local[*]")
    .getOrCreate()
)

# Load the text file; the text is read from the 'value' column below
lines = spark.read.text("sample.txt")

# Build the word-count DataFrame first and display it as a separate step.
# show() only prints and returns None, so chaining it onto the assignment
# would bind None instead of the DataFrame.
# Splitting on whitespace runs and dropping empty tokens keeps blank lines
# and repeated spaces from being counted as an empty "word".
word_counts = (
    lines
    .withColumn('word', f.explode(f.split(f.col('value'), r'\s+')))
    .filter(f.col('word') != '')
    .groupBy('word')
    .count()
    .sort(f.col('count').desc())
)

# Print the counts, most frequent first
word_counts.show()

# Stop the SparkSession
spark.stop()


24/08/10 16:19:01 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:76)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:53)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.command

+----------+-----+
|      word|count|
+----------+-----+
|     grape| 5000|
|elderberry| 5000|
|     apple| 5000|
|    cherry| 5000|
|    banana| 5000|
|      date| 5000|
|  honeydew| 5000|
|       fig| 5000|
+----------+-----+

