# Lab 2: Introduction to Basic PySpark Programs

In [2]:
import pyspark
import pandas as pd

## 1. Squaring a Set of Integers

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("SquareIntegers")
    .getOrCreate()
)

# One-column DataFrame holding the integers 1 through 5
data = [(n,) for n in range(1, 6)]
df = spark.createDataFrame(data, ["number"])

# Append the square as a second column using a built-in Column expression.
# Note: `**` on a Column produces a double, which is why the table below
# prints 1.0, 4.0, ... rather than integers.
squared_df = df.withColumn("squared", col("number") ** 2)

# Print the resulting table
squared_df.show()

# Release the session's resources
spark.stop()

+------+-------+
|number|squared|
+------+-------+
|     1|    1.0|
|     2|    4.0|
|     3|    9.0|
|     4|   16.0|
|     5|   25.0|
+------+-------+



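For simple operations like squaring a number, using built-in functions is usually preferred due to better performance and simplicity: built-in column expressions are executed and optimized inside Spark's JVM engine, whereas a Python UDF must serialize every row to a Python worker process and back. UDFs are more useful for complex operations where built-in functions cannot achieve the desired result.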

If you need to use a UDF due to custom logic, be mindful of the potential performance overhead and test the performance impact if working with large datasets.

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Initialize Spark session
spark = SparkSession.builder.appName("UDFExample").getOrCreate()

# Sample data. Python ints are inferred as a 64-bit `long` column.
data = [(1,), (2,), (3,), (4,), (5,)]
df = spark.createDataFrame(data, ["number"])


def square(x):
    """Return ``x`` squared.

    Spark passes SQL NULL to a Python UDF as ``None``; propagate it as
    ``None`` instead of letting ``None * None`` raise a TypeError and fail
    the whole job.
    """
    if x is None:
        return None
    return x * x


# Wrap the plain Python function as a UDF (no extra lambda is needed).
# The return type is declared as 64-bit "bigint" so it matches the `long`
# input column; a 32-bit integer return type cannot represent the square of
# any input larger than 46340.
square_udf = udf(square, "bigint")

# Apply the UDF to create a new column with squared values
squared_df = df.select("number", square_udf("number").alias("squared"))

# Show the result
squared_df.show()

# Stop the Spark session
spark.stop()


+------+-------+
|number|squared|
+------+-------+
|     1|      1|
|     2|      4|
|     3|      9|
|     4|     16|
|     5|     25|
+------+-------+



## 2. Finding the Maximum of a Given Set of Numbers

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import max

# Initialize Spark session
spark = SparkSession.builder.appName("FindMaximum").getOrCreate()

# Create a DataFrame with a set of numbers
data = [(1,), (5,), (3,), (4,), (2,)]
df = spark.createDataFrame(data, ["number"])

# Find the maximum with a DataFrame aggregation instead of dropping down to
# df.rdd: the aggregation runs inside Spark's engine, avoids shipping every
# row to Python as a Row object, and yields None (rather than raising
# ValueError) when the DataFrame is empty.
# `functions.max` is referenced through the module so it cannot be confused
# with Python's built-in max().
max_row = df.agg(functions.max("number").alias("max_number")).first()
max_value = max_row["max_number"]

# Print the result
print(f"The maximum value is: {max_value}")

# Stop the Spark session
spark.stop()

The maximum value is: 5


## 3. Finding the Average of N Numbers

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Start (or reuse) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("FindAverage")
    .getOrCreate()
)

# One-column DataFrame holding the integers 1 through 5
data = [(n,) for n in range(1, 6)]
df = spark.createDataFrame(data, ["number"])

# A global aggregation (no groupBy) always yields exactly one row, so the
# mean can be read straight off that row by its alias.
average_row = df.agg(avg("number").alias("average_number")).first()
average_value = average_row["average_number"]

# Report the mean
print(f"The average value is: {average_value}")

# Release the session's resources
spark.stop()


The average value is: 3.0


## 4. Reading a CSV File into a PySpark DataFrame

In [12]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("ReadCSV").getOrCreate()

# Path to the CSV file
csv_file_path = "test1.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Show the DataFrame
df.show()

# Stop the Spark session
spark.stop()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## 5. Displaying the First Few Rows and Schema of a DataFrame

In [13]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("DisplayRowsAndSchema").getOrCreate()

# Create a DataFrame with sample data
data = [(1, "Alice"), (2, "Bob"), (3, "Carol")]
df = spark.createDataFrame(data, ["id", "name"])

# Display the first few rows
df.show()

# Display the schema of the DataFrame
df.printSchema()

# Stop the Spark session
spark.stop()


+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
|  3|Carol|
+---+-----+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



## 6. Calculating Basic Summary Statistics for a Specific Column

In [23]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("SummaryStatistics").getOrCreate()

# Create a DataFrame with sample data
data = [(1, 10.0), (2, 20.0), (3, 30.0)]
df = spark.createDataFrame(data, ["id", "value"])

# Calculate basic summary statistics for the 'value' column
# summary_stats = df.describe("value")
summary_stats = df.summary()

# Show the summary statistics
summary_stats.show()

# Stop the Spark session
spark.stop()


+-------+---+-----+
|summary| id|value|
+-------+---+-----+
|  count|  3|    3|
|   mean|2.0| 20.0|
| stddev|1.0| 10.0|
|    min|  1| 10.0|
|    25%|  1| 10.0|
|    50%|  2| 20.0|
|    75%|  3| 30.0|
|    max|  3| 30.0|
+-------+---+-----+

