# Installation

In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j


import os
import sys

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
[33m0% [Waiting for headers] [1 InRelease 14.2 kB/129 kB 11%] [Connected to cloud.r[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [1 InRelease 43.1 kB/129 kB 33%] [Connected to cloud.r-project.org (108.157.[0m                                                                               Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
[33m0% [3 InRelease 15.6 kB/128 kB 12%] [1 InRelease 43.1 kB/129 kB 33%] [Connected[0m[33m0% [3 InRelease 54.7 kB/128 kB 43%] [Waiting for headers] [Waiting for headers][0m                                                                               Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [3 InRelease 60.5 kB/128 kB 47%] [Waiting for headers] [Waiting for headers][0m

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start a Spark session (or reuse one that already exists) running locally
# on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("DataFrame Tutorial")
    .getOrCreate()
)

# DataFrames
Apache Spark DataFrames are distributed collections of data organized into named columns. They are conceptually equivalent to tables in a relational database or DataFrames in Python's pandas library, but with the added benefit of being distributed across a cluster for big data processing.

# Creating DataFrames


1. From a list of data

In [None]:
# Build a DataFrame from plain Python tuples plus a list of column names;
# Spark infers the column types from the values.
columns = ["name", "age", "city"]
data = [
    ("John", 25, "New York"),
    ("Jane", 30, "San Francisco"),
    ("Mike", 35, "Chicago"),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+----+---+-------------+
|name|age|         city|
+----+---+-------------+
|John| 25|     New York|
|Jane| 30|San Francisco|
|Mike| 35|      Chicago|
+----+---+-------------+



2. Using a schema

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema: each column's name, type and nullability is spelled out
# instead of being inferred from the data.
schema = (
    StructType()
    .add("name", StringType(), nullable=False)
    .add("age", IntegerType(), nullable=False)
    .add("city", StringType(), nullable=True)
)

# Same rows as before, now created against the explicit schema
df = spark.createDataFrame(data, schema=schema)
df.show()

+----+---+-------------+
|name|age|         city|
+----+---+-------------+
|John| 25|     New York|
|Jane| 30|San Francisco|
|Mike| 35|      Chicago|
+----+---+-------------+



3. From external data sources


```python
# CSV: treat the first line as a header and infer column types
df_csv = spark.read.options(header=True, inferSchema=True).csv("path/to/file.csv")

# JSON
df_json = spark.read.format("json").load("path/to/file.json")

# Parquet
df_parquet = spark.read.format("parquet").load("path/to/file.parquet")

# Database table over JDBC (connection values below are placeholders)
jdbc_options = {
    "url": "jdbc:postgresql://localhost:5432/database",
    "dbtable": "table_name",
    "user": "username",
    "password": "password",
}
df_jdbc = spark.read.format("jdbc").options(**jdbc_options).load()
```



# Basic DataFrame Operations
1. Viewing Data

In [None]:
# Print the leading rows using show()'s default row limit
df.show()

# Print at most the first 5 rows
df.show(n=5)

# Column names, types and nullability (pandas analogue: df.dtypes)
df.printSchema()

# Summary statistics per column (pandas analogue: df.describe())
summary_df = df.describe()
summary_df.show()

+----+---+-------------+
|name|age|         city|
+----+---+-------------+
|John| 25|     New York|
|Jane| 30|San Francisco|
|Mike| 35|      Chicago|
+----+---+-------------+

+----+---+-------------+
|name|age|         city|
+----+---+-------------+
|John| 25|     New York|
|Jane| 30|San Francisco|
|Mike| 35|      Chicago|
+----+---+-------------+

root
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- city: string (nullable = true)

+-------+----+----+-------------+
|summary|name| age|         city|
+-------+----+----+-------------+
|  count|   3|   3|            3|
|   mean|NULL|30.0|         NULL|
| stddev|NULL| 5.0|         NULL|
|    min|Jane|  25|      Chicago|
|    max|Mike|  35|San Francisco|
+-------+----+----+-------------+



2. Selecting Columns

In [None]:
from pyspark.sql import functions as F

# One column
df.select(["name"]).show()

# Several columns at once
df.select(["name", "age"]).show()

# Columns can also be expressions built from other columns
name_col = F.col("name")
next_age_col = F.col("age") + 1
df.select(name_col, next_age_col).show()

+----+
|name|
+----+
|John|
|Jane|
|Mike|
+----+

+----+---+
|name|age|
+----+---+
|John| 25|
|Jane| 30|
|Mike| 35|
+----+---+

+----+---------+
|name|(age + 1)|
+----+---------+
|John|       26|
|Jane|       31|
|Mike|       36|
+----+---------+



3. Filtering Data

In [None]:
# Reusable boolean column expressions
older_than_25 = df["age"] > 25
lives_in_chicago = df["city"] == "Chicago"

# Single condition (where() is an alias of filter())
df.where(older_than_25).show()

# Combine conditions with & (each side must be its own column expression)
df.where(older_than_25 & lives_in_chicago).show()

# The same predicate written as a SQL expression string
df.where("age > 25 AND city = 'Chicago'").show()

+----+---+-------------+
|name|age|         city|
+----+---+-------------+
|Jane| 30|San Francisco|
|Mike| 35|      Chicago|
+----+---+-------------+

+----+---+-------+
|name|age|   city|
+----+---+-------+
|Mike| 35|Chicago|
+----+---+-------+

+----+---+-------+
|name|age|   city|
+----+---+-------+
|Mike| 35|Chicago|
+----+---+-------+



4. Adding and Modifying Columns

In [None]:
# NOTE: every step rebinds `df`, and later cells rely on the final shape
# (name, years_old, city). Because `age` is renamed below, this cell cannot be
# re-run on its own without first re-creating `df`.

# Derive a new column from an existing one
df = df.withColumn("age_plus_ten", df["age"] + 10)
df.show()

# Give the `age` column a new name
df = df.withColumnRenamed("age", "years_old")
df.show()

# Remove the derived column again
df = df.drop("age_plus_ten")
df.show()

+----+---+-------------+------------+
|name|age|         city|age_plus_ten|
+----+---+-------------+------------+
|John| 25|     New York|          35|
|Jane| 30|San Francisco|          40|
|Mike| 35|      Chicago|          45|
+----+---+-------------+------------+

+----+---------+-------------+------------+
|name|years_old|         city|age_plus_ten|
+----+---------+-------------+------------+
|John|       25|     New York|          35|
|Jane|       30|San Francisco|          40|
|Mike|       35|      Chicago|          45|
+----+---------+-------------+------------+

+----+---------+-------------+
|name|years_old|         city|
+----+---------+-------------+
|John|       25|     New York|
|Jane|       30|San Francisco|
|Mike|       35|      Chicago|
+----+---------+-------------+



# Advanced DataFrame Operations

5. Grouping and Aggregation

In [None]:
# Number of rows per city
city_counts = df.groupBy("city").count()
city_counts.show()

+-------------+-----+
|         city|count|
+-------------+-----+
|     New York|    1|
|San Francisco|    1|
|      Chicago|    1|
+-------------+-----+



In [None]:
# Several aggregates per city, each given an explicit output column name
city_aggregates = [
    F.count("*").alias("count"),
    F.avg("years_old").alias("avg_age"),
    F.min("years_old").alias("min_age"),
    F.max("years_old").alias("max_age"),
]
df.groupBy("city").agg(*city_aggregates).show()

+-------------+-----+-------+-------+-------+
|         city|count|avg_age|min_age|max_age|
+-------------+-----+-------+-------+-------+
|     New York|    1|   25.0|     25|     25|
|San Francisco|    1|   30.0|     30|     30|
|      Chicago|    1|   35.0|     35|     35|
+-------------+-----+-------+-------+-------+



6. Joining DataFrames

In [None]:
# Two small frames that share an `id` key
employee_df = spark.createDataFrame(
    [
        (1, "John", "Engineering"),
        (2, "Jane", "Marketing"),
        (3, "Mike", "Sales"),
    ],
    ["id", "name", "department"],
)

salary_df = spark.createDataFrame(
    [
        (1, 70000),
        (2, 80000),
        (3, 65000),
    ],
    ["id", "salary"],
)

# Join on `id` with each join type in turn:
# inner (the default), left, right, then full outer.
for join_type in ("inner", "left", "right", "outer"):
    employee_df.join(salary_df, on="id", how=join_type).show()

+---+----+-----------+------+
| id|name| department|salary|
+---+----+-----------+------+
|  1|John|Engineering| 70000|
|  2|Jane|  Marketing| 80000|
|  3|Mike|      Sales| 65000|
+---+----+-----------+------+

+---+----+-----------+------+
| id|name| department|salary|
+---+----+-----------+------+
|  1|John|Engineering| 70000|
|  3|Mike|      Sales| 65000|
|  2|Jane|  Marketing| 80000|
+---+----+-----------+------+

+---+----+-----------+------+
| id|name| department|salary|
+---+----+-----------+------+
|  1|John|Engineering| 70000|
|  3|Mike|      Sales| 65000|
|  2|Jane|  Marketing| 80000|
+---+----+-----------+------+

+---+----+-----------+------+
| id|name| department|salary|
+---+----+-----------+------+
|  1|John|Engineering| 70000|
|  2|Jane|  Marketing| 80000|
|  3|Mike|      Sales| 65000|
+---+----+-----------+------+



7. Handling Missing Values

In [None]:
# Sample frame in which every column contains one missing value
data_with_nulls = [
    ("John", 25, "New York"),
    ("Jane", None, "San Francisco"),
    ("Mike", 35, None),
    (None, 40, "Boston"),
]
df_nulls = spark.createDataFrame(data_with_nulls, ["name", "age", "city"])

# Keep only rows that have no nulls at all
df_nulls.dropna().show()

# Keep rows whose `name` is present; nulls in other columns are allowed
df_nulls.dropna(subset=["name"]).show()

# Replace nulls with a per-column default value
null_defaults = {"age": 0, "name": "Unknown", "city": "Unknown"}
df_nulls.fillna(null_defaults).show()

+----+---+--------+
|name|age|    city|
+----+---+--------+
|John| 25|New York|
+----+---+--------+

+----+----+-------------+
|name| age|         city|
+----+----+-------------+
|John|  25|     New York|
|Jane|NULL|San Francisco|
|Mike|  35|         NULL|
+----+----+-------------+

+-------+---+-------------+
|   name|age|         city|
+-------+---+-------------+
|   John| 25|     New York|
|   Jane|  0|San Francisco|
|   Mike| 35|      Unknown|
|Unknown| 40|       Boston|
+-------+---+-------------+



8. User Defined Functions

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Show the current frame first: by this point the age column has been renamed
# to `years_old`, which is the column the UDF is applied to below.
df.show()

# Plain Python function that will be wrapped as a UDF
def age_category(age):
    """Map an age to a coarse label.

    Returns "Unknown" when ``age`` is None, "Young" when it is below 30,
    and "Senior" otherwise.
    """
    if age is None:
        return "Unknown"
    return "Young" if age < 30 else "Senior"

# Wrap the Python function as a Spark UDF that produces a string column
age_category_udf = udf(age_category, returnType=StringType())

# Add the label as a new column computed from `years_old`
df_with_category = df.withColumn("age_category", age_category_udf(df["years_old"]))
df_with_category.show()

+----+---------+-------------+
|name|years_old|         city|
+----+---------+-------------+
|John|       25|     New York|
|Jane|       30|San Francisco|
|Mike|       35|      Chicago|
+----+---------+-------------+

+----+---------+-------------+------------+
|name|years_old|         city|age_category|
+----+---------+-------------+------------+
|John|       25|     New York|       Young|
|Jane|       30|San Francisco|      Senior|
|Mike|       35|      Chicago|      Senior|
+----+---------+-------------+------------+



9. Writing DataFrames to files


```python
# CSV output, including a header row
df.write.option("header", True).csv("path/to/output/csv")

# JSON output
df.write.format("json").save("path/to/output/json")

# Parquet output
df.write.format("parquet").save("path/to/output/parquet")

# Database table over JDBC (connection values below are placeholders);
# the save mode is set to "overwrite"
jdbc_options = {
    "url": "jdbc:postgresql://localhost:5432/database",
    "dbtable": "output_table",
    "user": "username",
    "password": "password",
}
(
    df.write
    .format("jdbc")
    .options(**jdbc_options)
    .mode("overwrite")
    .save()
)
```



# Resources for DataFrames
https://sparkbyexamples.com/pyspark-tutorial/