In [10]:
# Importing the packages
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [11]:
# Creating the SparkSession
spark = SparkSession.builder.appName("FirstApp").getOrCreate()

In [12]:
# Defining schema for your DataFrame
mySchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("name", StringrType(), True),
    StructField("age", IntegerType(), True),
    StructField("friends", IntegerType(), True),
])

In [13]:
# Creating DataFrame on a CSV file
people = spark.read.format("csv")\
.schema(mySchema)\
.option("path", "fakefriends.csv")\
.load()

In [14]:
# Performing all the transformations
output = people.select(people.userID, people.name, people.age, people.friends)\
.where(people.age < 30).withColumn('insert_ts', func.current_timestamp())\
.orderBy(people.userID)

In [15]:
# taking the count of o/p DataFrame
output.count()

112

In [16]:
# Creating a Temp View
output.createOrReplaceTempView("peoples")

In [17]:
# Running a simple Spark SQL query
spark.sql("select name, age, friends, insert_ts from peoples").show()

+----+---+-------+--------------------+
|name|age|friends|           insert_ts|
+----+---+-------+--------------------+
|NULL| 26|      2|2024-02-15 21:14:...|
|NULL| 27|    181|2024-02-15 21:14:...|
|NULL| 22|    323|2024-02-15 21:14:...|
|NULL| 19|    268|2024-02-15 21:14:...|
|NULL| 25|      1|2024-02-15 21:14:...|
|NULL| 21|    445|2024-02-15 21:14:...|
|NULL| 22|    100|2024-02-15 21:14:...|
|NULL| 26|    281|2024-02-15 21:14:...|
|NULL| 27|    305|2024-02-15 21:14:...|
|NULL| 25|     96|2024-02-15 21:14:...|
|NULL| 24|     49|2024-02-15 21:14:...|
|NULL| 20|      1|2024-02-15 21:14:...|
|NULL| 19|    269|2024-02-15 21:14:...|
|NULL| 19|      5|2024-02-15 21:14:...|
|NULL| 20|    100|2024-02-15 21:14:...|
|NULL| 21|    477|2024-02-15 21:14:...|
|NULL| 22|    179|2024-02-15 21:14:...|
|NULL| 20|    384|2024-02-15 21:14:...|
|NULL| 28|    311|2024-02-15 21:14:...|
|NULL| 24|    492|2024-02-15 21:14:...|
+----+---+-------+--------------------+
only showing top 20 rows

