In [1]:
# Start (or reuse) the Spark session that the rest of the notebook works with.
from pyspark.sql import SparkSession

# Parenthesised builder chain: one step per line, no backslash continuations.
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

# Show which Spark version this session is running.
print(spark.version)

4.1.1


In [2]:
# Build a small DataFrame from in-memory (name, age) rows.
data = [
    ("Tarun", 25),
    ("Ganesh", 22),
    ("Sai", 30),
]
columns = ["Name", "Age"]

# The list of column names is passed as the schema.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+---+
|  Name|Age|
+------+---+
| Tarun| 25|
|Ganesh| 22|
|   Sai| 30|
+------+---+



In [None]:
# Load people.csv into a DataFrame.
#   header=True      -> the first row supplies the column names
#   inferSchema=True -> column data types are detected automatically
df_csv = spark.read.options(header=True, inferSchema=True).csv("people.csv")
df_csv.show()

###  Basic DataFrame Operations

In [5]:
# Rebuild the sample DataFrame that the operations in this section act on.
data = [
    ("Tarun", 25),
    ("Ganesh", 22),
    ("Sai", 30),
]
columns = ["Name", "Age"]

# Create the frame from the rows, then label its columns in order.
df = spark.createDataFrame(data).toDF(*columns)
df.show()

+------+---+
|  Name|Age|
+------+---+
| Tarun| 25|
|Ganesh| 22|
|   Sai| 30|
+------+---+



In [6]:
# Project the DataFrame down to just the Name column.
df.select(df["Name"]).show()

+------+
|  Name|
+------+
| Tarun|
|Ganesh|
|   Sai|
+------+



In [7]:
# Keep only the rows whose Age is greater than 23.
df.where(df["Age"] > 23).show()

+-----+---+
| Name|Age|
+-----+---+
|Tarun| 25|
|  Sai| 30|
+-----+---+



In [8]:
# Append a derived column: each person's age five years from now.
from pyspark.sql.functions import col

# Note: `df` is rebound here, so later cells see the extra column.
df = df.withColumn(
    "Age after 5 years",
    col("Age") + 5,
)
df.show()

+------+---+-----------------+
|  Name|Age|Age after 5 years|
+------+---+-----------------+
| Tarun| 25|               30|
|Ganesh| 22|               27|
|   Sai| 30|               35|
+------+---+-----------------+



In [9]:
# Whole-table aggregate (no grouping keys): the average of the Age column.
df.agg({"Age": "avg"}).show()

+------------------+
|          avg(Age)|
+------------------+
|25.666666666666668|
+------------------+

