In [1]:
# Locate the Spark installation and make `pyspark` importable.
# Run this before the first `pyspark` import in the notebook.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build a new session, or reuse the one that already exists, with a local
# master ("local[1]") and the application name "DFApp".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("DFApp")
    .getOrCreate()
)

In [3]:
# Grab the SparkContext that belongs to the SparkSession; it is the entry
# point for the low-level RDD API (e.g. sc.parallelize).
sc = spark.sparkContext

In [5]:
# Turn a small Python list into an RDD, then print its smallest and largest
# elements, one per line.
rdd = sc.parallelize([10, 20, 30, 40])
print(rdd.min(), rdd.max(), sep="\n")

10
40


In [None]:
def add(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

# Positional call: the values bind to a and b in order.
add(10, 20)
# Keyword call: the order of the keywords does not matter.
add(b=20, a=10)
# Keyword names can be the same as the caller's own variable names.
a, b = 10, 20
add(b=b, a=a)  # b = 20, a = 10

In [6]:
# Sample rows: one (name, gender, salary) tuple per person.
ds = [('Joe', 'M', 6000), ('Mary', 'F', 8000)]

# Structured data needs column names; only the names are given here, so the
# data types are left for Spark to infer from the rows.
columns = ['name', 'gender', 'salary']

# Build a DataFrame from the rows and column names above.
df = spark.createDataFrame(ds, schema=columns)

In [7]:
# Every DataFrame is backed by an RDD: the DataFrame is the high-level API,
# the RDD is the low-level abstraction underneath it.
# `.rdd` exposes that RDD; collect() returns its contents as a list of Row objects.
df.rdd.collect()

[Row(name='Joe', gender='M', salary=6000),
 Row(name='Mary', gender='F', salary=8000)]

In [8]:
# Spark uses RDD operations internally when we use the DataFrame API.
# Example: work on the DataFrame's underlying RDD directly, keeping only the
# rows whose salary is above 7000.
df.rdd.filter(lambda row: row.salary > 7000).collect()

[Row(name='Mary', gender='F', salary=8000)]

In [9]:
# Print how many partitions the DataFrame's underlying RDD is split into.
print("Partitions ", df.rdd.getNumPartitions())

Partitions  1


In [11]:
# Filter with the DataFrame API instead of the RDD API.
# DataFrames are immutable, so filter() does not modify `df`; it returns a
# new DataFrame holding only the matching rows.
df2 = df.filter(' salary > 7000 ')

# Only the rows with salary above 7000.
df2.show()

# The original DataFrame is unchanged and still shows every record.
df.show()

+----+------+------+
|name|gender|salary|
+----+------+------+
|Mary|     F|  8000|
+----+------+------+

+----+------+------+
|name|gender|salary|
+----+------+------+
| Joe|     M|  6000|
|Mary|     F|  8000|
+----+------+------+



In [12]:
# The filtered DataFrame has its own underlying RDD; collect its rows as a list.
df2.rdd.collect()

[Row(name='Mary', gender='F', salary=8000)]