In [1]:
import findspark
# Runs before `pyspark` is imported in the next cell; presumably this is what
# makes the local Spark installation importable — confirm against findspark docs.
findspark.init()

In [7]:

"""
Since Spark 2.x, Spark unified Spark APIs, DF, Datasets, & SQL.
SparkSession uses SparkContext internally.
"""

from pyspark.sql import SparkSession
ss = SparkSession.builder.master("local").appName("sparkDataFrame").getOrCreate()

ss

In [9]:
"""
Get SparkContext associated with SparkSession
"""
sc = ss.sparkContext
sc

In [11]:
# Build an RDD from the integers 1..10, then report its min, max and mean.
rdd = sc.parallelize(range(1, 11))

(rdd.min(), rdd.max(), rdd.mean())

(1, 10, 5.5)

In [15]:
# Two newlines, passed as print()'s `end` to put blank space between sections.
end = 2 * '\n'

# RDD DataFrame: a DataFrame built over an underlying RDD.
# Created here from in-memory rows plus a tuple of column names.
columns = ("Name", "Gender", "Salary")
data = (
    ("A", "F", 1000),
    ("B", "M", 1500),
    ("C", "F", 2000),
)

emp_df = ss.createDataFrame(data=data, schema=columns)

print("-- Schema --")
emp_df.printSchema()

print(end=end)

print("-- Data --")
emp_df.show()  # shows the first 20 rows by default

# What does the underlying RDD look like?
print("-- Underlying RDD --")
emp_df.rdd.collect()

-- Schema --
root
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: long (nullable = true)



-- Data --
+----+------+------+
|Name|Gender|Salary|
+----+------+------+
|   A|     F|  1000|
|   B|     M|  1500|
|   C|     F|  2000|
+----+------+------+

-- Underlying RDD --


[Row(Name='A', Gender='F', Salary=1000),
 Row(Name='B', Gender='M', Salary=1500),
 Row(Name='C', Gender='F', Salary=2000)]

In [16]:

"""
Using RDD APIs on an RDD DF
Most of the time, this won't be necessary as RDD DFs APIs can be leveraged.b
"""
emp_df.rdd.filter(lambda r: r['Gender'] == 'F').collect()

[Row(Name='A', Gender='F', Salary=1000),
 Row(Name='C', Gender='F', Salary=2000)]

In [17]:

"""
Retrieve DF partitions
"""
emp_df.rdd.getNumPartitions()

1

In [22]:

"""
Use DF API to filter data (insted of the RDD API)
DFs are immutable like their underlying RDDs.
"""
emp_1001 = emp_df.filter("Salary < 1001")
emp_1001.show()

+----+------+------+
|Name|Gender|Salary|
+----+------+------+
|   A|     F|  1000|
+----+------+------+



In [24]:
# describe() hands back another DataFrame, so this cell only displays its
# repr (column names and types), not the statistics themselves.
# NOTE(review): chain .show() here if the summary values should be printed.
emp_1001.describe()

DataFrame[summary: string, Name: string, Gender: string, Salary: string]

In [25]:
# Column names of the filtered DataFrame, as a plain Python list.
emp_1001.columns

['Name', 'Gender', 'Salary']

In [33]:
# Print the built-in documentation for SparkSession.createDataFrame
# (accepted `data` / `schema` forms and how the schema is inferred).
help(ss.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True) method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
    
    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.
    
    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of either :class:`Row`,
    :class:`namedtuple`, or :class:`dict`.
    
    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the given schema is not
    :class:`pyspark.sql.types.StructType`, it will be wrapped into a
    :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
    Each record will also be wrapped into a tu