In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Build the session for this notebook; getOrCreate() hands back an already
# running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("chapter-pyspark-sql-api")
    .getOrCreate()
)

In [3]:
# Three sample rows; note that the name "Col1" is a prefix of "Col1A".
df = spark.createDataFrame(
    [("a", "Y", 1), ("b", "Y", 2), ("c", "N", 3)],
    ["Col1", "Col1A", "Col2"],
)

# colRegex picks columns whose names match the backtick-quoted regex.
# `(Col1)?+` is possessive: once it has consumed "Col1" it will not give it
# back, so for the column named exactly "Col1" nothing is left for `.+` and
# the match fails. Net effect: every column except "Col1" is selected.
df.select(df.colRegex("`(Col1)?+.+`")).show()

+-----+----+
|Col1A|Col2|
+-----+----+
|    Y|   1|
|    Y|   2|
|    N|   3|
+-----+----+



https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.transform.html#pyspark.sql.DataFrame.transform

In [4]:
from pyspark.sql.functions import col
# Two-row sample frame with one integer column and one floating-point column,
# each named after the kind of value it holds.
df = spark.createDataFrame(
    [
        (1, 1.0),
        (2, 2.0),
    ],
    ["int", "float"],
)
def cast_all_to_int(input_df):
    """Return a DataFrame in which every column of ``input_df`` is cast to "int".

    Takes a DataFrame and returns the result of ``select``, so it can be
    passed directly to ``DataFrame.transform``.
    """
    int_columns = []
    for name in input_df.columns:
        int_columns.append(col(name).cast("int"))
    return input_df.select(int_columns)
def sort_columns_asc(input_df):
    """Return ``input_df`` with its columns reordered by name, ascending.

    Names are compared with Python's default string ordering. The input's
    own ``columns`` list is copied before sorting and is never modified.
    """
    ordered_names = list(input_df.columns)
    ordered_names.sort()
    return input_df.select(*ordered_names)
# transform() calls the given function on the DataFrame and returns its
# result, so custom steps chain like built-in methods: cast first, then
# reorder the columns, then print.
(
    df
    .transform(cast_all_to_int)
    .transform(sort_columns_asc)
    .show()
)

+-----+---+
|float|int|
+-----+---+
|    1|  1|
|    2|  2|
+-----+---+



In [5]:
# Full schema of the DataFrame: a StructType holding one StructField per column.
df.schema

StructType(List(StructField(int,LongType,true),StructField(float,DoubleType,true)))

In [6]:
# The schema's StructField entries as a plain Python list, in column order.
df.schema.fields

[StructField(int,LongType,true), StructField(float,DoubleType,true)]

In [7]:
# Index the field list to get the StructField describing the first column.
df.schema.fields[0]

StructField(int,LongType,true)

In [8]:
# Spark data type recorded for the first column.
df.schema.fields[0].dataType

LongType

In [9]:
# Column name recorded for the first field.
df.schema.fields[0].name

'int'

In [11]:
# Lighter-weight view of the schema: one (column name, type name) tuple of
# strings per column.
df.dtypes

[('int', 'bigint'), ('float', 'double')]

### Row
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.Row.asDict.html#pyspark.sql.Row.asDict

In [12]:
from pyspark.sql import Row

In [13]:
# asDict() converts a Row into a plain dict keyed by field name.
Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11}

True

In [14]:
# By default asDict() is shallow: the nested Row stored under 'value' is
# left as a Row rather than being converted to a dict.
row = Row(key=1, value=Row(name='a', age=2))
row.asDict() == {'key': 1, 'value': Row(name='a', age=2)}

True

In [15]:
# With recursive=True, nested Rows are converted to dicts as well.
row.asDict(recursive=True) == {'key': 1, 'value': {'name': 'a', 'age': 2}}

True

### Functions

https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#functions