In [None]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf

# Create the entry points to Spark: `sc` (SparkContext) and `spark` (SparkSession).
#
# If this cell is re-run, stop the SparkContext created by the previous run
# before constructing a new one.
try:
    sc.stop()
except NameError:
    # Fresh kernel: `sc` has not been defined yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Stopping is best-effort: report the problem instead of hiding it, then
    # still attempt to create a new context below.
    print('Could not stop the previous SparkContext: {!r}'.format(exc))
from pyspark.sql import SparkSession
sc = SparkContext()
spark = SparkSession(sparkContext=sc)

## `pyspark.sql.functions` functions

`pyspark.sql.functions` is a collection of built-in functions for **creating column expressions**. These functions greatly expand the set of operations we can use to manipulate DataFrames and DataFrame columns.

There are many SQL functions in the `pyspark.sql.functions` module. Here I choose only a few to show how these functions extend our ability to create column expressions.

In [None]:
from pyspark.sql import functions as F

## `abs()`: create a column expression that returns the absolute values of a column

In [None]:
from pyspark.sql import Row
# One-column DataFrame (`x`) holding one positive and two negative integers.
rows = [Row(x=value) for value in (1, -1, -2)]
df = sc.parallelize(rows).toDF()
df.show()

In [None]:
# Column expression only; nothing is computed until it is used in a query.
x_abs = F.abs(df['x'])
x_abs

In [None]:
# Show the original column next to its absolute value.
abs_comparison = df.select(df['x'], x_abs)
abs_comparison.show()

## `concat()`: create a column expression that concatenates multiple column values into one

In [None]:
# Two-row DataFrame with string columns `a` and `b`.
# Note: this rebinds `df`, replacing the frame used in the `abs()` example.
string_rows = [
    Row(a='apple', b='tree'),
    Row(a='orange', b='flowers'),
]
df = sc.parallelize(string_rows).toDF()
df.show()

In [None]:
# Column expression that joins the values of `a` and `b` row by row.
ab_concat = F.concat(df['a'], df['b'])
ab_concat

In [None]:
# Show both source columns alongside the concatenated result.
concat_comparison = df.select(df['a'], df['b'], ab_concat)
concat_comparison.show()

## `corr()`: create a column expression that returns the Pearson correlation coefficient between two columns

In [None]:
# Load the mtcars dataset; the first line of the file is the header and
# column types are inferred from the data.
mtcars = spark.read.csv(
    path='../../data/mtcars.csv',
    header=True,
    inferSchema=True,
)
mtcars.show(n=5)

In [None]:
# Aggregate column expression: correlation between the `drat` and `wt` columns.
drat_wt_corr = F.corr(mtcars['drat'], mtcars['wt'])
drat_wt_corr

In [None]:
# Evaluate the correlation expression over the whole DataFrame and print it.
corr_result = mtcars.select(drat_wt_corr)
corr_result.show()

## `array()`: create a column expression that merges multiple column values into an array

This function can be used to build a **feature column** for machine learning models.

In [None]:
# Collect a Column object for every column except the first one.
# Columns are looked up by name with `mtcars[col]` rather than by building and
# evaluating source code: `eval` would fail for names that are not valid Python
# identifiers and would return a DataFrame method instead of a column for names
# such as `count`.
cols = [mtcars[col] for col in mtcars.columns[1:]]
cols

In [None]:
# Column expression that packs the selected columns into a single array column.
cols_array = F.array(*cols)
cols_array

In [None]:
# Print the array column without truncating long cell values.
array_view = mtcars.select(cols_array)
array_view.show(truncate=False)