## Import Functions

In [0]:
from pyspark.sql.functions import col, column, concat, lit

## Creating and displaying a dataframe

In [0]:
df = spark.read.format("csv").option("header", True).load("/FileStore/tables/Auto-mpg/auto_mpg.csv")

In [0]:
display(df)

mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
17.0,8,302.0,140,3449,10.5,70,1,ford torino
15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


## Getting Schema Information

In [0]:
df.printSchema()

root
 |-- mpg: string (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- displacement: string (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- model year: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- car name: string (nullable = true)



In [0]:
df.columns

Out[5]: ['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model year',
 'origin',
 'car name']

In [0]:
df.dtypes

Out[6]: [('mpg', 'string'),
 ('cylinders', 'string'),
 ('displacement', 'string'),
 ('horsepower', 'string'),
 ('weight', 'string'),
 ('acceleration', 'string'),
 ('model year', 'string'),
 ('origin', 'string'),
 ('car name', 'string')]

In [0]:
df.schema

Out[7]: StructType(List(StructField(mpg,StringType,true),StructField(cylinders,StringType,true),StructField(displacement,StringType,true),StructField(horsepower,StringType,true),StructField(weight,StringType,true),StructField(acceleration,StringType,true),StructField(model year,StringType,true),StructField(origin,StringType,true),StructField(car name,StringType,true)))

## Function documentation

In [0]:
help(df.select)

Help on method select in module pyspark.sql.dataframe:

select(*cols) method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of expressions and returns a new :class:`DataFrame`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, :class:`Column`, or list
        column names (string) or expressions (:class:`Column`).
        If one of the column names is '*', that column is expanded to include all columns
        in the current :class:`DataFrame`.
    
    Examples
    --------
    >>> df.select('*').collect()
    [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    >>> df.select('name', 'age').collect()
    [Row(name='Alice', age=2), Row(name='Bob', age=5)]
    >>> df.select(df.name, (df.age + 10).alias('age')).collect()
    [Row(name='Alice', age=12), Row(name='Bob', age=15)]



In [0]:
help(df.selectExpr)

Help on method selectExpr in module pyspark.sql.dataframe:

selectExpr(*expr) method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of SQL expressions and returns a new :class:`DataFrame`.
    
    This is a variant of :func:`select` that accepts SQL expressions.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.selectExpr("age * 2", "abs(age)").collect()
    [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)]



In [0]:
help(col)

Help on function col in module pyspark.sql.functions:

col(col)
    Returns a :class:`~pyspark.sql.Column` based on the given column name.'
    Examples
    --------
    >>> col('x')
    Column<'x'>
    >>> column('x')
    Column<'x'>
    
    .. versionadded:: 1.3



## Select

In [0]:
df.select("*").show() #Selecting all data

+---+---------+------------+----------+------+------------+----------+------+--------------------+
|mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|            car name|
+---+---------+------------+----------+------+------------+----------+------+--------------------+
| 18|        8|         307|       130|  3504|          12|        70|     1|chevrolet chevell...|
| 15|        8|         350|       165|  3693|        11.5|        70|     1|   buick skylark 320|
| 18|        8|         318|       150|  3436|          11|        70|     1|  plymouth satellite|
| 16|        8|         304|       150|  3433|          12|        70|     1|       amc rebel sst|
| 17|        8|         302|       140|  3449|        10.5|        70|     1|         ford torino|
| 15|        8|         429|       198|  4341|          10|        70|     1|    ford galaxie 500|
| 14|        8|         454|       220|  4354|           9|        70|     1|    chevrolet impala|
| 14|     

In [0]:
df.select(col("mpg")).show()

+---+
|mpg|
+---+
| 18|
| 15|
| 18|
| 16|
| 17|
| 15|
| 14|
| 14|
| 14|
| 15|
| 15|
| 14|
| 15|
| 14|
| 24|
| 22|
| 18|
| 21|
| 27|
| 26|
+---+
only showing top 20 rows



In [0]:
df.select(col("mpg"),"car name").show() #Combination of column and string

+---+--------------------+
|mpg|            car name|
+---+--------------------+
| 18|chevrolet chevell...|
| 15|   buick skylark 320|
| 18|  plymouth satellite|
| 16|       amc rebel sst|
| 17|         ford torino|
| 15|    ford galaxie 500|
| 14|    chevrolet impala|
| 14|   plymouth fury iii|
| 14|    pontiac catalina|
| 15|  amc ambassador dpl|
| 15| dodge challenger se|
| 14|  plymouth 'cuda 340|
| 15|chevrolet monte c...|
| 14|buick estate wago...|
| 24|toyota corona mar...|
| 22|     plymouth duster|
| 18|          amc hornet|
| 21|       ford maverick|
| 27|        datsun pl510|
| 26|volkswagen 1131 d...|
+---+--------------------+
only showing top 20 rows



In [0]:
df.select(df["mpg"],"car name").show() #using [] to get column and use it with select

+---+--------------------+
|mpg|            car name|
+---+--------------------+
| 18|chevrolet chevell...|
| 15|   buick skylark 320|
| 18|  plymouth satellite|
| 16|       amc rebel sst|
| 17|         ford torino|
| 15|    ford galaxie 500|
| 14|    chevrolet impala|
| 14|   plymouth fury iii|
| 14|    pontiac catalina|
| 15|  amc ambassador dpl|
| 15| dodge challenger se|
| 14|  plymouth 'cuda 340|
| 15|chevrolet monte c...|
| 14|buick estate wago...|
| 24|toyota corona mar...|
| 22|     plymouth duster|
| 18|          amc hornet|
| 21|       ford maverick|
| 27|        datsun pl510|
| 26|volkswagen 1131 d...|
+---+--------------------+
only showing top 20 rows



In [0]:
df.alias("my_alias").select("my_alias.car name", "my_alias.mpg").show() #Using alias.

+--------------------+---+
|            car name|mpg|
+--------------------+---+
|chevrolet chevell...| 18|
|   buick skylark 320| 15|
|  plymouth satellite| 18|
|       amc rebel sst| 16|
|         ford torino| 17|
|    ford galaxie 500| 15|
|    chevrolet impala| 14|
|   plymouth fury iii| 14|
|    pontiac catalina| 14|
|  amc ambassador dpl| 15|
| dodge challenger se| 15|
|  plymouth 'cuda 340| 14|
|chevrolet monte c...| 15|
|buick estate wago...| 14|
|toyota corona mar...| 24|
|     plymouth duster| 22|
|          amc hornet| 18|
|       ford maverick| 21|
|        datsun pl510| 27|
|volkswagen 1131 d...| 26|
+--------------------+---+
only showing top 20 rows



In [0]:
df.select("model year","origin","car name").show() #Comma separated string

+----------+------+--------------------+
|model year|origin|            car name|
+----------+------+--------------------+
|        70|     1|chevrolet chevell...|
|        70|     1|   buick skylark 320|
|        70|     1|  plymouth satellite|
|        70|     1|       amc rebel sst|
|        70|     1|         ford torino|
|        70|     1|    ford galaxie 500|
|        70|     1|    chevrolet impala|
|        70|     1|   plymouth fury iii|
|        70|     1|    pontiac catalina|
|        70|     1|  amc ambassador dpl|
|        70|     1| dodge challenger se|
|        70|     1|  plymouth 'cuda 340|
|        70|     1|chevrolet monte c...|
|        70|     1|buick estate wago...|
|        70|     3|toyota corona mar...|
|        70|     1|     plymouth duster|
|        70|     1|          amc hornet|
|        70|     1|       ford maverick|
|        70|     3|        datsun pl510|
|        70|     2|volkswagen 1131 d...|
+----------+------+--------------------+
only showing top

In [0]:
df.select(["model year","origin","car name"]).show() #list of string

+----------+------+--------------------+
|model year|origin|            car name|
+----------+------+--------------------+
|        70|     1|chevrolet chevell...|
|        70|     1|   buick skylark 320|
|        70|     1|  plymouth satellite|
|        70|     1|       amc rebel sst|
|        70|     1|         ford torino|
|        70|     1|    ford galaxie 500|
|        70|     1|    chevrolet impala|
|        70|     1|   plymouth fury iii|
|        70|     1|    pontiac catalina|
|        70|     1|  amc ambassador dpl|
|        70|     1| dodge challenger se|
|        70|     1|  plymouth 'cuda 340|
|        70|     1|chevrolet monte c...|
|        70|     1|buick estate wago...|
|        70|     3|toyota corona mar...|
|        70|     1|     plymouth duster|
|        70|     1|          amc hornet|
|        70|     1|       ford maverick|
|        70|     3|        datsun pl510|
|        70|     2|volkswagen 1131 d...|
+----------+------+--------------------+
only showing top

In [0]:
df.select([col("car name"),col("model year").alias("year"),col("origin")]).show() #list of columns and using column alias

+--------------------+----+------+
|            car name|year|origin|
+--------------------+----+------+
|chevrolet chevell...|  70|     1|
|   buick skylark 320|  70|     1|
|  plymouth satellite|  70|     1|
|       amc rebel sst|  70|     1|
|         ford torino|  70|     1|
|    ford galaxie 500|  70|     1|
|    chevrolet impala|  70|     1|
|   plymouth fury iii|  70|     1|
|    pontiac catalina|  70|     1|
|  amc ambassador dpl|  70|     1|
| dodge challenger se|  70|     1|
|  plymouth 'cuda 340|  70|     1|
|chevrolet monte c...|  70|     1|
|buick estate wago...|  70|     1|
|toyota corona mar...|  70|     3|
|     plymouth duster|  70|     1|
|          amc hornet|  70|     1|
|       ford maverick|  70|     1|
|        datsun pl510|  70|     3|
|volkswagen 1131 d...|  70|     2|
+--------------------+----+------+
only showing top 20 rows



In [0]:
desired_cols = ["car name","model year","mpg"] #using *args

In [0]:
df.select(*desired_cols).show()

+--------------------+----------+---+
|            car name|model year|mpg|
+--------------------+----------+---+
|chevrolet chevell...|        70| 18|
|   buick skylark 320|        70| 15|
|  plymouth satellite|        70| 18|
|       amc rebel sst|        70| 16|
|         ford torino|        70| 17|
|    ford galaxie 500|        70| 15|
|    chevrolet impala|        70| 14|
|   plymouth fury iii|        70| 14|
|    pontiac catalina|        70| 14|
|  amc ambassador dpl|        70| 15|
| dodge challenger se|        70| 15|
|  plymouth 'cuda 340|        70| 14|
|chevrolet monte c...|        70| 15|
|buick estate wago...|        70| 14|
|toyota corona mar...|        70| 24|
|     plymouth duster|        70| 22|
|          amc hornet|        70| 18|
|       ford maverick|        70| 21|
|        datsun pl510|        70| 27|
|volkswagen 1131 d...|        70| 26|
+--------------------+----------+---+
only showing top 20 rows



In [0]:
df.select("car name","model year", concat(col("car name"),lit("-"),col("model year")).alias("concat_col")).show() #using concat function

+--------------------+----------+--------------------+
|            car name|model year|          concat_col|
+--------------------+----------+--------------------+
|chevrolet chevell...|        70|chevrolet chevell...|
|   buick skylark 320|        70|buick skylark 320-70|
|  plymouth satellite|        70|plymouth satellit...|
|       amc rebel sst|        70|    amc rebel sst-70|
|         ford torino|        70|      ford torino-70|
|    ford galaxie 500|        70| ford galaxie 500-70|
|    chevrolet impala|        70| chevrolet impala-70|
|   plymouth fury iii|        70|plymouth fury iii-70|
|    pontiac catalina|        70| pontiac catalina-70|
|  amc ambassador dpl|        70|amc ambassador dp...|
| dodge challenger se|        70|dodge challenger ...|
|  plymouth 'cuda 340|        70|plymouth 'cuda 34...|
|chevrolet monte c...|        70|chevrolet monte c...|
|buick estate wago...|        70|buick estate wago...|
|toyota corona mar...|        70|toyota corona mar...|
|     plym

##selectExpr
- major difference is that, selectExpr is similar to SQL while select is not

In [0]:
"""
Important thing to notice here is that, column names with space will not work as expected with selectExpr. 
"""
df.selectExpr('mpg','cylinders').show()

+---+---------+
|mpg|cylinders|
+---+---------+
| 18|        8|
| 15|        8|
| 18|        8|
| 16|        8|
| 17|        8|
| 15|        8|
| 14|        8|
| 14|        8|
| 14|        8|
| 15|        8|
| 15|        8|
| 14|        8|
| 15|        8|
| 14|        8|
| 24|        4|
| 22|        6|
| 18|        6|
| 21|        6|
| 27|        4|
| 26|        4|
+---+---------+
only showing top 20 rows



In [0]:
df.selectExpr("mpg","model year").show() #Expected error

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-2882727871093078>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mdf[0m[0;34m.[0m[0mselectExpr[0m[0;34m([0m[0;34m"mpg"[0m[0;34m,[0m[0;34m"model year"[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m [0;31m#Expected error[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/sql/dataframe.py[0m in [0;36mselectExpr[0;34m(self, *expr)[0m
[1;32m   1841[0m         [0;32mif[0m [0mlen[0m[0;34m([0m[0mexpr[0m[0;34m)[0m [0;34m==[0m [0;36m1[0m [0;32mand[0m [0misinstance[0m[0;34m([0m[0mexpr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mlist[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1842[0m             [0mexpr[0m [0;34m=[0m [0mexpr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m

In [0]:
df.selectExpr("mpg","`model year`").show() #Use ` to resolve the error

+---+----------+
|mpg|model year|
+---+----------+
| 18|        70|
| 15|        70|
| 18|        70|
| 16|        70|
| 17|        70|
| 15|        70|
| 14|        70|
| 14|        70|
| 14|        70|
| 15|        70|
| 15|        70|
| 14|        70|
| 15|        70|
| 14|        70|
| 24|        70|
| 22|        70|
| 18|        70|
| 21|        70|
| 27|        70|
| 26|        70|
+---+----------+
only showing top 20 rows



In [0]:
df.selectExpr(col("mpg"),"model year").show() #Expected error - col can not be used with selectExpr

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-812323868429063>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mdf[0m[0;34m.[0m[0mselectExpr[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m"mpg"[0m[0;34m)[0m[0;34m,[0m[0;34m"model year"[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m [0;31m#Expected error - col can not be used with selectExpr[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/sql/dataframe.py[0m in [0;36mselectExpr[0;34m(self, *expr)[0m
[1;32m   1841[0m         [0;32mif[0m [0mlen[0m[0;34m([0m[0mexpr[0m[0;34m)[0m [0;34m==[0m [0;36m1[0m [0;32mand[0m [0misinstance[0m[0;34m([0m[0mexpr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mlist[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1842[0m             [0mexpr[0m [0;34m

In [0]:
df.selectExpr(df["mpg"],"model year").show() #Expected error - col can not be used with selectExpr

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-812323868429064>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mdf[0m[0;34m.[0m[0mselectExpr[0m[0;34m([0m[0mdf[0m[0;34m[[0m[0;34m"mpg"[0m[0;34m][0m[0;34m,[0m[0;34m"model year"[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m [0;31m#Expected error - col can not be used with selectExpr[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/sql/dataframe.py[0m in [0;36mselectExpr[0;34m(self, *expr)[0m
[1;32m   1841[0m         [0;32mif[0m [0mlen[0m[0;34m([0m[0mexpr[0m[0;34m)[0m [0;34m==[0m [0;36m1[0m [0;32mand[0m [0misinstance[0m[0;34m([0m[0mexpr[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mlist[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1842[0m             [0mexpr[0m [0;34m=

In [0]:
df.selectExpr("`car name`","`model year`","concat(`car name`,'-',`model year`) as concat_col").show()

+--------------------+----------+--------------------+
|            car name|model year|          concat_col|
+--------------------+----------+--------------------+
|chevrolet chevell...|        70|chevrolet chevell...|
|   buick skylark 320|        70|buick skylark 320-70|
|  plymouth satellite|        70|plymouth satellit...|
|       amc rebel sst|        70|    amc rebel sst-70|
|         ford torino|        70|      ford torino-70|
|    ford galaxie 500|        70| ford galaxie 500-70|
|    chevrolet impala|        70| chevrolet impala-70|
|   plymouth fury iii|        70|plymouth fury iii-70|
|    pontiac catalina|        70| pontiac catalina-70|
|  amc ambassador dpl|        70|amc ambassador dp...|
| dodge challenger se|        70|dodge challenger ...|
|  plymouth 'cuda 340|        70|plymouth 'cuda 34...|
|chevrolet monte c...|        70|chevrolet monte c...|
|buick estate wago...|        70|buick estate wago...|
|toyota corona mar...|        70|toyota corona mar...|
|     plym

In [0]:
df.alias("my_df").selectExpr("my_df.`car name`","my_df.`model year`","concat(my_df.`car name`,'-',my_df.`model year`) as concat_col").show() #Using alias with selectExpr

+--------------------+----------+--------------------+
|            car name|model year|          concat_col|
+--------------------+----------+--------------------+
|chevrolet chevell...|        70|chevrolet chevell...|
|   buick skylark 320|        70|buick skylark 320-70|
|  plymouth satellite|        70|plymouth satellit...|
|       amc rebel sst|        70|    amc rebel sst-70|
|         ford torino|        70|      ford torino-70|
|    ford galaxie 500|        70| ford galaxie 500-70|
|    chevrolet impala|        70| chevrolet impala-70|
|   plymouth fury iii|        70|plymouth fury iii-70|
|    pontiac catalina|        70| pontiac catalina-70|
|  amc ambassador dpl|        70|amc ambassador dp...|
| dodge challenger se|        70|dodge challenger ...|
|  plymouth 'cuda 340|        70|plymouth 'cuda 34...|
|chevrolet monte c...|        70|chevrolet monte c...|
|buick estate wago...|        70|buick estate wago...|
|toyota corona mar...|        70|toyota corona mar...|
|     plym

## Using sql

In [0]:
df.createOrReplaceTempView("auto_mpg")

In [0]:
spark.sql("""
SELECT
`car name`, 
`model year`,
concat(`car name`,'-',`model year`) as concat_col
FROM
auto_mpg
;
""").show()

+--------------------+----------+--------------------+
|            car name|model year|          concat_col|
+--------------------+----------+--------------------+
|chevrolet chevell...|        70|chevrolet chevell...|
|   buick skylark 320|        70|buick skylark 320-70|
|  plymouth satellite|        70|plymouth satellit...|
|       amc rebel sst|        70|    amc rebel sst-70|
|         ford torino|        70|      ford torino-70|
|    ford galaxie 500|        70| ford galaxie 500-70|
|    chevrolet impala|        70| chevrolet impala-70|
|   plymouth fury iii|        70|plymouth fury iii-70|
|    pontiac catalina|        70| pontiac catalina-70|
|  amc ambassador dpl|        70|amc ambassador dp...|
| dodge challenger se|        70|dodge challenger ...|
|  plymouth 'cuda 340|        70|plymouth 'cuda 34...|
|chevrolet monte c...|        70|chevrolet monte c...|
|buick estate wago...|        70|buick estate wago...|
|toyota corona mar...|        70|toyota corona mar...|
|     plym

## Columns

## Official Documentation Link
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.html

In [0]:
display(df)

mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
17.0,8,302.0,140,3449,10.5,70,1,ford torino
15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [0]:
df.printSchema()

root
 |-- mpg: string (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- displacement: string (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- model year: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- car name: string (nullable = true)



In [0]:
df.select(col("mpg").cast("int"), col("cylinders"), "origin").printSchema() #Casting a columns. mpg changed to int

root
 |-- mpg: integer (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- origin: string (nullable = true)

