In [0]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the cells below.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample rows; each tuple lines up positionally with `columns`.
data = [("Utsav","Singh","India","Uttar Pradesh"),
        ("Sourav","Goyal","Dubai","UAE"),
        ("Akash","Dua","India","Uttar Pradesh"),
        ("Manisha","Sreepada","India","Hyderabad"),
        ("Vaishnavi","Choudhary","India","Bengaluru")]
columns = ["FirstName","LastName","Country","State"]

# Build the DataFrame once. The cell previously called createDataFrame twice
# with identical arguments, so the first result was simply thrown away.
df = spark.createDataFrame(data=data, schema=columns)

# truncate=False prints full cell values instead of shortening long strings.
df.show(truncate=False)

+---------+---------+-------+-------------+
|FirstName|LastName |Country|State        |
+---------+---------+-------+-------------+
|Utsav    |Singh    |India  |Uttar Pradesh|
|Sourav   |Goyal    |Dubai  |UAE          |
|Akash    |Dua      |India  |Uttar Pradesh|
|Manisha  |Sreepada |India  |Hyderabad    |
|Vaishnavi|Choudhary|India  |Bengaluru    |
+---------+---------+-------+-------------+



In [0]:
from pyspark.sql.functions import *
# Select by column-name strings. The frame's columns are FirstName/LastName;
# the lowercase spellings still resolve, and the printed header echoes the
# spelling that was passed in (see the output below).
df.select("firstname","lastname").show()
# Select via attribute access on the DataFrame.
df.select(df.FirstName,df.LastName).show()
# Select via bracket (item) access on the DataFrame.
df.select(df["firstname"],df["lastname"]).show()

# By using the col() function
from pyspark.sql.functions import col
df.select(col("firstname"),col("lastname")).show()

# Select columns by regular expression; the pattern is wrapped in backticks.
# The pattern used to end in `name*`, where `*` quantifies only the final "e"
# (so "nam", "name", "nameee" would all be accepted). Anchoring with `$`
# expresses the intent: every column whose name ends in "name".
df.select(df.colRegex("`^.*name$`")).show()

+---------+---------+
|firstname| lastname|
+---------+---------+
|    Utsav|    Singh|
|   Sourav|    Goyal|
|    Akash|      Dua|
|  Manisha| Sreepada|
|Vaishnavi|Choudhary|
+---------+---------+

+---------+---------+
|FirstName| LastName|
+---------+---------+
|    Utsav|    Singh|
|   Sourav|    Goyal|
|    Akash|      Dua|
|  Manisha| Sreepada|
|Vaishnavi|Choudhary|
+---------+---------+

+---------+---------+
|firstname| lastname|
+---------+---------+
|    Utsav|    Singh|
|   Sourav|    Goyal|
|    Akash|      Dua|
|  Manisha| Sreepada|
|Vaishnavi|Choudhary|
+---------+---------+

+---------+---------+
|firstname| lastname|
+---------+---------+
|    Utsav|    Singh|
|   Sourav|    Goyal|
|    Akash|      Dua|
|  Manisha| Sreepada|
|Vaishnavi|Choudhary|
+---------+---------+

+---------+---------+
|FirstName| LastName|
+---------+---------+
|    Utsav|    Singh|
|   Sourav|    Goyal|
|    Akash|      Dua|
|  Manisha| Sreepada|
|Vaishnavi|Choudhary|
+---------+---------+



In [0]:
# Passing "*" to select() returns every column of df.
df.select("*").show()

+---------+---------+-------+-------------+
|FirstName| LastName|Country|        State|
+---------+---------+-------+-------------+
|    Utsav|    Singh|  India|Uttar Pradesh|
|   Sourav|    Goyal|  Dubai|          UAE|
|    Akash|      Dua|  India|Uttar Pradesh|
|  Manisha| Sreepada|  India|    Hyderabad|
|Vaishnavi|Choudhary|  India|    Bengaluru|
+---------+---------+-------+-------------+



In [0]:
# Slice the column list by position: [2:4] picks the third and fourth columns
# (Country and State in the output below). show(3) prints only the first 3 rows.
df.select(df.columns[2:4]).show(3)

+-------+-------------+
|Country|        State|
+-------+-------------+
|  India|Uttar Pradesh|
|  Dubai|          UAE|
|  India|Uttar Pradesh|
+-------+-------------+
only showing top 3 rows



In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Each record is (name, state, gender), where name is itself a
# (firstname, middlename, lastname) tuple. None and "" are kept on purpose
# to show how missing and empty parts are displayed.
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

# Schema built field by field: `name` is a nested struct of three nullable
# strings, followed by two nullable top-level string columns.
schema = (
    StructType()
    .add('name',
         StructType()
         .add('firstname', StringType(), True)
         .add('middlename', StringType(), True)
         .add('lastname', StringType(), True))
    .add('state', StringType(), True)
    .add('gender', StringType(), True)
)

df2 = spark.createDataFrame(data, schema)
df2.printSchema()
# Print every column without truncating the struct values.
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+-----+------+
|name                  |state|gender|
+----------------------+-----+------+
|{James, null, Smith}  |OH   |M     |
|{Anna, Rose, }        |NY   |F     |
|{Julia, , Williams}   |OH   |F     |
|{Maria, Anne, Jones}  |NY   |M     |
|{Jen, Mary, Brown}    |NY   |M     |
|{Mike, Mary, Williams}|OH   |M     |
+----------------------+-----+------+



In [0]:
# "name.*" expands the `name` struct so that each of its fields
# (firstname, middlename, lastname) becomes its own column.
df2.select("name.*").show(truncate=False)

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
|James    |null      |Smith   |
|Anna     |Rose      |        |
|Julia    |          |Williams|
|Maria    |Anne      |Jones   |
|Jen      |Mary      |Brown   |
|Mike     |Mary      |Williams|
+---------+----------+--------+



In [0]:
# Selecting the struct column by name returns it as a single nested column.
df2.select("name").show(truncate=False)

+----------------------+
|name                  |
+----------------------+
|{James, null, Smith}  |
|{Anna, Rose, }        |
|{Julia, , Williams}   |
|{Maria, Anne, Jones}  |
|{Jen, Mary, Brown}    |
|{Mike, Mary, Williams}|
+----------------------+



In [0]:
# Dotted paths pick individual fields out of the `name` struct; the resulting
# columns are named after the leaf field (firstname, lastname).
df2.select("name.firstname","name.lastname").show(truncate=False)

+---------+--------+
|firstname|lastname|
+---------+--------+
|James    |Smith   |
|Anna     |        |
|Julia    |Williams|
|Maria    |Jones   |
|Jen      |Brown   |
|Mike     |Williams|
+---------+--------+



In [0]:
import pyspark
from pyspark.sql import SparkSession

# Imports needed further down in this cell, gathered in one place.
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# ---- Part 1: flat DataFrame ------------------------------------------------
columns = ["firstname", "lastname", "country", "state"]
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]
df = spark.createDataFrame(data, columns)
df.show(truncate=False)

# One column, then two columns, selected by name.
df.select("firstname").show()
df.select(["firstname", "lastname"]).show()

# The same two columns referenced as attributes of the DataFrame object.
df.select(df.firstname, df.lastname).show()

# The same two columns built with the col() function.
df.select(col("firstname"), col("lastname")).show()

# ---- Part 2: DataFrame with a nested struct column -------------------------
# Each record is (name, state, gender); name is (firstname, middlename, lastname).
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

# `name` is a struct of three nullable strings; state and gender are
# nullable top-level strings.
schema = (
    StructType()
    .add('name',
         StructType()
         .add('firstname', StringType(), True)
         .add('middlename', StringType(), True)
         .add('lastname', StringType(), True))
    .add('state', StringType(), True)
    .add('gender', StringType(), True)
)

df2 = spark.createDataFrame(data, schema)
df2.printSchema()
# Print every column without truncating the struct values.
df2.show(truncate=False)

# Whole struct, then two of its fields, then all of its fields expanded.
df2.select("name").show(truncate=False)
df2.select("name.firstname", "name.lastname").show(truncate=False)
df2.select("name.*").show(truncate=False)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+

+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
+---------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string