In [1]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

In [None]:
# Obtain the SparkSession for this notebook (application name 'AApp');
# getOrCreate() returns an existing session if there is one.
spark = (
    SparkSession.builder
    .appName('AApp')
    .getOrCreate()
)

In [5]:
# Column names, in the same order as the fields of each record below.
columns = ["firstname", "lastname", "country", "state"]

# Sample records: one (firstname, lastname, country, state) tuple per person.
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

In [7]:
# Build the DataFrame from the in-memory records; `columns` supplies the column names.
df = spark.createDataFrame(data, columns)

In [9]:
# Number of rows in df (one per record in `data`).
df.count()

4

In [10]:
# Print the contents of df as a text table.
df.show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [13]:
# Project two columns by name. n=90 is larger than the row count, so every row is printed.
df.select(['firstname', 'lastname']).show(n=90)

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



In [15]:
# Project columns via Column objects looked up on df; print only the first 3 rows.
df.select(df['firstname'], df['lastname'], df['state']).show(n=3)

+---------+--------+-----+
|firstname|lastname|state|
+---------+--------+-----+
|    James|   Smith|   CA|
|  Michael|    Rose|   NY|
|   Robert|Williams|   CA|
+---------+--------+-----+
only showing top 3 rows



In [16]:
# Select columns by regular expression with DataFrame.colRegex().
# The pattern is wrapped in backticks; `^.*name$` keeps every column whose name
# ends in "name". (The earlier `name*` meant "nam" followed by zero or more "e",
# which would also have matched names such as "surnam" or "nameee".)
from pyspark.sql.functions import col
df.select(df.colRegex("`^.*name$`")).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



In [18]:
# Select all columns by passing the list of column names directly.
df.select(columns).show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [19]:
# "*" selects every column of df without naming them individually.
df.select("*").show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [22]:
# Replace string column values using regexp_replace().
from pyspark.sql.functions import regexp_replace

In [23]:
# Add 'lastname00': a copy of 'lastname' in which every match of the regex
# 'Rose' is replaced by 'Rosenew'. The result is only displayed, not assigned.
(
    df
    .withColumn('lastname00', regexp_replace('lastname', 'Rose', 'Rosenew'))
    .show()
)

+---------+--------+-------+-----+----------+
|firstname|lastname|country|state|lastname00|
+---------+--------+-------+-----+----------+
|    James|   Smith|    USA|   CA|     Smith|
|  Michael|    Rose|    USA|   NY|   Rosenew|
|   Robert|Williams|    USA|   CA|  Williams|
|    Maria|   Jones|    USA|   FL|     Jones|
+---------+--------+-------+-----+----------+



In [24]:
# df still has only its original columns: the withColumn() result in the
# previous cell was displayed but never assigned back to df.
df.show()


+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [28]:
# Add a new column 'newState' holding a copy of the 'state' column.
df3 = df.withColumn('newState', df['state'])

In [29]:
# Display df3: the original columns plus 'newState'.
df3.show()


+---------+--------+-------+-----+--------+
|firstname|lastname|country|state|newState|
+---------+--------+-------+-----+--------+
|    James|   Smith|    USA|   CA|      CA|
|  Michael|    Rose|    USA|   NY|      NY|
|   Robert|Williams|    USA|   CA|      CA|
|    Maria|   Jones|    USA|   FL|      FL|
+---------+--------+-------+-----+--------+



In [31]:
from pyspark.sql.functions import col,lit

# Add 'newState2', a column whose value is the same literal string on every row.
df4 = df.withColumn(
    'newState2',
    lit('AconstantState'),
)

In [32]:
# Display df4: the original columns plus the constant 'newState2' column.
df4.show()

+---------+--------+-------+-----+--------------+
|firstname|lastname|country|state|     newState2|
+---------+--------+-------+-----+--------------+
|    James|   Smith|    USA|   CA|AconstantState|
|  Michael|    Rose|    USA|   NY|AconstantState|
|   Robert|Williams|    USA|   CA|AconstantState|
|    Maria|   Jones|    USA|   FL|AconstantState|
+---------+--------+-------+-----+--------------+



In [35]:
# Drop a column: drop() returns a new DataFrame without 'newState2'.
df4.drop('newState2').show()
# Show df4 again: the drop result was not assigned, so df4 still has 'newState2'.
df4.show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



+---------+--------+-------+-----+--------------+
|firstname|lastname|country|state|     newState2|
+---------+--------+-------+-----+--------------+
|    James|   Smith|    USA|   CA|AconstantState|
|  Michael|    Rose|    USA|   NY|AconstantState|
|   Robert|Williams|    USA|   CA|AconstantState|
|    Maria|   Jones|    USA|   FL|AconstantState|
+---------+--------+-------+-----+--------------+

