# PySpark String Functions and Concatenation

This notebook demonstrates various string functions in PySpark.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, concat_ws, initcap, lower, upper

# Create a Spark session
spark = SparkSession.builder.appName("SortingAndStringFunctions").getOrCreate()

# Sample data
data = [
    ("USA", "North America", 100, 50.5),
    ("India", "Asia", 300, 20.0),
    ("Germany", "Europe", 200, 30.5),
    ("Australia", "Oceania", 150, 60.0),
    ("Japan", "Asia", 120, 45.0),
    ("Brazil", "South America", 180, 25.0)
]

# Define the schema
columns = ["Country", "Region", "UnitsSold", "UnitPrice"]

# Create DataFrame
df = spark.createDataFrame(data, columns)
df.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 3, Finished, Available, Finished)

+---------+-------------+---------+---------+
|  Country|       Region|UnitsSold|UnitPrice|
+---------+-------------+---------+---------+
|      USA|North America|      100|     50.5|
|    India|         Asia|      300|     20.0|
|  Germany|       Europe|      200|     30.5|
|Australia|      Oceania|      150|     60.0|
|    Japan|         Asia|      120|     45.0|
|   Brazil|South America|      180|     25.0|
+---------+-------------+---------+---------+



## Convert the First Letter of Each Word to Uppercase

In [2]:
df_initcap = df.select(col("Country"), initcap(col("Country")).alias("Country_InitCap"))
df_initcap.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 4, Finished, Available, Finished)

+---------+---------------+
|  Country|Country_InitCap|
+---------+---------------+
|      USA|            Usa|
|    India|          India|
|  Germany|        Germany|
|Australia|      Australia|
|    Japan|          Japan|
|   Brazil|         Brazil|
+---------+---------------+



## Convert All Text to Lowercase

In [3]:
df_lower = df.select(col("Country"), lower(col("Country")).alias("Country_Lower"))
df_lower.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 5, Finished, Available, Finished)

+---------+-------------+
|  Country|Country_Lower|
+---------+-------------+
|      USA|          usa|
|    India|        india|
|  Germany|      germany|
|Australia|    australia|
|    Japan|        japan|
|   Brazil|       brazil|
+---------+-------------+



## Convert All Text to Uppercase

In [4]:
df_upper = df.select(col("Country"), upper(col("Country")).alias("Country_Upper"))
df_upper.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 6, Finished, Available, Finished)

+---------+-------------+
|  Country|Country_Upper|
+---------+-------------+
|      USA|          USA|
|    India|        INDIA|
|  Germany|      GERMANY|
|Australia|    AUSTRALIA|
|    Japan|        JAPAN|
|   Brazil|       BRAZIL|
+---------+-------------+



## Concatenate Two Columns Without Separator

In [5]:
df_concat = df.select(col("Region"), col("Country"), concat(col("Region"), col("Country")).alias("Concatenated"))
df_concat.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 7, Finished, Available, Finished)

+-------------+---------+-------------------+
|       Region|  Country|       Concatenated|
+-------------+---------+-------------------+
|North America|      USA|   North AmericaUSA|
|         Asia|    India|          AsiaIndia|
|       Europe|  Germany|      EuropeGermany|
|      Oceania|Australia|   OceaniaAustralia|
|         Asia|    Japan|          AsiaJapan|
|South America|   Brazil|South AmericaBrazil|
+-------------+---------+-------------------+



## Concatenate Two Columns With Separator

In [6]:
df_concat_sep = df.select(col("Region"), col("Country"), concat_ws(" | ", col("Region"), col("Country")).alias("Concatenated_With_Separator"))
df_concat_sep.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 8, Finished, Available, Finished)

+-------------+---------+---------------------------+
|       Region|  Country|Concatenated_With_Separator|
+-------------+---------+---------------------------+
|North America|      USA|        North America | USA|
|         Asia|    India|               Asia | India|
|       Europe|  Germany|           Europe | Germany|
|      Oceania|Australia|        Oceania | Australia|
|         Asia|    Japan|               Asia | Japan|
|South America|   Brazil|       South America | B...|
+-------------+---------+---------------------------+



## Create a New Concatenated Column

In [7]:
df_with_concat_column = df.withColumn("Region_Country", concat_ws(" ", col("Region"), col("Country")))
df_with_concat_column.show()

StatementMeta(, e21b19ea-78d3-472c-8445-65bd56051f1d, 9, Finished, Available, Finished)

+---------+-------------+---------+---------+--------------------+
|  Country|       Region|UnitsSold|UnitPrice|      Region_Country|
+---------+-------------+---------+---------+--------------------+
|      USA|North America|      100|     50.5|   North America USA|
|    India|         Asia|      300|     20.0|          Asia India|
|  Germany|       Europe|      200|     30.5|      Europe Germany|
|Australia|      Oceania|      150|     60.0|   Oceania Australia|
|    Japan|         Asia|      120|     45.0|          Asia Japan|
|   Brazil|South America|      180|     25.0|South America Brazil|
+---------+-------------+---------+---------+--------------------+

