In [0]:
import pyspark.sql.functions as f

In [0]:
# Read the holidays/events CSV, using its first row as the column names.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/FileStore/tables/store_sales/holidays_events.csv")
)

In [0]:
# Read the same CSV again, this time giving the header flag as the string "true";
# the result is bound to `df` once more.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/FileStore/tables/store_sales/holidays_events.csv")
)

Out[4]: DataFrame[date: string, type: string, locale: string, locale_name: string, description: string, transferred: string]

### Concat

In [0]:
# Print the built-in help text (signature and examples) for pyspark's concat.
help(f.concat)

Help on function concat in module pyspark.sql.functions:

concat(*cols: 'ColumnOrName') -> pyspark.sql.column.Column
    Concatenates multiple input columns together into a single column.
    The function works with strings, binary and compatible array columns.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat(df.s, df.d).alias('s')).collect()
    [Row(s='abcd123')]
    
    >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])
    >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect()
    [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]



In [0]:
# concat, columns referenced by name; the "-" separator is supplied via f.lit.
(
    df
    .withColumn("concat_col", f.concat("type", f.lit("-"), "locale"))
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+----------------+
|      date|   type|  locale|  locale_name|         description|transferred|      concat_col|
+----------+-------+--------+-------------+--------------------+-----------+----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday-Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday-Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday-Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday-Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday-Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday-Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday-Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion

In [0]:
# concat, columns referenced through f.col(...).
(
    df
    .withColumn(
        "concat_col",
        f.concat(f.col("type"), f.lit("-"), f.col("locale")),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+----------------+
|      date|   type|  locale|  locale_name|         description|transferred|      concat_col|
+----------+-------+--------+-------------+--------------------+-----------+----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday-Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday-Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday-Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday-Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday-Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday-Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday-Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion

In [0]:
# concat, columns referenced by indexing the DataFrame (df["..."]).
(
    df
    .withColumn(
        "concat_col",
        f.concat(df["type"], f.lit("-"), df["locale"]),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+----------------+
|      date|   type|  locale|  locale_name|         description|transferred|      concat_col|
+----------+-------+--------+-------------+--------------------+-----------+----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday-Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday-Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday-Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday-Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday-Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday-Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday-Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion

In [0]:
# concat, mixing a column name string with a df["..."] column object.
(
    df
    .withColumn(
        "concat_col",
        f.concat("type", f.lit("-"), df["locale"]),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+----------------+
|      date|   type|  locale|  locale_name|         description|transferred|      concat_col|
+----------+-------+--------+-------------+--------------------+-----------+----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday-Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday-Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday-Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday-Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday-Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday-Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday-Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion

In [0]:
# concat, mixing an f.col(...) column object with a column name string.
(
    df
    .withColumn(
        "concat_col",
        f.concat(f.col("type"), f.lit("-"), "locale"),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+----------------+
|      date|   type|  locale|  locale_name|         description|transferred|      concat_col|
+----------+-------+--------+-------------+--------------------+-----------+----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday-Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday-Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday-Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday-Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday-Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday-Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday-Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion

### concat_ws
Here we perform the same operation as above; the only difference is that ```lit``` is not needed. ```concat_ws``` lets us specify the separator to use when concatenating columns into a single column. As the cells below show, the first argument is the separator and the remaining arguments are valid column names or column objects.

In [0]:
# Print the built-in help text (signature and example) for pyspark's concat_ws.
help(f.concat_ws)

Help on function concat_ws in module pyspark.sql.functions:

concat_ws(sep: str, *cols: 'ColumnOrName') -> pyspark.sql.column.Column
    Concatenates multiple input string columns together into a single string column,
    using the given separator.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
    [Row(s='abcd-123')]



In [0]:
# concat_ws with "-" as the separator and columns referenced by name.
(
    df
    .withColumn("concat_col", f.concat_ws("-", "type", "locale"))
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+----------------+
|      date|   type|  locale|  locale_name|         description|transferred|      concat_col|
+----------+-------+--------+-------------+--------------------+-----------+----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday-Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday-Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday-Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday-Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday-Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday-Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday-Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion

In [0]:
# concat_ws with "**" as the separator and columns referenced as df["..."].
(
    df
    .withColumn(
        "concat_col",
        f.concat_ws("**", df["type"], df["locale"]),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|      date|   type|  locale|  locale_name|         description|transferred|       concat_col|
+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Holiday**Local|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Holiday**Regional|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Holiday**Local|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Holiday**Local|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Holiday**Local|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Holiday**Local|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Holiday**Local|
|2012-06-25|Holiday|Regional|     Imbabura|Provinc

In [0]:
# concat_ws fed by unpacking a list of column names (locale first, then type).
(
    df
    .withColumn(
        "concat_col",
        f.concat_ws("**", *["locale", "type"]),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|      date|   type|  locale|  locale_name|         description|transferred|       concat_col|
+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Local**Holiday|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Regional**Holiday|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Local**Holiday|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Local**Holiday|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Local**Holiday|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Local**Holiday|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Local**Holiday|
|2012-06-25|Holiday|Regional|     Imbabura|Provinc

In [0]:
# concat_ws fed by unpacking a list of df["..."] column objects.
(
    df
    .withColumn(
        "concat_col",
        f.concat_ws("**", *[df["locale"], df["type"]]),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|      date|   type|  locale|  locale_name|         description|transferred|       concat_col|
+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Local**Holiday|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Regional**Holiday|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Local**Holiday|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Local**Holiday|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Local**Holiday|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Local**Holiday|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Local**Holiday|
|2012-06-25|Holiday|Regional|     Imbabura|Provinc

In [0]:
# concat_ws mixing a column name string with a df["..."] column object.
(
    df
    .withColumn(
        "concat_col",
        f.concat_ws("**", "locale", df["type"]),
    )
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|      date|   type|  locale|  locale_name|         description|transferred|       concat_col|
+----------+-------+--------+-------------+--------------------+-----------+-----------------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|   Local**Holiday|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|Regional**Holiday|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|   Local**Holiday|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|   Local**Holiday|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|   Local**Holiday|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|   Local**Holiday|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|   Local**Holiday|
|2012-06-25|Holiday|Regional|     Imbabura|Provinc

### lower

In [0]:
# Print the built-in help text for pyspark's lower.
help(f.lower)

Help on function lower in module pyspark.sql.functions:

lower(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Converts a string expression to lower case.
    
    .. versionadded:: 1.5



In [0]:
# Lower-case locale_name (referenced by name) and show it next to the original.
(
    df
    .withColumn("lower_locale_name", f.lower("locale_name"))
    .select(["locale_name", "lower_locale_name"])
    .show()
)

+-------------+-----------------+
|  locale_name|lower_locale_name|
+-------------+-----------------+
|        Manta|            manta|
|     Cotopaxi|         cotopaxi|
|       Cuenca|           cuenca|
|     Libertad|         libertad|
|     Riobamba|         riobamba|
|         Puyo|             puyo|
|     Guaranda|         guaranda|
|     Imbabura|         imbabura|
|    Latacunga|        latacunga|
|      Machala|          machala|
|Santo Domingo|    santo domingo|
|    El Carmen|        el carmen|
|      Cayambe|          cayambe|
|   Esmeraldas|       esmeraldas|
|      Ecuador|          ecuador|
|     Riobamba|         riobamba|
|       Ambato|           ambato|
|       Ibarra|           ibarra|
|      Quevedo|          quevedo|
|      Ecuador|          ecuador|
+-------------+-----------------+
only showing top 20 rows



In [0]:
# Lower-case locale_name, referenced through f.col(...).
(
    df
    .withColumn("lower_locale_name", f.lower(f.col("locale_name")))
    .select(["locale_name", "lower_locale_name"])
    .show()
)

+-------------+-----------------+
|  locale_name|lower_locale_name|
+-------------+-----------------+
|        Manta|            manta|
|     Cotopaxi|         cotopaxi|
|       Cuenca|           cuenca|
|     Libertad|         libertad|
|     Riobamba|         riobamba|
|         Puyo|             puyo|
|     Guaranda|         guaranda|
|     Imbabura|         imbabura|
|    Latacunga|        latacunga|
|      Machala|          machala|
|Santo Domingo|    santo domingo|
|    El Carmen|        el carmen|
|      Cayambe|          cayambe|
|   Esmeraldas|       esmeraldas|
|      Ecuador|          ecuador|
|     Riobamba|         riobamba|
|       Ambato|           ambato|
|       Ibarra|           ibarra|
|      Quevedo|          quevedo|
|      Ecuador|          ecuador|
+-------------+-----------------+
only showing top 20 rows



In [0]:
# Lower-case locale_name, referenced as df["locale_name"].
(
    df
    .withColumn("lower_locale_name", f.lower(df["locale_name"]))
    .select(["locale_name", "lower_locale_name"])
    .show()
)

+-------------+-----------------+
|  locale_name|lower_locale_name|
+-------------+-----------------+
|        Manta|            manta|
|     Cotopaxi|         cotopaxi|
|       Cuenca|           cuenca|
|     Libertad|         libertad|
|     Riobamba|         riobamba|
|         Puyo|             puyo|
|     Guaranda|         guaranda|
|     Imbabura|         imbabura|
|    Latacunga|        latacunga|
|      Machala|          machala|
|Santo Domingo|    santo domingo|
|    El Carmen|        el carmen|
|      Cayambe|          cayambe|
|   Esmeraldas|       esmeraldas|
|      Ecuador|          ecuador|
|     Riobamba|         riobamba|
|       Ambato|           ambato|
|       Ibarra|           ibarra|
|      Quevedo|          quevedo|
|      Ecuador|          ecuador|
+-------------+-----------------+
only showing top 20 rows



### Upper

In [0]:
# Print the built-in help text for pyspark's upper.
help(f.upper)

Help on function upper in module pyspark.sql.functions:

upper(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Converts a string expression to upper case.
    
    .. versionadded:: 1.5



In [0]:
# Upper-case locale_name (referenced by name); select() receives the
# column names unpacked from a list.
(
    df
    .withColumn("upper_locale_name", f.upper("locale_name"))
    .select(*["locale_name", "upper_locale_name"])
    .show()
)

+-------------+-----------------+
|  locale_name|upper_locale_name|
+-------------+-----------------+
|        Manta|            MANTA|
|     Cotopaxi|         COTOPAXI|
|       Cuenca|           CUENCA|
|     Libertad|         LIBERTAD|
|     Riobamba|         RIOBAMBA|
|         Puyo|             PUYO|
|     Guaranda|         GUARANDA|
|     Imbabura|         IMBABURA|
|    Latacunga|        LATACUNGA|
|      Machala|          MACHALA|
|Santo Domingo|    SANTO DOMINGO|
|    El Carmen|        EL CARMEN|
|      Cayambe|          CAYAMBE|
|   Esmeraldas|       ESMERALDAS|
|      Ecuador|          ECUADOR|
|     Riobamba|         RIOBAMBA|
|       Ambato|           AMBATO|
|       Ibarra|           IBARRA|
|      Quevedo|          QUEVEDO|
|      Ecuador|          ECUADOR|
+-------------+-----------------+
only showing top 20 rows



In [0]:
# Upper-case locale_name, referenced through f.col(...).
(
    df
    .withColumn("upper_locale_name", f.upper(f.col("locale_name")))
    .select(*["locale_name", "upper_locale_name"])
    .show()
)

+-------------+-----------------+
|  locale_name|upper_locale_name|
+-------------+-----------------+
|        Manta|            MANTA|
|     Cotopaxi|         COTOPAXI|
|       Cuenca|           CUENCA|
|     Libertad|         LIBERTAD|
|     Riobamba|         RIOBAMBA|
|         Puyo|             PUYO|
|     Guaranda|         GUARANDA|
|     Imbabura|         IMBABURA|
|    Latacunga|        LATACUNGA|
|      Machala|          MACHALA|
|Santo Domingo|    SANTO DOMINGO|
|    El Carmen|        EL CARMEN|
|      Cayambe|          CAYAMBE|
|   Esmeraldas|       ESMERALDAS|
|      Ecuador|          ECUADOR|
|     Riobamba|         RIOBAMBA|
|       Ambato|           AMBATO|
|       Ibarra|           IBARRA|
|      Quevedo|          QUEVEDO|
|      Ecuador|          ECUADOR|
+-------------+-----------------+
only showing top 20 rows



In [0]:
# Upper-case locale_name, referenced as df["locale_name"]; select() receives a list.
(
    df
    .withColumn("upper_locale_name", f.upper(df["locale_name"]))
    .select(["locale_name", "upper_locale_name"])
    .show()
)

+-------------+-----------------+
|  locale_name|upper_locale_name|
+-------------+-----------------+
|        Manta|            MANTA|
|     Cotopaxi|         COTOPAXI|
|       Cuenca|           CUENCA|
|     Libertad|         LIBERTAD|
|     Riobamba|         RIOBAMBA|
|         Puyo|             PUYO|
|     Guaranda|         GUARANDA|
|     Imbabura|         IMBABURA|
|    Latacunga|        LATACUNGA|
|      Machala|          MACHALA|
|Santo Domingo|    SANTO DOMINGO|
|    El Carmen|        EL CARMEN|
|      Cayambe|          CAYAMBE|
|   Esmeraldas|       ESMERALDAS|
|      Ecuador|          ECUADOR|
|     Riobamba|         RIOBAMBA|
|       Ambato|           AMBATO|
|       Ibarra|           IBARRA|
|      Quevedo|          QUEVEDO|
|      Ecuador|          ECUADOR|
+-------------+-----------------+
only showing top 20 rows



### Chaining lower/upper to be applied on multiple columns

In [0]:
# Chain several case conversions, each overwriting its own column in place:
# type and locale_name become upper case, locale becomes lower case.
# The chain is wrapped in parentheses so it can span lines without backslash
# continuations; the previous layout ended one line with a bare "." and began
# the next with a stray "\", which is a SyntaxError.
(
    df
    .withColumn("type", f.upper("type"))
    .withColumn("locale_name", f.upper("locale_name"))
    .withColumn("locale", f.lower("locale"))
    .select("*")
    .show()
)

+----------+-------+--------+-------------+--------------------+-----------+
|      date|   type|  locale|  locale_name|         description|transferred|
+----------+-------+--------+-------------+--------------------+-----------+
|2012-03-02|HOLIDAY|   local|        MANTA|  Fundacion de Manta|      False|
|2012-04-01|HOLIDAY|regional|     COTOPAXI|Provincializacion...|      False|
|2012-04-12|HOLIDAY|   local|       CUENCA| Fundacion de Cuenca|      False|
|2012-04-14|HOLIDAY|   local|     LIBERTAD|Cantonizacion de ...|      False|
|2012-04-21|HOLIDAY|   local|     RIOBAMBA|Cantonizacion de ...|      False|
|2012-05-12|HOLIDAY|   local|         PUYO|Cantonizacion del...|      False|
|2012-06-23|HOLIDAY|   local|     GUARANDA|Cantonizacion de ...|      False|
|2012-06-25|HOLIDAY|regional|     IMBABURA|Provincializacion...|      False|
|2012-06-25|HOLIDAY|   local|    LATACUNGA|Cantonizacion de ...|      False|
|2012-06-25|HOLIDAY|   local|      MACHALA|Fundacion de Machala|      False|

+----------+-------+--------+-------------+--------------------+-----------+
|      date|   type|  locale|  locale_name|         description|transferred|
+----------+-------+--------+-------------+--------------------+-----------+
|2012-03-02|Holiday|   Local|        Manta|  Fundacion de Manta|      False|
|2012-04-01|Holiday|Regional|     Cotopaxi|Provincializacion...|      False|
|2012-04-12|Holiday|   Local|       Cuenca| Fundacion de Cuenca|      False|
|2012-04-14|Holiday|   Local|     Libertad|Cantonizacion de ...|      False|
|2012-04-21|Holiday|   Local|     Riobamba|Cantonizacion de ...|      False|
|2012-05-12|Holiday|   Local|         Puyo|Cantonizacion del...|      False|
|2012-06-23|Holiday|   Local|     Guaranda|Cantonizacion de ...|      False|
|2012-06-25|Holiday|Regional|     Imbabura|Provincializacion...|      False|
|2012-06-25|Holiday|   Local|    Latacunga|Cantonizacion de ...|      False|
|2012-06-25|Holiday|   Local|      Machala|Fundacion de Machala|      False|

### initcap

In [0]:
# Print the built-in help text (signature and example) for pyspark's initcap.
help(f.initcap)

Help on function initcap in module pyspark.sql.functions:

initcap(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Translate the first letter of each word to upper case in the sentence.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()
    [Row(v='Ab Cd')]



In [0]:
# Capitalise the first letter of each word in description (referenced by name).
(
    df
    .withColumn("description_initcap", f.initcap("description"))
    .select("description", "description_initcap")
    .show()
)

+--------------------+--------------------+
|         description| description_initcap|
+--------------------+--------------------+
|  Fundacion de Manta|  Fundacion De Manta|
|Provincializacion...|Provincializacion...|
| Fundacion de Cuenca| Fundacion De Cuenca|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion del...|Cantonizacion Del...|
|Cantonizacion de ...|Cantonizacion De ...|
|Provincializacion...|Provincializacion...|
|Cantonizacion de ...|Cantonizacion De ...|
|Fundacion de Machala|Fundacion De Machala|
|Fundacion de Sant...|Fundacion De Sant...|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion de ...|Cantonizacion De ...|
|Fundacion de Esme...|Fundacion De Esme...|
|Primer Grito de I...|Primer Grito De I...|
|Fundacion de Riob...|Fundacion De Riob...|
| Fundacion de Ambato| Fundacion De Ambato|
| Fundacion de Ibarra| Fundacion De Ibarra|
|Cantonizacion de ...|Cantonizacion De ...|
|Independencia de ...|Independen

In [0]:
# initcap on description, referenced through f.col(...).
(
    df
    .withColumn("description_initcap", f.initcap(f.col("description")))
    .select("description", "description_initcap")
    .show()
)

+--------------------+--------------------+
|         description| description_initcap|
+--------------------+--------------------+
|  Fundacion de Manta|  Fundacion De Manta|
|Provincializacion...|Provincializacion...|
| Fundacion de Cuenca| Fundacion De Cuenca|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion del...|Cantonizacion Del...|
|Cantonizacion de ...|Cantonizacion De ...|
|Provincializacion...|Provincializacion...|
|Cantonizacion de ...|Cantonizacion De ...|
|Fundacion de Machala|Fundacion De Machala|
|Fundacion de Sant...|Fundacion De Sant...|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion de ...|Cantonizacion De ...|
|Fundacion de Esme...|Fundacion De Esme...|
|Primer Grito de I...|Primer Grito De I...|
|Fundacion de Riob...|Fundacion De Riob...|
| Fundacion de Ambato| Fundacion De Ambato|
| Fundacion de Ibarra| Fundacion De Ibarra|
|Cantonizacion de ...|Cantonizacion De ...|
|Independencia de ...|Independen

In [0]:
# initcap on description, referenced as df["description"].
(
    df
    .withColumn("description_initcap", f.initcap(df["description"]))
    .select("description", "description_initcap")
    .show()
)

+--------------------+--------------------+
|         description| description_initcap|
+--------------------+--------------------+
|  Fundacion de Manta|  Fundacion De Manta|
|Provincializacion...|Provincializacion...|
| Fundacion de Cuenca| Fundacion De Cuenca|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion del...|Cantonizacion Del...|
|Cantonizacion de ...|Cantonizacion De ...|
|Provincializacion...|Provincializacion...|
|Cantonizacion de ...|Cantonizacion De ...|
|Fundacion de Machala|Fundacion De Machala|
|Fundacion de Sant...|Fundacion De Sant...|
|Cantonizacion de ...|Cantonizacion De ...|
|Cantonizacion de ...|Cantonizacion De ...|
|Fundacion de Esme...|Fundacion De Esme...|
|Primer Grito de I...|Primer Grito De I...|
|Fundacion de Riob...|Fundacion De Riob...|
| Fundacion de Ambato| Fundacion De Ambato|
| Fundacion de Ibarra| Fundacion De Ibarra|
|Cantonizacion de ...|Cantonizacion De ...|
|Independencia de ...|Independen

### Length

In [0]:
# Print the built-in help text (signature and example) for pyspark's length.
help(f.length)

Help on function length in module pyspark.sql.functions:

length(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Computes the character length of string data or number of bytes of binary data.
    The length of character data includes the trailing spaces. The length of binary data
    includes binary zeros.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()
    [Row(length=4)]



In [0]:
# Character length of locale_name (referenced by name), shown beside the value.
(
    df
    .withColumn("locale_name_len", f.length("locale_name"))
    .select("locale_name", "locale_name_len")
    .show()
)

+-------------+---------------+
|  locale_name|locale_name_len|
+-------------+---------------+
|        Manta|              5|
|     Cotopaxi|              8|
|       Cuenca|              6|
|     Libertad|              8|
|     Riobamba|              8|
|         Puyo|              4|
|     Guaranda|              8|
|     Imbabura|              8|
|    Latacunga|              9|
|      Machala|              7|
|Santo Domingo|             13|
|    El Carmen|              9|
|      Cayambe|              7|
|   Esmeraldas|             10|
|      Ecuador|              7|
|     Riobamba|              8|
|       Ambato|              6|
|       Ibarra|              6|
|      Quevedo|              7|
|      Ecuador|              7|
+-------------+---------------+
only showing top 20 rows



In [0]:
# length on locale_name, referenced as df["locale_name"].
(
    df
    .withColumn("locale_name_len", f.length(df["locale_name"]))
    .select("locale_name", "locale_name_len")
    .show()
)

+-------------+---------------+
|  locale_name|locale_name_len|
+-------------+---------------+
|        Manta|              5|
|     Cotopaxi|              8|
|       Cuenca|              6|
|     Libertad|              8|
|     Riobamba|              8|
|         Puyo|              4|
|     Guaranda|              8|
|     Imbabura|              8|
|    Latacunga|              9|
|      Machala|              7|
|Santo Domingo|             13|
|    El Carmen|              9|
|      Cayambe|              7|
|   Esmeraldas|             10|
|      Ecuador|              7|
|     Riobamba|              8|
|       Ambato|              6|
|       Ibarra|              6|
|      Quevedo|              7|
|      Ecuador|              7|
+-------------+---------------+
only showing top 20 rows



In [0]:
# length on locale_name, referenced through f.col(...).
(
    df
    .withColumn("locale_name_len", f.length(f.col("locale_name")))
    .select("locale_name", "locale_name_len")
    .show()
)

+-------------+---------------+
|  locale_name|locale_name_len|
+-------------+---------------+
|        Manta|              5|
|     Cotopaxi|              8|
|       Cuenca|              6|
|     Libertad|              8|
|     Riobamba|              8|
|         Puyo|              4|
|     Guaranda|              8|
|     Imbabura|              8|
|    Latacunga|              9|
|      Machala|              7|
|Santo Domingo|             13|
|    El Carmen|              9|
|      Cayambe|              7|
|   Esmeraldas|             10|
|      Ecuador|              7|
|     Riobamba|              8|
|       Ambato|              6|
|       Ibarra|              6|
|      Quevedo|              7|
|      Ecuador|              7|
+-------------+---------------+
only showing top 20 rows

