In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session through the builder chain; getOrCreate() returns the
# already-active session when one exists, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

## Trabajo con fechas y timestamps (a partir de strings)

In [20]:
# Load the sample dataset; per the schema printed below, all four columns
# (date, timestamp, date_str, ts_str) arrive as plain strings.
df = spark.read.parquet('/content/convertir')

In [6]:
# Inspect column names and types before converting anything.
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- date_str: string (nullable = true)
 |-- ts_str: string (nullable = true)



In [17]:
# truncate=False prints the full cell contents instead of shortening long values.
df.show(truncate=False)

+----------+-----------------------+----------+----------------+
|date      |timestamp              |date_str  |ts_str          |
+----------+-----------------------+----------+----------------+
|2021-01-01|2021-01-01 20:10:50.723|01-01-2021|18-08-2021 46:58|
+----------+-----------------------+----------+----------------+



In [None]:
from pyspark.sql.functions import col, to_date, to_timestamp

In [21]:
from pyspark.sql import functions as F

In [24]:
# Turn the string columns into real date / timestamp columns.
# `date` and `timestamp` are parsed without an explicit pattern; the two
# `*_str` columns use a day-first layout, so the pattern is given explicitly.
# In the last pattern 'mm:ss' means minutes:seconds (no hour field), which is
# why '46:58' becomes 00:46:58 in the output below.
df2 = df.select(
    F.to_date('date').alias('date1'),
    F.to_timestamp('timestamp').alias('ts1'),
    F.to_date('date_str', format='dd-MM-yyyy').alias('date2'),
    F.to_timestamp('ts_str', format='dd-MM-yyyy mm:ss').alias('ts2'),
)

In [28]:
# Confirm the converted columns now carry date / timestamp types, then show the values.
df2.printSchema()
df2.show(truncate=False)

root
 |-- date1: date (nullable = true)
 |-- ts1: timestamp (nullable = true)
 |-- date2: date (nullable = true)
 |-- ts2: timestamp (nullable = true)

+----------+-----------------------+----------+-------------------+
|date1     |ts1                    |date2     |ts2                |
+----------+-----------------------+----------+-------------------+
|2021-01-01|2021-01-01 20:10:50.723|2021-01-01|2021-08-18 00:46:58|
+----------+-----------------------+----------+-------------------+



In [33]:
# Render the date column back to text using a day-first pattern.
df2.select(F.date_format('date1', 'dd-MM-yyyy')).show()

+------------------------------+
|date_format(date1, dd-MM-yyyy)|
+------------------------------+
|                    01-01-2021|
+------------------------------+



In [48]:
# Rebinds `df` to a different dataset (nombre / fecha_ingreso / fecha_salida /
# baja_sistema); the cells below operate on this one, not on the previous `df`.
df = spark.read.parquet('/content/calculo.parquet')

In [49]:
# All columns, including the date-like ones, are stored as strings.
df.printSchema()

root
 |-- nombre: string (nullable = true)
 |-- fecha_ingreso: string (nullable = true)
 |-- fecha_salida: string (nullable = true)
 |-- baja_sistema: string (nullable = true)



In [50]:
# Date arithmetic on the (string) date columns.
# - date_diff(end, start) returns end - start in days. fecha_ingreso is passed
#   as `end` here, so the values come out negative (see output below).
# - months_between(date1, date2) likewise returns date1 - date2, in months.
# - last_day gives the last calendar day of the month of fecha_salida.
df.select(
    F.date_diff('fecha_ingreso', 'fecha_salida').alias('diff ingreso-salida'),
    F.months_between('fecha_ingreso', 'fecha_salida').alias('diff ingreso-salida months'),
    F.last_day('fecha_salida').alias('last day'),
).show()

+-------------------+--------------------------+----------+
|diff ingreso-salida|diff ingreso-salida months|  last day|
+-------------------+--------------------------+----------+
|               -317|              -10.41935484|2021-11-30|
|               -292|               -9.61290323|2021-11-30|
+-------------------+--------------------------+----------+



In [44]:
# Show the raw rows that the date calculations in this section are based on.
df.show()

+------+-------------+------------+-------------------+
|nombre|fecha_ingreso|fecha_salida|       baja_sistema|
+------+-------------+------------+-------------------+
|  Jose|   2021-01-01|  2021-11-14|2021-10-14 15:35:59|
|Mayara|   2021-02-06|  2021-11-25|2021-11-25 10:35:55|
+------+-------------+------------+-------------------+



In [46]:
# Shift fecha_ingreso forward and backward by 14 days.
df.select(
    F.date_add('fecha_ingreso', 14).alias('mas 14 dias'),
    F.date_sub('fecha_ingreso', 14).alias('menos 14 dias'),
).show()

+-----------+-------------+
|mas 14 dias|menos 14 dias|
+-----------+-------------+
| 2021-01-15|   2020-12-18|
| 2021-02-20|   2021-01-23|
+-----------+-------------+



In [54]:
# Break the baja_sistema timestamp into its calendar / clock components.
# The extractor functions are applied in a fixed order so the output columns
# keep the same sequence: year, month, day of month, day of week, day of year,
# hour, minute, second.
# dayofweek counts from Sunday = 1, so 5 in the output below is a Thursday.
coldate = F.col('baja_sistema')
df.select(
    coldate,
    *[
        extract(coldate)
        for extract in (
            F.year,
            F.month,
            F.dayofmonth,
            F.dayofweek,
            F.dayofyear,
            F.hour,
            F.minute,
            F.second,
        )
    ]
).show()

+-------------------+------------------+-------------------+------------------------+-----------------------+-----------------------+------------------+--------------------+--------------------+
|       baja_sistema|year(baja_sistema)|month(baja_sistema)|dayofmonth(baja_sistema)|dayofweek(baja_sistema)|dayofyear(baja_sistema)|hour(baja_sistema)|minute(baja_sistema)|second(baja_sistema)|
+-------------------+------------------+-------------------+------------------------+-----------------------+-----------------------+------------------+--------------------+--------------------+
|2021-10-14 15:35:59|              2021|                 10|                      14|                      5|                    287|                15|                  35|                  59|
|2021-11-25 10:35:55|              2021|                 11|                      25|                      5|                    329|                10|                  35|                  55|
+-------------------+----

## Funciones para strings

In [None]:
# Read the dataset from the local ./data/ directory and preview it.
data = spark.read.parquet('./data/')
data.show()

from pyspark.sql.functions import ltrim, rtrim, trim

# Compare the three trimming variants on the same column:
# ltrim strips leading spaces, rtrim trailing spaces, trim both sides.
data.select(
    ltrim(data['nombre']).alias('ltrim'),
    rtrim(data['nombre']).alias('rtrim'),
    trim(data['nombre']).alias('trim'),
).show()

In [62]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [69]:
# Small in-memory dataset: `nombre` values carry trailing / leading spaces,
# which the trim and padding cells that follow operate on.
data = [
    (1, "Abel   ", "Bustamante"),
    (2, "    Ana", "Salazar"),
]
# Explicit schema: integer id plus two nullable string columns.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("nombre", StringType(), True)
    .add("apellido", StringType(), True)
)
df = spark.createDataFrame(data, schema)

df.show()

+---+-------+----------+
| id| nombre|  apellido|
+---+-------+----------+
|  1|Abel   |Bustamante|
|  2|    Ana|   Salazar|
+---+-------+----------+



In [None]:
from pyspark.sql.functions import ltrim, rtrim, trim

In [71]:
# rtrim removes only trailing spaces, so the leading spaces of '    Ana' are
# kept (show() right-aligns values, which hides this in the printed table).
# trim strips both sides of apellido.
df.select(
    F.rtrim('nombre').alias('nombre'),
    F.trim('apellido').alias('apellido'),
).show()

+-------+----------+
| nombre|  apellido|
+-------+----------+
|   Abel|Bustamante|
|    Ana|   Salazar|
+-------+----------+



In [74]:
## Pad strings with a specific character
from pyspark.sql.functions import col, lpad, rpad

In [79]:
# Pad `nombre` with '*' up to a total width of 15 characters, on the left and
# on the right. Existing spaces inside the value count toward that width.
df.select(
    lpad('nombre', 15, '*').alias('lpad'),
    rpad('nombre', 15, '*').alias('rpad'),
).show(truncate=False)

+---------------+---------------+
|lpad           |rpad           |
+---------------+---------------+
|********Abel   |Abel   ********|
|********    Ana|    Ana********|
+---------------+---------------+



In [85]:
# Three nullable string columns, one per word.
sc = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('palabra1', 'palabra2', 'palabra3')
])
df1 = spark.createDataFrame(
    [
        ('Spark', 'es', 'maravilloso'),
        ('sujeto', 'verbo', 'predicado'),
    ],
    schema=sc,
)

df1.show()

+--------+--------+-----------+
|palabra1|palabra2|   palabra3|
+--------+--------+-----------+
|   Spark|      es|maravilloso|
|  sujeto|   verbo|  predicado|
+--------+--------+-----------+



In [90]:
# Tour of basic string functions:
# concat_ws joins the three words with '***' as separator; lower / upper /
# initcap change letter case; reverse flips the character order.
df1.select(
    F.concat_ws('***', 'palabra1', 'palabra2', 'palabra3').alias('concat'),
    F.lower('palabra1').alias('lower'),
    F.upper('palabra2').alias('upper'),
    F.initcap('palabra3').alias('initcap'),
    F.reverse('palabra3').alias('reverse'),
).show(truncate=False)

+--------------------------+------+-----+-----------+-----------+
|concat                    |lower |upper|initcap    |reverse    |
+--------------------------+------+-----+-----------+-----------+
|Spark***es***maravilloso  |spark |ES   |Maravilloso|osollivaram|
|sujeto***verbo***predicado|sujeto|VERBO|Predicado  |odaciderp  |
+--------------------------+------+-----+-----------+-----------+



In [94]:
# One-row, one-column frame holding the sentence used by the regex cell below.
# Note: this rebinds `df2`, replacing the date frame created earlier.
df2 = spark.createDataFrame(
    [('abro mi puerta con mi llave',)],
    schema=['frase'],
)
df2.show(truncate=False)

+---------------------------+
|frase                      |
+---------------------------+
|abro mi puerta con mi llave|
+---------------------------+



In [96]:
# Replace every match of the alternation 'abro|con' with the literal
# replacement text; all occurrences in the sentence are substituted.
df2.select(
    F.regexp_replace('frase', 'abro|con', 'remplazado').alias('nuevo valor'),
).show(truncate=False)

+----------------------------------------+
|nuevo valor                             |
+----------------------------------------+
|remplazado mi puerta remplazado mi llave|
+----------------------------------------+



## Trabajo con colecciones

In [97]:
# Rebinds `df` to the collections dataset (a single parquet part file); per the
# schema printed below it has a string `dia` and an array-of-strings `tareas`.
df = spark.read.parquet('/content/parquet/part-00000-96f39196-ef97-4a14-926e-b24a86c2e32d-c000.snappy.parquet')

In [100]:
# Print the rows without truncation so the whole array is visible.
df.show(truncate=False)

+-----+--------------------------------------------+
|dia  |tareas                                      |
+-----+--------------------------------------------+
|lunes|[hacer la tarea, buscar agua, lavar el auto]|
+-----+--------------------------------------------+



In [99]:
# Verify that `tareas` is an array column with string elements.
df.printSchema()

root
 |-- dia: string (nullable = true)
 |-- tareas: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [106]:
from pyspark.sql.functions import col,size,sort_array,array_contains,explode

In [104]:
# Basic array helpers on `tareas`: element count, ascending sort, and a
# membership test for the literal value 'buscar agua'.
df.select(
    F.size('tareas').alias('tamaño'),
    F.sort_array('tareas').alias('arreglo_ordenado'),
    F.array_contains('tareas', 'buscar agua').alias('contains'),
).show(truncate=False)

+------+--------------------------------------------+--------+
|tamaño|arreglo_ordenado                            |contains|
+------+--------------------------------------------+--------+
|3     |[buscar agua, hacer la tarea, lavar el auto]|true    |
+------+--------------------------------------------+--------+



In [115]:
# Re-check the schema of the collections frame (same output as the earlier printSchema).
df.printSchema()

root
 |-- dia: string (nullable = true)
 |-- tareas: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [116]:
# Re-display the collections frame (same output as the earlier show).
df.show(truncate=False)

+-----+--------------------------------------------+
|dia  |tareas                                      |
+-----+--------------------------------------------+
|lunes|[hacer la tarea, buscar agua, lavar el auto]|
+-----+--------------------------------------------+



In [118]:
# ArrayType belongs to pyspark.sql.types. The original reached it through the
# functions module as F.ArrayType, which only works because that module happens
# to import the class for its own use; it is not part of the documented
# functions API. Import it from its real home so the cell does not depend on
# that implementation detail.
from pyspark.sql.types import ArrayType

# Schema: a day name plus a nullable array of (nullable) task strings.
schema = StructType([
    StructField('dia', StringType(), True),
    StructField('tareas', ArrayType(StringType(), True), True),
])
# Two days with two tasks each, used by the explode example below.
df2 = spark.createDataFrame(
    [
        ('lunes', ['tarea1', 'tarea2']),
        ('martes', ['tarea3', 'tarea4']),
    ],
    schema=schema,
)

In [119]:
# One row per day, each holding its array of tasks.
df2.show()

+------+----------------+
|   dia|          tareas|
+------+----------------+
| lunes|[tarea1, tarea2]|
|martes|[tarea3, tarea4]|
+------+----------------+



In [120]:
# explode emits one output row per array element, repeating `dia` alongside it.
df2.select(
    'dia',
    explode('tareas').alias('tarea'),
).show(truncate=False)

+------+------+
|dia   |tarea |
+------+------+
|lunes |tarea1|
|lunes |tarea2|
|martes|tarea3|
|martes|tarea4|
+------+------+



In [129]:
# Load a dataset whose single column (`tareas_str`) holds JSON documents as text.
json_df_str = spark.read.parquet('/content/JSON')

In [130]:
# Show the raw JSON text in full.
json_df_str.show(truncate=False)

+---------------------------------------------------------------------------+
|tareas_str                                                                 |
+---------------------------------------------------------------------------+
|{"dia": "lunes","tareas": ["hacer la tarea","buscar agua","lavar el auto"]}|
+---------------------------------------------------------------------------+



In [131]:
# The JSON is stored as a plain string column, not as a struct.
json_df_str.printSchema()

root
 |-- tareas_str: string (nullable = true)



In [132]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Target structure for parsing the JSON text: a string `dia` and an array of
# string `tareas`, both nullable.
schema_json = (
    StructType()
    .add('dia', StringType(), True)
    .add('tareas', ArrayType(StringType()), True)
)

In [133]:
from pyspark.sql.functions import from_json, to_json

# Parse the JSON text into a struct column using the schema defined above.
json_df = json_df_str.select(
    from_json('tareas_str', schema_json).alias('por_hacer'),
)


In [138]:
# `por_hacer` is now a struct with the fields declared in schema_json.
json_df.printSchema()

root
 |-- por_hacer: struct (nullable = true)
 |    |-- dia: string (nullable = true)
 |    |-- tareas: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [137]:
# Show the parsed struct value in full.
json_df.show(truncate=False)

+-----------------------------------------------------+
|por_hacer                                            |
+-----------------------------------------------------+
|{lunes, [hacer la tarea, buscar agua, lavar el auto]}|
+-----------------------------------------------------+



In [134]:
# Reach into the parsed struct: a field by name, the whole array, and the
# array's first element (index 0).
json_df.select(
    col('por_hacer')['dia'],
    col('por_hacer')['tareas'],
    col('por_hacer')['tareas'][0].alias('primer_tarea'),
).show(truncate=False)

+-------------+--------------------------------------------+--------------+
|por_hacer.dia|por_hacer.tareas                            |primer_tarea  |
+-------------+--------------------------------------------+--------------+
|lunes        |[hacer la tarea, buscar agua, lavar el auto]|hacer la tarea|
+-------------+--------------------------------------------+--------------+



In [139]:
# Serialise the struct column back into a JSON string.
json_df.select(to_json('por_hacer')).show(truncate=False)


+-------------------------------------------------------------------------+
|to_json(por_hacer)                                                       |
+-------------------------------------------------------------------------+
|{"dia":"lunes","tareas":["hacer la tarea","buscar agua","lavar el auto"]}|
+-------------------------------------------------------------------------+



## Más funciones

In [140]:
# Rebinds `df` once more, this time to the payments dataset (nombre, pago).
df = spark.read.parquet('/content/part-00000-a9b42845-6edf-4329-996e-2528aa78bb4a-c000.snappy.parquet')

In [141]:
# Preview the rows; note the NULL in `nombre`, handled by coalesce further down.
df.show()

+------+----+
|nombre|pago|
+------+----+
|  Jose|   1|
| Julia|   2|
| Katia|   1|
|  NULL|   3|
|  Raul|   3|
+------+----+



In [145]:
# Map the numeric pago code to a label: 1 -> 'pagado', 2 -> 'no pagado',
# anything else -> 'no se'. The result column reuses the name `pago`.
df.select(
    'nombre',
    F.when(F.col('pago') == 1, 'pagado')
     .when(F.col('pago') == 2, 'no pagado')
     .otherwise('no se')
     .alias('pago'),
).show()

+------+---------+
|nombre|     pago|
+------+---------+
|  Jose|   pagado|
| Julia|no pagado|
| Katia|   pagado|
|  NULL|    no se|
|  Raul|    no se|
+------+---------+



In [155]:
# Substitute the literal 'no name' for NULL names, then left-pad the result
# with '*' to a width of 12 characters.
df.select(
    'nombre',
    F.lpad(F.coalesce('nombre', F.lit('no name')), 12, '*'),
).show()

+------+--------------------------------------+
|nombre|lpad(coalesce(nombre, no name), 12, *)|
+------+--------------------------------------+
|  Jose|                          ********Jose|
| Julia|                          *******Julia|
| Katia|                          *******Katia|
|  NULL|                          *****no name|
|  Raul|                          ********Raul|
+------+--------------------------------------+

