In [None]:
import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark = SparkSession \
        .builder \
        .appName("Ejemplo de Spark") \
        .getOrCreate()

spark

In [11]:
df = spark.read.csv("../data/IBEX35")
df.show()

# Does not read headers

+------+----------+--------+---------+------+------------+--------+--------+--------+--------+
|   _c0|       _c1|     _c2|      _c3|   _c4|         _c5|     _c6|     _c7|     _c8|     _c9|
+------+----------+--------+---------+------+------------+--------+--------+--------+--------+
|Ticker|      Date|   Close|Reference|Volume|    Turnover|    Last|    High|     Low| Average|
|  AENA|02/01/2019|137.0000| 135.7500|143458| 19591073.50|137.0000|138.0000|133.9500|136.5631|
|  AENA|03/01/2019|137.0500| 137.0000|461151| 63059756.45|137.0500|139.1500|136.4000|137.4490|
|  AENA|04/01/2019|140.5000| 137.0500|110214| 15444845.65|140.5000|140.9500|137.7000|140.1351|
|  AENA|07/01/2019|139.1500| 140.5000| 92663| 12908476.55|139.1500|141.1000|138.2000|139.3056|
|  AENA|08/01/2019|140.0000| 139.1500|179857| 25206724.88|140.0000|141.0500|138.8000|140.1306|
|  AENA|09/01/2019|138.5000| 140.0000|163959| 22859110.75|138.5000|141.6500|138.3500|138.8855|
|  AENA|10/01/2019|139.2500| 138.5000|181664| 2520

In [12]:
df = spark.read.option("header", "true").csv("../data/IBEX35")
df.show()

+------+----------+--------+---------+------+------------+--------+--------+--------+--------+
|Ticker|      Date|   Close|Reference|Volume|    Turnover|    Last|    High|     Low| Average|
+------+----------+--------+---------+------+------------+--------+--------+--------+--------+
|  AENA|02/01/2019|137.0000| 135.7500|143458| 19591073.50|137.0000|138.0000|133.9500|136.5631|
|  AENA|03/01/2019|137.0500| 137.0000|461151| 63059756.45|137.0500|139.1500|136.4000|137.4490|
|  AENA|04/01/2019|140.5000| 137.0500|110214| 15444845.65|140.5000|140.9500|137.7000|140.1351|
|  AENA|07/01/2019|139.1500| 140.5000| 92663| 12908476.55|139.1500|141.1000|138.2000|139.3056|
|  AENA|08/01/2019|140.0000| 139.1500|179857| 25206724.88|140.0000|141.0500|138.8000|140.1306|
|  AENA|09/01/2019|138.5000| 140.0000|163959| 22859110.75|138.5000|141.6500|138.3500|138.8855|
|  AENA|10/01/2019|139.2500| 138.5000|181664| 25208702.51|139.2500|139.5000|137.5500|138.8824|
|  AENA|11/01/2019|139.4500| 139.2500|710391| 9912

In [9]:
print(f"The number of tickers is {df.count()}")
print(f"The data colums are {df.columns}")

The number of tickers is 8575
The data colums are ['Ticker', 'Date', 'Close', 'Reference', 'Volume', 'Turnover', 'Last', 'High', 'Low', 'Average']


In [13]:
df = df.withColumn("Aumento_valor", df.Close > df.Reference)
df.show()

+------+----------+--------+---------+------+------------+--------+--------+--------+--------+-------------+
|Ticker|      Date|   Close|Reference|Volume|    Turnover|    Last|    High|     Low| Average|Aumento_valor|
+------+----------+--------+---------+------+------------+--------+--------+--------+--------+-------------+
|  AENA|02/01/2019|137.0000| 135.7500|143458| 19591073.50|137.0000|138.0000|133.9500|136.5631|         true|
|  AENA|03/01/2019|137.0500| 137.0000|461151| 63059756.45|137.0500|139.1500|136.4000|137.4490|         true|
|  AENA|04/01/2019|140.5000| 137.0500|110214| 15444845.65|140.5000|140.9500|137.7000|140.1351|         true|
|  AENA|07/01/2019|139.1500| 140.5000| 92663| 12908476.55|139.1500|141.1000|138.2000|139.3056|        false|
|  AENA|08/01/2019|140.0000| 139.1500|179857| 25206724.88|140.0000|141.0500|138.8000|140.1306|         true|
|  AENA|09/01/2019|138.5000| 140.0000|163959| 22859110.75|138.5000|141.6500|138.3500|138.8855|        false|
|  AENA|10/01/2019|

In [14]:
df.filter(df.Ticker == "REP").show(5)

df.filter((df.Ticker == "REP") & (df.Aumento_valor == "true")).show(5)


+------+----------+-------+---------+--------+------------+-------+-------+-------+-------+-------------+
|Ticker|      Date|  Close|Reference|  Volume|    Turnover|   Last|   High|    Low|Average|Aumento_valor|
+------+----------+-------+---------+--------+------------+-------+-------+-------+-------+-------------+
|   REP|02/01/2019|14.2350|  14.0800|10003999|140210096.55|14.2350|14.3100|13.7850|14.0805|         true|
|   REP|03/01/2019|14.2550|  14.2350|12814838|180613397.06|14.2550|14.3800|14.0250|14.2255|         true|
|   REP|04/01/2019|14.5950|  14.2550| 7089672|103139023.91|14.5950|14.7050|14.3600|14.5892|         true|
|   REP|07/01/2019|14.4750|  14.5950| 4409347| 63898025.48|14.4750|14.6950|14.3300|14.4915|        false|
|   REP|08/01/2019|14.4900|  14.4750|12227338|177321179.12|14.4900|14.7900|14.4400|14.5760|         true|
+------+----------+-------+---------+--------+------------+-------+-------+-------+-------+-------------+
only showing top 5 rows

+------+----------+--

In [40]:
df.filter(df.Ticker == "IDR").show()
print(df.filter((df.Ticker == "IDR") & (df.Volume > 500000)).count())
# df.filter((df.Ticker == "IDR") & (df.Volume > 500000)).show(42)


+------+----------+------+---------+------+----------+------+------+------+-------+-------------+
|Ticker|      Date| Close|Reference|Volume|  Turnover|  Last|  High|   Low|Average|Aumento_valor|
+------+----------+------+---------+------+----------+------+------+------+-------+-------------+
|   IDR|02/01/2019|8.1250|   8.2350|554146|4504780.53|8.1250|8.2100|8.0000| 8.1292|        false|
|   IDR|03/01/2019|8.0000|   8.1250|518359|4154868.74|8.0000|8.1350|7.9200| 8.0154|        false|
|   IDR|04/01/2019|8.2050|   8.0000|539367|4389231.91|8.2050|8.2400|8.0400| 8.1377|         true|
|   IDR|07/01/2019|8.0500|   8.2050|776378|6226567.10|8.0500|8.2250|7.8900| 8.0200|        false|
|   IDR|08/01/2019|8.1850|   8.0500|547992|4480941.25|8.1850|8.2650|8.0550| 8.1770|         true|
|   IDR|09/01/2019|8.5800|   8.1850|882949|7544821.85|8.5800|8.7000|8.2500| 8.5442|         true|
|   IDR|10/01/2019|8.5350|   8.5800|351675|3001337.82|8.5350|8.5900|8.4800| 8.5344|        false|
|   IDR|11/01/2019|8