In [2]:
# Spark imports.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [3]:
# Create the Spark session for this notebook (or reuse one that is
# already running); the application is registered under the name 'ops'.
spark = (
    SparkSession.builder
    .appName('ops')
    .getOrCreate()
)

Exception: Java gateway process exited before sending its port number

In [9]:
# Load the stock CSV into a DataFrame. header=True takes the column
# names from the first row of the file, and inferSchema=True asks the
# CSV reader to infer each column's type from the data.
df = spark.read.csv(
    'Data/DataFrames/appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Number of rows that were read into the DataFrame.
df.count()

1762

In [12]:
# Print the schema that was inferred: each column's name (taken from
# the header row), its type, and whether it is nullable.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [14]:
# Look in more detail at what a Row object contains. For example, the
# 'Date' field is a datetime object shown with all of its parameters;
# when the DataFrame is printed it appears as a formatted string
# instead. head(3) returns the first 3 rows as a list, and [0] then
# extracts the first element of that list.
df.head(3)[0]

Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

In [18]:
# Display only the top row as a table. Notice that the datetime
# object now appears as a readable string representation.
df.show(n=1)

+-------------------+----------+----------+------------------+----------+---------+---------+
|               Date|      Open|      High|               Low|     Close|   Volume|Adj Close|
+-------------------+----------+----------+------------------+----------+---------+---------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|27.727039|
+-------------------+----------+----------+------------------+----------+---------+---------+
only showing top 1 row



In [24]:
# One of the best parts of working with DataFrames is filtering the
# data on conditions, much like the queries we would run against a
# database.

# filter() accepts a SQL-like condition string, as shown here. For the
# rest of the course, though, let's use DataFrame operators instead!
(
    df.filter("Close < 500")
      .select('Open', 'Close')
      .show()
)

+------------------+------------------+
|              Open|             Close|
+------------------+------------------+
|        213.429998|        214.009998|
|        214.599998|        214.379993|
|        214.379993|        210.969995|
|            211.75|            210.58|
|        210.299994|211.98000499999998|
|212.79999700000002|210.11000299999998|
|209.18999499999998|        207.720001|
|        207.870005|        210.650002|
|210.11000299999998|            209.43|
|210.92999500000002|            205.93|
|        208.330002|        215.039995|
|        214.910006|            211.73|
|        212.079994|        208.069996|
|206.78000600000001|            197.75|
|202.51000200000001|        203.070002|
|205.95000100000001|        205.940001|
|        206.849995|        207.880005|
|        204.930004|        199.289995|
|        201.079996|        192.060003|
|192.36999699999998|        194.729998|
+------------------+------------------+
only showing top 20 rows



In [32]:
# The same filter written with DataFrame operations: the condition is
# a Column expression rather than a SQL string.
(
    df.filter(df['Close'] < 500)
      .select('Volume')
      .count()
)

1359

In [34]:
# Multiple conditions cannot be combined with Python's regular
# 'and' / 'or'; doing so raises:
#   ValueError: Cannot convert column into bool: please use
#   '&' for 'and', '|' for 'or', '~' for 'not' when building
#   DataFrame boolean expressions.
# Also remember to wrap every individual condition in parentheses.
df.filter(
    (f.col('Close') < 200)
    & ~(f.col('Open') > 200)
).count()

658

In [40]:
# collect() brings the filtered rows back as a list of Row objects.
results = (
    df.filter(f.col('Low') == 197.16)
      .collect()
)
print(results[0])

Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)


In [43]:
# asDict() converts the Row object into a plain Python dictionary!
results[0].asDict()

{'Date': datetime.datetime(2010, 1, 22, 0, 0),
 'Open': 206.78000600000001,
 'High': 207.499996,
 'Low': 197.16,
 'Close': 197.75,
 'Volume': 220441900,
 'Adj Close': 25.620401}