In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import (StructField,StringType, 
                               IntegerType,StructType)

In [2]:
# Create the Spark session for this notebook, or reuse an existing
# one (getOrCreate).
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [3]:
# The path is relative to the directory Jupyter was started from.
# The same reader can also load large distributed files, e.g. from
# HDFS, which may involve large datasets in Amazon EC2.
people_json_path = 'Data/people.json'
df = spark.read.json(people_json_path)

In [4]:
# Display the rows. Values missing from a record are shown as `null`.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# Print the schema of the dataframe (its columns and their types).
df.printSchema()

In [None]:
# List the column names. `columns` is an attribute, not a method, so
# it is accessed without ().
df.columns

In [None]:
# Number of rows in the dataframe.
row_count = df.count()
row_count

In [None]:
# describe() builds a dataframe of summary statistics; display it.
summary_df = df.describe()
summary_df.show()

In [None]:
# Each field is declared by its name, its type, and whether it may
# be null.
data_schema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [None]:
# Wrap the list of fields in a StructType so it can be passed as a
# schema.
final_struct = StructType(data_schema)

In [None]:
# Re-read the same JSON file, this time enforcing the schema held in
# `final_struct` instead of letting Spark infer one. The path matches
# the first read in this notebook ('Data/people.json') so that both
# cells load the same file when the notebook is run top to bottom.
df = spark.read.json('Data/people.json', schema=final_struct)

In [None]:
# Now age can be an integer instead of a long. Also, by enforcing
# a schema we can guarantee constraints.
df.printSchema()

In [None]:
# Display the rows of the dataframe read with the explicit schema.
df.show()

In [None]:
# How do we actually grab data from our dataframe?
# Indexing by column name only gives back a Column object.
age_column = df['age']
type(age_column)

In [None]:
# select() instead builds a new dataframe holding only the
# requested column.
age_only_df = df.select('age')
age_only_df.show()

In [None]:
# The result of select() is a whole dataframe, not a column.
type(df.select('age'))

In [None]:
# head(2) returns a list of Row objects; inspect the type of the
# first element.
first_row = df.head(2)[0]
type(first_row)

In [None]:
# Several columns can be selected at once by passing a list of names.
selected_columns = ['age', 'name']
df.select(selected_columns).show()

In [None]:
# withColumn() adds a new column: for each row it takes the value of
# the 'age' Column object multiplied by two. The original dataframe
# is not altered.
doubled_age = df['age'] * 2
df.withColumn('new_age', doubled_age).show()

In [None]:
# withColumnRenamed() only renames a column and does nothing else.
df.withColumnRenamed('age', 'new_age').show()

In [None]:
# A UDF (user-defined function) wraps a Python function whose return
# value is converted into a Spark type (here StringType) so it can be
# used in dataframe operations. For example, a new column can't be
# filled with plain Python values; it needs a Column object, which
# calling the UDF returns.
def _concat_age_name(age, name):
    """Return "<age>.<name>", or None when either value is missing.

    Returning None keeps a missing value as null in the result instead
    of formatting it into the literal text "None" (the sample data
    contains a record with a null age).
    """
    if age is None or name is None:
        return None
    return "{}.{}".format(age, name)


concat_udf = f.udf(_concat_age_name, StringType())

In [None]:
# Add a new column and drop an old one in a single chain: the UDF
# concatenates age and name into the new 'info' column, then 'age'
# is dropped. show() is called so the resulting rows are displayed,
# as in the other cells, rather than only the dataframe's repr.
df.withColumn('info', concat_udf(df.age, df.name)).drop('age').show()

In [None]:
# Now, let's do some SQL operations! A query can't refer to the
# Python variable holding a dataframe, so let's "temporarily"
# alias it to a table name.
df.createOrReplaceTempView('people')

In [None]:
# Keep the query text in its own variable, then run it against the
# 'people' table: every column for ages from 25 to 35 inclusive.
age_range_query = '''
   SELECT *
   FROM people
   WHERE age >= 25 AND age <= 35
'''
results = spark.sql(age_range_query)

In [None]:
# Display the rows returned by the SQL query.
results.show()