## Purpose of script:
#### Reviewing Spark basics
#### Referencing Jose Portilla's "Spark and Python for Big Data with PySpark" course

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Entry point for all DataFrame / SQL work below: obtain a session for the
# application named 'Basics' via the builder's getOrCreate()
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [3]:
# Load the people records from JSON; no schema is passed here, so the column
# types are whatever Spark infers from the file
df = spark.read.json(path='../Datasets/people.json')

# Print the rows as a text table
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [4]:
# Print each column's name, data type and nullability as a tree
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
# Column names as a plain Python list of strings
df.columns

['age', 'name']

In [6]:
# describe() does not print statistics by itself: it returns another DataFrame
# (summary plus one string-typed column per original column)
df.describe()

DataFrame[summary: string, age: string, name: string]

In [7]:
# Call show() on the summary DataFrame to see count / mean / stddev / min / max
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



#### Specifying column data types in Spark with an explicit schema

In [9]:
from pyspark.sql.types import (StructField, StructType, 
                               StringType, IntegerType)

In [10]:
# One StructField per column: column name, Spark data type, and whether the
# column may contain nulls (nullable=True allows missing values)
data_schema = [
    StructField('age', IntegerType(), nullable=True),
    StructField('name', StringType(), nullable=True),
]

In [11]:
# Wrap the list of fields in a StructType so it can be passed as a schema
specified_struc = StructType(data_schema)

In [12]:
# Re-read the same file, this time supplying the explicit schema instead of
# relying on type inference (rebinds `df`)
df = spark.read.json(
    path='../Datasets/people.json',
    schema=specified_struc,
)

In [13]:
# The explicit schema took effect: age is now integer
# (it was inferred as long when no schema was given)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [14]:
# Indexing the DataFrame by column name yields a Column object,
# not the column's data
type(df['age'])

pyspark.sql.column.Column

In [15]:
# select() yields a DataFrame containing only the requested column,
# which can then be displayed with show()
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [16]:
# First two rows, returned as a list of Row objects
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [17]:
# Select several columns at once; the output keeps the order given here
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [18]:
# Add a computed column holding twice the age.
# withColumn produces a new DataFrame; `df` itself is NOT modified.
df.withColumn(
    'double_age',
    df['age'] * 2,
).show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [22]:
# Rename the existing 'age' column to 'new_age'.
# As with withColumn, `df` itself is NOT modified.
df.withColumnRenamed(
    existing='age',
    new='new_age',
).show()

+-------+-------+
|new_age|   name|
+-------+-------+
|   null|Michael|
|     30|   Andy|
|     19| Justin|
+-------+-------+



In [23]:
# df still has only its original columns: the withColumn / withColumnRenamed
# calls above produced new DataFrames and left df untouched
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



#### SQL operations

In [24]:
# Register df as a temporary view named 'people' so that it can be
# referenced by name in SQL queries
df.createOrReplaceTempView('people')

In [25]:
# Query the 'people' view with plain SQL. The row whose age is null does not
# satisfy the comparison, so it is absent from the result.
results = spark.sql('SELECT * FROM people WHERE age >= 30')

In [26]:
# Display the rows returned by the SQL query
results.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



## Demo: building a DataFrame from in-memory data and querying it with SQL

In [30]:
# Demo rows written as one dict per record (keys double as column names).
# NOTE: the following cell rebinds `test_data` to a list of tuples, and that
# tuple form is what the DataFrame below is built from. The third record is
# 'Ted' here so that both representations describe the same people.
test_data = [
    {'name': 'Tasha', 'age': 20, 'profession': 'student'},
    {'name': 'Jim', 'age': 45, 'profession': 'accountant'},
    {'name': 'Ted', 'age': 18, 'profession': 'student'},
    {'name': 'Jane', 'age': 26, 'profession': 'seo manager'},
]

In [32]:
# Demo rows as (name, age, profession) tuples; the column names are
# supplied separately when the DataFrame is created
test_data = [
    ('Tasha', 20, 'student'),
    ('Jim', 45, 'accountant'),
    ('Ted', 18, 'student'),
    ('Jane', 26, 'seo manager'),
]

In [34]:
# Column names, in the same order as the fields of each tuple in `test_data`
test_col = [
    'name',
    'age',
    'profession',
]

In [36]:
# Build a DataFrame from the in-memory rows, taking the column names from
# test_col (rebinds `df`, replacing the DataFrame read from people.json)
df = spark.createDataFrame(test_data, test_col)

# Inspect the resulting column types, then the rows
df.printSchema()
df.show(truncate=False)  # truncate=False: do not shorten long cell values

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- profession: string (nullable = true)

+-----+---+-----------+
|name |age|profession |
+-----+---+-----------+
|Tasha|20 |student    |
|Jim  |45 |accountant |
|Ted  |18 |student    |
|Jane |26 |seo manager|
+-----+---+-----------+



In [37]:
# Register the demo DataFrame as the temporary view 'people'; this reuses the
# view name registered earlier in the notebook, so 'people' now refers to the demo rows
df.createOrReplaceTempView('people')

In [38]:
# Keep only the demo rows with age 21 or above, then display them
results = spark.sql('SELECT * FROM people WHERE age >= 21')
results.show()

+----+---+-----------+
|name|age| profession|
+----+---+-----------+
| Jim| 45| accountant|
|Jane| 26|seo manager|
+----+---+-----------+



In [39]:
# Stop the Spark session now that the walkthrough is finished
spark.stop()