In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Creating the SparkSession, very important before doing any action in spark
spark = SparkSession.\
    builder.\
    appName('SparkSQL').\
    getOrCreate()

In [11]:

# Read a flat file indicating to infer the schema
db = spark.read.format('csv').\
    option('inferSchema','true').\
    option('header','true').\
    option('path','operations_management.csv').\
    load()

In [12]:
db.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



In [14]:
# Now, we gonna do transformation using:
#   a) API DataFrame
#   b) Spark SQL query    
# 'total' is the value of the col 'industry', not a colname
db_T1 = db.select("industry","value").\
filter((col("value") > 200) & (col("industry") != "total")).\
orderBy(desc("value"))

In [16]:
db_T1.show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
+--------------------+-----+
only showing top 5 rows



In [17]:
db_T1.printSchema()

root
 |-- industry: string (nullable = true)
 |-- value: integer (nullable = true)



In [18]:
# b) Same query but using spark.sql and SQL sentences
db_T1.createOrReplaceTempView('db_view') # need to create a view in order to be allowed to query the data
# data in a spark view can be only showed after being queried as below
spark.sql("""SELECT *   
             FROM db_view 
             WHERE value > 200 
                AND industry != 'total' """).show()

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
|        Construction| 4959|
|Accommodation & f...| 4950|
|        Construction| 4686|
|        Construction| 4668|
|        Construction| 4665|
|       Manufacturing| 4662|
|       Manufacturing| 4632|
|        Construction| 4575|
|        Construction| 4566|
|Professional, sci...| 4476|
|Professional, sci...| 4470|
|        Retail trade| 4434|
|        Retail trade| 4434|
|Accommodation & f...| 4251|
|Accommodation & f...| 4176|
+--------------------+-----+
only showing top 20 rows



In [19]:
# b) Same query but using spark.sql and SQL sentences
db_T1.createOrReplaceTempView('db_view').show()

AttributeError: 'NoneType' object has no attribute 'show'