In [2]:
import findspark as fs
fs.init()
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, translate, lower
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql import functions as F
spark=SparkSession.builder.appName('GoogleData').master('local').getOrCreate()
sc=spark.sparkContext
sql=SQLContext(sc)

Error: Jupyter cannot be started. Error attempting to locate jupyter: 'Kernelspec' module not installed in the selected interpreter (D:\Documents\pySpark\miniCONDA\envs\pySpark\python.exe).
 Please re-install or update 'jupyter'.

In [3]:
spark

In [4]:
orders = spark.read.csv('retail_db//orders.csv',header=True,inferSchema=True)
order_items = spark.read.csv('retail_db//order_items.csv',header=True,inferSchema=True)
customers = spark.read.csv('retail_db//customers.csv',header=True,inferSchema=True)
products = spark.read.csv('retail_db//products.csv',header=True,inferSchema=True)

# select

In [4]:
orders.select(orders.order_status,'order_status',col('order_status')).show()

+---------------+---------------+---------------+
|   order_status|   order_status|   order_status|
+---------------+---------------+---------------+
|         CLOSED|         CLOSED|         CLOSED|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|       COMPLETE|       COMPLETE|       COMPLETE|
|         CLOSED|         CLOSED|         CLOSED|
|       COMPLETE|       COMPLETE|       COMPLETE|
|       COMPLETE|       COMPLETE|       COMPLETE|
|       COMPLETE|       COMPLETE|       COMPLETE|
|     PROCESSING|     PROCESSING|     PROCESSING|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
| PAYMENT_REVIEW| PAYMENT_REVIEW| PAYMENT_REVIEW|
|         CLOSED|         CLOSED|         CLOSED|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|     PROCESSING|     PROCESSING|     PROCESSING|
|       COMPLETE|       COMPLETE|       COMPLETE|
|PENDING_PAYMENT|PENDING_PAYMENT|PENDING_PAYMENT|
|       COMPLETE|       COMPLETE|       COMPLETE|


#### But while using functions it is applicable to specify the full notation, i.e., dataframe.column_name or using col as below

In [11]:
orders.select(lower(orders.order_status),lower(col('order_status'))).show()
#Note: orders.select(lower('order_status')).show() will not work

+-------------------+-------------------+
|lower(order_status)|lower(order_status)|
+-------------------+-------------------+
|             closed|             closed|
|    pending_payment|    pending_payment|
|           complete|           complete|
|             closed|             closed|
|           complete|           complete|
|           complete|           complete|
|           complete|           complete|
|         processing|         processing|
|    pending_payment|    pending_payment|
|    pending_payment|    pending_payment|
|     payment_review|     payment_review|
|             closed|             closed|
|    pending_payment|    pending_payment|
|         processing|         processing|
|           complete|           complete|
|    pending_payment|    pending_payment|
|           complete|           complete|
|             closed|             closed|
|    pending_payment|    pending_payment|
|         processing|         processing|
+-------------------+-------------

# alias
#### alias should be enclosed within select 

### Note : Please do not give alias the same name as built-in functions 

In [12]:
orders.select(orders.order_status.alias('Status_alias')).show()
#Note: orders.select('order_status').alias('Status_alias').show()  will not work

+---------------+
|   Status_alias|
+---------------+
|         CLOSED|
|PENDING_PAYMENT|
|       COMPLETE|
|         CLOSED|
|       COMPLETE|
|       COMPLETE|
|       COMPLETE|
|     PROCESSING|
|PENDING_PAYMENT|
|PENDING_PAYMENT|
| PAYMENT_REVIEW|
|         CLOSED|
|PENDING_PAYMENT|
|     PROCESSING|
|       COMPLETE|
|PENDING_PAYMENT|
|       COMPLETE|
|         CLOSED|
|PENDING_PAYMENT|
|     PROCESSING|
+---------------+
only showing top 20 rows



In [14]:
orders.select(col('order_status').alias('Status_alias')).distinct().show()
#Note: orders.select('order_status').distinct().alias('Status_alias').show() will not work

+---------------+
|   Status_alias|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



# withcolumn
#### If already existing column name is given, then the new column expression will replace the existing column. (It will not add another column)


In [25]:
orders = orders. \
withColumn('order_id',orders.order_id.cast('bigint')). \
withColumn('order_date',orders.order_date.cast('date')). \
withColumn('order_customer_id',orders.order_customer_id.cast('bigint')). \
withColumn('order_status',orders.order_status.cast('string')) 


In [24]:
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



#### Given below are the cast notations from pyspark.sql.types

In [None]:
BinaryType: binary
BooleanType: boolean
ByteType: tinyint
DateType: date
DecimalType: decimal(10,0)
DoubleType: double
FloatType: float
IntegerType: int
LongType: bigint
ShortType: smallint
StringType: string
TimestampType: timestamp

# selectExpr

#### SQL like expressions can be used for evaluation

In [29]:
orders.selectExpr('order_id||","||order_date||","||order_customer_id||","||order_status||"," as textdata ').show(1)

+--------------------+
|            textdata|
+--------------------+
|1,2013-07-25,1159...|
+--------------------+
only showing top 1 row



# case
#### CASE as in SQL can be used

In [30]:
orders.selectExpr('CASE WHEN order_status in ("COMPLETE","CLOSED") THEN "COMPLETELD" WHEN order_status = "CANCELED" THEN "CANCEL" ELSE "NONE" END Derived_status').show()

+--------------+
|Derived_status|
+--------------+
|    COMPLETELD|
|          NONE|
|    COMPLETELD|
|    COMPLETELD|
|    COMPLETELD|
|    COMPLETELD|
|    COMPLETELD|
|          NONE|
|          NONE|
|          NONE|
|          NONE|
|    COMPLETELD|
|          NONE|
|          NONE|
|    COMPLETELD|
|          NONE|
|    COMPLETELD|
|    COMPLETELD|
|          NONE|
|          NONE|
+--------------+
only showing top 20 rows



# when
#### Dataframe equivalent of case 

In [32]:
orders.withColumn('Derived_Status',when(orders.order_status.isin('COMPLETED','CLOSED'),'OVER'). \
                  when(orders.order_status.isin('PENDING_PAYMENT'),'PENDING').otherwise('NONE')).show()

+--------+----------+-----------------+---------------+--------------+
|order_id|order_date|order_customer_id|   order_status|Derived_Status|
+--------+----------+-----------------+---------------+--------------+
|       1|2013-07-25|            11599|         CLOSED|          OVER|
|       2|2013-07-25|              256|PENDING_PAYMENT|       PENDING|
|       3|2013-07-25|            12111|       COMPLETE|          NONE|
|       4|2013-07-25|             8827|         CLOSED|          OVER|
|       5|2013-07-25|            11318|       COMPLETE|          NONE|
|       6|2013-07-25|             7130|       COMPLETE|          NONE|
|       7|2013-07-25|             4530|       COMPLETE|          NONE|
|       8|2013-07-25|             2911|     PROCESSING|          NONE|
|       9|2013-07-25|             5657|PENDING_PAYMENT|       PENDING|
|      10|2013-07-25|             5648|PENDING_PAYMENT|       PENDING|
|      11|2013-07-25|              918| PAYMENT_REVIEW|          NONE|
|     

# withColumnRenamed
#### columns can be renamed with this API, where the first argument is the existing column name and the second argument is the new name.

In [None]:
orders.withColumnRenamed('order_status','status_of_order').show()

# filter

#### filter as in SQL format

In [None]:
orders.filter("order_status = 'COMPLETE'").show()
#Note: Single '=' as in SQL format

#### filter as in Dataframe format

In [None]:
orders.filter(orders.order_status == 'COMPLETE').show()
#Note: Equality operator '==' for dataframe filter expression 

#### multiple filters in SQL format filter

In [None]:
orders.filter("order_status = 'COMPLETE' OR order_status = 'CLOSED'").show()

#### multiple filters in DataFrame format

In [None]:
orders.filter((orders.order_status == 'COMPLETE').__or__(orders.order_status == 'CLOSED')).show()

###### Examples of few multiple filters

In [None]:
orders.filter("order_status in ('COMPLETE' ,'CLOSED') AND date_format(order_date,'yyyyMM') = '201308'").show()

In [None]:
orders.filter((orders.order_status.isin('CLOSED','COMPLETE')).__and__(date_format(orders.order_date,'YYYYMM')=='201308')).show()

In [None]:
orders.filter('order_customer_id >= 1000').filter('order_customer_id != 1000').filter(orders.order_id.between(1000,1999)).show()

# join

#### simple inner join with one column mapping

In [None]:
orders.join(order_items, orders.order_id==order_items.order_item_order_id).show()

#### inner join with multiple columns

In [None]:
orders.join(order_items, (orders.order_id==order_items.order_item_order_id) & (orders.order_id==order_items.order_item_order_id)).show()

#### left join

In [None]:
customers.join(orders, customers.customer_id==orders.order_customer_id, 'left').show()

#### right join

In [None]:
customers.join(orders, customers.customer_id==orders.order_customer_id, 'right').show()

#### leftanti join - Join to fetch records which exists only in one table
In the below case it will fetch the records which are present only in the orders and not in order_items

In [None]:
customers.join(orders, orders.order_customer_id==customers.customer_id, 'leftanti').show()

#### crossJoin

In [None]:
orders.crossJoin(customers.filter(customers.customer_id == 1)).show()

# distinct
#### distinct will come at the end after the select

In [None]:
orders.select('order_status').distinct().show()

# countDinstinct
#### countDistinct will be handy 

In [None]:
orders.select(countDistinct('order_status')).show()

# orderBy/sort
#### Note: sort is just an alias to orderBy

In [None]:
orders.orderBy(orders.order_date, orders.order_status.desc()).show()

Note: Only absolute column name notation or col() works while mentioning in descending order desc()

In [None]:
orders.orderBy('order_date',col('order_status').desc()).show()

# drop

drop() will only take just the column names and it will not take any other expressions for the column

In [None]:
order_items.join(products,order_items.order_item_product_id==products.product_id).drop('product_price','product_description','product_image').show()

In [None]:
#order_items.join(products, order_item_product_id==products.product_id).drop(products.product_price, products.product_description, products.product_image).show()
# Specifying full name of the column will not work 