In [1]:
from pyspark.sql import SparkSession
import getpass
# The warehouse directory is namespaced by the current OS user.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Explicit DDL-style schema, supplied to the reader instead of relying on inference.
orders_schema = 'order_id long, order_date date, customer_id long, order_status string'

# Load the orders CSV with the schema declared above.
orders_df = (
    spark.read
    .format('csv')
    .schema(orders_schema)
    .load('/public/trendytech/orders/orders_1gb.csv')
)

In [3]:
# Preview the DataFrame; show() with no arguments prints only the first 20 rows.
orders_df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


### 1) To view all the columns of DF

In [4]:
# "*" selects every column, so this prints the same preview as orders_df.show().
orders_df.select("*").show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


### 2) To view Specific Columns we use: column String expression
#### Here we access columns using string notation (passing the column names as plain strings).

In [5]:
# Column-string notation: each column is referred to by its name as a plain string.
orders_df.select(
    "order_id",
    "order_date",
).show()

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
|       3|2013-07-25|
|       4|2013-07-25|
|       5|2013-07-25|
|       6|2013-07-25|
|       7|2013-07-25|
|       8|2013-07-25|
|       9|2013-07-25|
|      10|2013-07-25|
|      11|2013-07-25|
|      12|2013-07-25|
|      13|2013-07-25|
|      14|2013-07-25|
|      15|2013-07-25|
|      16|2013-07-25|
|      17|2013-07-25|
|      18|2013-07-25|
|      19|2013-07-25|
|      20|2013-07-25|
+--------+----------+
only showing top 20 rows



### We can also use: 
#### 3) df.select(df.column_name1,df.column_name2)

In [6]:
# Wildcard import: brings column(), col() and expr() (used further down) into scope.
# NOTE(review): it also pulls in every other public name from
# pyspark.sql.functions, presumably including ones that shadow Python builtins
# (e.g. sum, min, max) — consider importing only the names actually needed.
from pyspark.sql.functions import *

In [7]:
# Attribute notation: each column is reached as an attribute of the DataFrame itself.
orders_df.select(
    orders_df.order_id,
    orders_df.order_date,
).show()

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
|       3|2013-07-25|
|       4|2013-07-25|
|       5|2013-07-25|
|       6|2013-07-25|
|       7|2013-07-25|
|       8|2013-07-25|
|       9|2013-07-25|
|      10|2013-07-25|
|      11|2013-07-25|
|      12|2013-07-25|
|      13|2013-07-25|
|      14|2013-07-25|
|      15|2013-07-25|
|      16|2013-07-25|
|      17|2013-07-25|
|      18|2013-07-25|
|      19|2013-07-25|
|      20|2013-07-25|
+--------+----------+
only showing top 20 rows



### 4) using column Object:
#### df.select(column('column_name'))
#### df.select(col('column_name'))
#### df.select(expr('column_name'))

In [11]:
# Mix the three function-based notations in a single select:
# column() and col() both resolve customer_id, expr() resolves order_status.
orders_df.select(
    column('customer_id'),
    col('customer_id'),
    expr('order_status'),
).show()

+-----------+-----------+---------------+
|customer_id|customer_id|   order_status|
+-----------+-----------+---------------+
|      11599|      11599|         CLOSED|
|        256|        256|PENDING_PAYMENT|
|      12111|      12111|       COMPLETE|
|       8827|       8827|         CLOSED|
|      11318|      11318|       COMPLETE|
|       7130|       7130|       COMPLETE|
|       4530|       4530|       COMPLETE|
|       2911|       2911|     PROCESSING|
|       5657|       5657|PENDING_PAYMENT|
|       5648|       5648|PENDING_PAYMENT|
|        918|        918| PAYMENT_REVIEW|
|       1837|       1837|         CLOSED|
|       9149|       9149|PENDING_PAYMENT|
|       9842|       9842|     PROCESSING|
|       2568|       2568|       COMPLETE|
|       7276|       7276|PENDING_PAYMENT|
|       2667|       2667|       COMPLETE|
|       1205|       1205|         CLOSED|
|       9488|       9488|PENDING_PAYMENT|
|       9198|       9198|     PROCESSING|
+-----------+-----------+---------------+

#### "order_id" = column string
#### orders_df.order_date = column accessed as a DataFrame attribute (a column object)
#### column('customer_id') / col('customer_id') = column object
#### expr("order_status") = column expression