In [1]:
from pyspark.sql import SparkSession
import getpass
# OS account name of whoever runs the notebook; used for the warehouse path below.
username = getpass.getuser()
# Build (or reuse) a Hive-enabled session on YARN.
# NOTE(review): 'spark.ui.port' = '0' presumably lets Spark pick any free UI port — confirm.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load every file matched by the orders_wh glob as CSV, with the header and
# inferSchema reader options switched on.
orders_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/public/trendytech/orders_wh/*")
)

In [3]:
# Preview the CSV-backed DataFrame. show() prints at most 20 rows by default and
# shortens long cell values, which is why order_date ends in "..." below.
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

### Note: The database names used below are tied to a specific lab ID; check which databases exist under your own lab ID.

In [4]:
# List the retail database(s) belonging to the current user. The LIKE pattern is
# built from `username` (resolved with getpass in the session-setup cell) rather
# than a hard-coded lab ID, so the cell works for whoever runs the notebook.
# In a LIKE pattern '%' matches any suffix (and '_' matches any single character).
spark.sql("show databases").filter(f"namespace like '{username}_ret%'")

namespace
itv006277_retail


In [5]:
# Make the current user's retail database the default for unqualified table names.
# The name is derived from `username` (set in the session-setup cell) instead of a
# hard-coded lab ID, so the notebook is portable across lab accounts.
spark.sql(f"use {username}_retail")

In [6]:
# List the tables of the current database (the one selected by the preceding "use" statement).
spark.sql("show tables")

database,tableName,isTemporary
itv006277_retail,orders,False
itv006277_retail,orders_ext,False


In [7]:
# Select only the CLOSED orders from the orders_ext table via Spark SQL.
# Note that this rebinds orders_df, which previously held the CSV-loaded data.
closed_orders_sql = "select * from orders_ext where order_status = 'CLOSED'"
orders_df = spark.sql(closed_orders_sql)

In [8]:
# Display the filtered result; every row printed below has order_status CLOSED.
orders_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-25 00:00:...|      11599|      CLOSED|
|       4|2013-07-25 00:00:...|       8827|      CLOSED|
|      12|2013-07-25 00:00:...|       1837|      CLOSED|
|      18|2013-07-25 00:00:...|       1205|      CLOSED|
|      24|2013-07-25 00:00:...|      11441|      CLOSED|
|      25|2013-07-25 00:00:...|       9503|      CLOSED|
|      37|2013-07-25 00:00:...|       5863|      CLOSED|
|      51|2013-07-25 00:00:...|      12271|      CLOSED|
|      57|2013-07-25 00:00:...|       7073|      CLOSED|
|      61|2013-07-25 00:00:...|       4791|      CLOSED|
|      62|2013-07-25 00:00:...|       9111|      CLOSED|
|      87|2013-07-25 00:00:...|       3065|      CLOSED|
|      90|2013-07-25 00:00:...|       9131|      CLOSED|
|     101|2013-07-25 00:00:...|       5116|      CLOSED|
|     116|2013-07-26 00:00:...|

In [9]:
# Load the orders_ext table as a DataFrame using its fully qualified name.
# The database part is derived from `username` (set in the session-setup cell)
# instead of a hard-coded lab ID, so the cell works for any lab account.
df = spark.table(f"{username}_retail.orders_ext")

In [10]:
# Preview the table-backed DataFrame; the rows cover all order statuses, not only CLOSED.
df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [11]:
# Single-column ("id") DataFrame counting from 0 in steps of 2; the end value 8
# is exclusive, so the ids are 0, 2, 4, 6.
spark.range(start=0, end=8, step=2)

id
0
2
4
6


In [12]:
# Peek at the first lines of the raw orders file in HDFS.
# NOTE(review): the trailing "cat: Unable to write to output stream." in the output is
# most likely just `head` closing the pipe after 10 lines, not a read failure — confirm.
! hadoop fs -cat /public/trendytech/retail_db/orders/part-00000 | head

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
cat: Unable to write to output stream.


In [13]:
# Three sample orders as plain Python tuples, copied from the first records of
# the orders file printed above: (id, date string, customer id, status).
orders_list = [
    (1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
    (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT'),
    (3, '2013-07-25 00:00:00.0', 12111, 'COMPLETE'),
]

In [14]:
# No schema is passed, so the columns get the default names _1.._4 and their
# types are inferred from the Python values (see the outputs of the next cells).
orders_raw_df = spark.createDataFrame(orders_list)

In [15]:
# Show the rows; the header carries the placeholder column names _1.._4.
orders_raw_df.show()

+---+--------------------+-----+---------------+
| _1|                  _2|   _3|             _4|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
+---+--------------------+-----+---------------+



In [16]:
# Inspect the inferred types: the Python ints became long and the strs became string.
orders_raw_df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: string (nullable = true)



In [17]:
# Create the DataFrame and give its columns descriptive names in one chained expression.
orders_raw_df = (
    spark.createDataFrame(orders_list)
    .toDF('order_id', 'order_date', 'customer_id', 'order_status')
)

In [18]:
# Same three rows, now under the descriptive column names.
orders_raw_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [19]:
# Renaming with toDF leaves the inferred types unchanged (long / string).
orders_raw_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [20]:
# Recreate the DataFrame without a schema (default names _1.._4); the rename is
# done as a separate step in the following cell.
orders_raw_df = spark.createDataFrame(orders_list)

In [21]:
# Name the four positional columns; toDF takes one new name per existing column, in order.
# Note that this rebinds orders_df once more.
orders_df = orders_raw_df.toDF(
    'order_id',
    'order_date',
    'customer_id',
    'order_status',
)

In [22]:
# Confirm the rename: the output matches the chained createDataFrame(...).toDF(...) version.
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [23]:
# toDF also works on a DataFrame that already has names: all four names are
# re-specified here, and only the last one differs (order_status -> orderstatus).
new_df = orders_df.toDF(
    'order_id',
    'order_date',
    'customer_id',
    'orderstatus',
)

In [24]:
# Only the last header changed: order_status is now orderstatus.
new_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|    orderstatus|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [25]:
# Schema given as a plain list of column names (no types); note cust_id rather
# than customer_id for the third column.
orders_schema = [
    "order_id",
    "order_date",
    "cust_id",
    "order_status",
]

In [26]:
# Pass the list of column names as the schema, so the columns are named at creation time.
# Note that this rebinds df, which previously held the orders_ext table.
df = spark.createDataFrame(data=orders_list, schema=orders_schema)

In [27]:
# The column names come from the list passed as schema (cust_id, not customer_id).
df.show()

+--------+--------------------+-------+---------------+
|order_id|          order_date|cust_id|   order_status|
+--------+--------------------+-------+---------------+
|       1|2013-07-25 00:00:...|  11599|         CLOSED|
|       2|2013-07-25 00:00:...|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|  12111|       COMPLETE|
+--------+--------------------+-------+---------------+



In [28]:
# With only names supplied, the types are still inferred: cust_id is long here.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [29]:
# Schema as a DDL-style string: "name type" pairs separated by ", ".
# This rebinds orders_schema, which was a list of names in the earlier cell.
# The adjacent literals are joined by Python into one string.
orders_schema = (
    'order_id long, '
    'order_date string, '
    'cust_id int, '
    'order_status string'
)

In [30]:
# Same data as before, but the schema now supplies explicit column types along with the names.
df = spark.createDataFrame(data=orders_list, schema=orders_schema)

In [31]:
# The displayed rows look the same as with the name-only schema; the type
# difference is visible only in printSchema() (next cell).
df.show()

+--------+--------------------+-------+---------------+
|order_id|          order_date|cust_id|   order_status|
+--------+--------------------+-------+---------------+
|       1|2013-07-25 00:00:...|  11599|         CLOSED|
|       2|2013-07-25 00:00:...|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|  12111|       COMPLETE|
+--------+--------------------+-------+---------------+



In [32]:
# cust_id is now integer, as declared in the schema string, instead of the inferred long.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [33]:
from pyspark.sql.functions import to_timestamp

In [34]:
# Replace the string order_date column with the timestamp parsed from it. Reusing
# the existing column name overwrites that column rather than adding a new one.
# Note that this rebinds new_df, which previously held the renamed orders.
new_df = df.withColumn(
    "order_date",
    to_timestamp('order_date'),
)

In [35]:
# order_date now prints in full (no fractional seconds, no "..." truncation).
new_df.show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
+--------+-------------------+-------+---------------+

