# ITVERSITY CCA-175 Practice

## For LOCAL MACHINE SETUP

In [None]:
import findspark as fs
import os
fs.init()
fs.find()

data_path=os.path.dirname("D://Bigdata Tutorials//data//retail_db//")
data_path_json=os.path.dirname("D://Bigdata Tutorials//data//retail_db_json//")

from pyspark.sql import SparkSession,SQLContext,HiveContext
spark=SparkSession.builder.appName('ITVERSITY').master('local').getOrCreate()
sc=spark.sparkContext
sqlcontext=SQLContext(sc)
sc

## For VAGRANT SETUP

In [None]:
!start-dfs.sh
!start-yarn.sh

In [1]:
from pyspark.sql import SparkSession,SQLContext,HiveContext
spark=SparkSession.builder.appName('itVERSITY').master('yarn').getOrCreate()
sc=spark.sparkContext
sqlcontext=SQLContext(sc)
sc

In [None]:
!stop-dfs.sh
!stop-yarn.sh

## Let's LOAD some DATA
### LOCAL FILE

In [None]:
orderItems=sc.textFile(os.path.join(data_path,"order_items"))
orders=sc.textFile(os.path.join(data_path,"orders"))

### For VM

In [2]:
orderItems=sc.textFile("/public/retail_db/order_items")
orders=sc.textFile("/public/retail_db/orders")


### Let's find Some stuff out

In [3]:
# It contains OrderID
int(orderItems.first().split(",")[1])

1

In [4]:
# It contains Revenue per order Item
float(orderItems.first().split(",")[4])

299.98

In [5]:
orderItemsMap=orderItems.map(lambda x:(int(x.split(",")[1]),float(x.split(",")[4])))

In [6]:
orderItemsMap.take(5)

[(1, 299.98), (2, 199.99), (2, 250.0), (2, 129.99), (4, 49.98)]

### Now we can hold 1st element as KEY and reduce the Values using add.

In [7]:
from operator import add
revenuePerOrder=orderItemsMap.reduceByKey(add)

In [8]:
for i in revenuePerOrder.take(10): print(i)

(2, 579.98)
(4, 699.85)
(8, 729.8399999999999)
(10, 651.9200000000001)
(12, 1299.8700000000001)
(14, 549.94)
(16, 419.93)
(18, 449.96000000000004)
(20, 879.8599999999999)
(24, 829.97)


### Transformations follow LazyEvaluation.

#### LazyEvaluation simply uses a DAG(Directed Acyclic Graph) to store all the information related to the Transformations being made.

#### As soon as an Action is 'run', spark executes the DAG first and then the action.

Let's find DAG of transformations:


In [9]:
orderItems.toDebugString()

b'(2) /public/retail_db/order_items MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n |  /public/retail_db/order_items HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'

In [10]:
orderItemsMap.toDebugString()

b'(2) PythonRDD[12] at RDD at PythonRDD.scala:53 []\n |  /public/retail_db/order_items MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n |  /public/retail_db/order_items HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'

In [11]:
revenuePerOrder.toDebugString()

b'(2) PythonRDD[13] at RDD at PythonRDD.scala:53 []\n |  MapPartitionsRDD[10] at mapPartitions at PythonRDD.scala:133 []\n |  ShuffledRDD[9] at partitionBy at NativeMethodAccessorImpl.java:0 []\n +-(2) PairwiseRDD[8] at reduceByKey at <ipython-input-7-409d4c32b590>:2 []\n    |  PythonRDD[7] at reduceByKey at <ipython-input-7-409d4c32b590>:2 []\n    |  /public/retail_db/order_items MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n    |  /public/retail_db/order_items HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'

### Now we can open spark job UI and look into DAG visualization.


### NOTE: DO NOT USE collect() to preview data in REAL LIFE ENVIRONMENT!!!

### Another way tp create an RDD is to open a file using open and making a LIST out of it. (a collection)

In [None]:
productsRaw=open(data_path+"//products").read().splitlines()
type(productsRaw)
productsRaw=sc.parallelize(productsRaw)
type(productsRaw)
print(productsRaw.first())
type(productsRaw.first())
### Now, we can create an RDD using Parallelize from the collection.

## DATA FRAME

#### --provided by sqlContext

#### Now we can try loading multiple file formats.

In [12]:
sqlcontext.read?

##same as spaek.read?

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x7fded6f8c818>
[0;31mDocstring:[0m  
Returns a :class:`DataFrameReader` that can be used to read data
in as a :class:`DataFrame`.

:return: :class:`DataFrameReader`

.. versionadded:: 1.4


In [None]:
df = spark.read.json(data_path_json+"//order_items")

In [None]:
df.show(5)

In [None]:
df1=spark.read.format("json").load(data_path_json+"//order_items")

In [None]:
df1.show(5)

## Let's continue with the Orders and OrderItems Example

### Extract OrderStatus: (YYYYMMDD,count)

In [13]:
ordersMap=orders.map(lambda x: (x.split(",")[1].split(" ")[0].replace("-",""),x.split(",")[3]))
ordersMap.first()

('20130725', 'CLOSED')

In [14]:
# To get data of COMPLETE ORDERS OR CLOSED orders
ordersComplete=orders.filter(lambda x: x.split(",")[3] == 'COMPLETE' or x.split(",")[3] == 'CLOSED')
ordersComplete.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE']

In [15]:
#To get CLOSED/COMPLETE orders in 2014-01
ordersComplete=orders.filter(lambda x: (x.split(",")[1][:7]=='2014-01') and (x.split(",")[3] in ['COMPLETE','CLOSED'] ))
ordersComplete.take(5)

['25882,2014-01-01 00:00:00.0,4598,COMPLETE',
 '25888,2014-01-01 00:00:00.0,6735,COMPLETE',
 '25889,2014-01-01 00:00:00.0,10045,COMPLETE',
 '25891,2014-01-01 00:00:00.0,3037,CLOSED',
 '25895,2014-01-01 00:00:00.0,1044,COMPLETE']