### This notebook helps you get started with plotly on Spark using Python (pyspark) on CloudxLab.
### This is loosely based on https://plot.ly/python/apache-spark/
### It also requires you to sign up with plotly. We are not endorsing plotly; we are just helping users get started with learning it.


In [57]:
# Point this kernel at the Spark installation and make the PySpark libraries importable
import os
import sys

os.environ["SPARK_HOME"] = "/usr/hdp/current/spark-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Python interpreter used by PySpark and by the driver;
# use /usr/bin/python2.7 for both entries if you want to use Python 2
os.environ.update({
    "PYSPARK_PYTHON": "/usr/local/anaconda/bin/python",
    "PYSPARK_DRIVER_PYTHON": "/usr/local/anaconda/bin/python",
})

# Each insert goes to the front of sys.path, so pyspark.zip ends up ahead of the py4j archive
for _archive in ("/py4j-0.9-src.zip", "/pyspark.zip"):
    sys.path.insert(0, os.environ["PYLIB"] + _archive)

In [8]:
# Start Spark (or reuse the context this kernel already has) and read a sample input
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("appName")
# getOrCreate returns the already-running SparkContext when there is one, so
# re-running this cell in the same kernel does not fail the way constructing
# a second SparkContext(conf=conf) would.
sc = SparkContext.getOrCreate(conf)

# Read the word-count sample input and peek at its first 10 lines
rdd = sc.textFile("/data/mr/wordcount/input/")
print(rdd.take(10))

# Last expression of the cell: displays the Spark version in use
sc.version

['The Project Gutenberg EBook of The Adventures of Sherlock Holmes', 'by Sir Arthur Conan Doyle', '(#15 in our series by Sir Arthur Conan Doyle)', '', 'Copyright laws are changing all over the world. Be sure to check the', 'copyright laws for your country before downloading or redistributing', 'this or any other Project Gutenberg eBook.', '', 'This header should be the first thing seen when viewing this Project', 'Gutenberg file.  Please do not remove it.  Do not change or edit the']


'1.6.3'

In [58]:
# Make print() a function on Python 2 as well, so the cell runs on both Python 2 and 3.
# A __future__ import has to be the first statement of the cell.
from __future__ import print_function #python 3 support
# Print the SparkContext created earlier to confirm it is available in this kernel
print(sc)

<pyspark.context.SparkContext object at 0x7fa0e4f5a208>


In [60]:
# List the files under /data/spark in HDFS; the leading "!" runs the line as a shell command
!hadoop fs -ls /data/spark

Found 23 items
-rw-r--r--   3 hdfs hdfs       5542 2018-08-16 06:43 /data/spark/books.xml
-rw-r--r--   3 hdfs hdfs   41125386 2018-12-14 15:01 /data/spark/btd2.json
-rw-r--r--   3 hdfs hdfs        597 2018-08-16 06:43 /data/spark/episodes.avro
-rw-r--r--   3 hdfs hdfs        240 2018-08-16 06:43 /data/spark/full_user.avsc
drwxr-xr-x   - hdfs hdfs          0 2018-08-16 06:43 /data/spark/graphx
-rw-r--r--   3 hdfs hdfs         72 2018-08-16 06:43 /data/spark/kmeans_data.txt
-rw-r--r--   3 hdfs hdfs       5812 2018-08-16 06:43 /data/spark/kv1.txt
drwxr-xr-x   - hdfs hdfs          0 2018-08-16 06:43 /data/spark/mllib
-rw-r--r--   3 hdfs hdfs     972009 2018-08-16 06:43 /data/spark/mysql-connector-java-5.1.36-bin.jar
drwxr-xr-x   - hdfs hdfs          0 2018-08-16 06:43 /data/spark/pb
-rw-r--r--   3 hdfs hdfs         73 2018-08-16 06:43 /data/spark/people.json
-rw-r--r--   3 hdfs hdfs         32 2018-08-16 06:43 /data/spark/people.txt
drwxr-xr-x   - hdfs hdfs          0 2018-08-

In [61]:
# Imports needed for plotting, plus the SQLContext used to query the data

import pandas as pd
import requests
import plotly.graph_objs as go
import plotly.plotly as py
from pyspark.sql import SQLContext

# Silence the warnings urllib3 emits through requests
requests.packages.urllib3.disable_warnings()

# SQL entry point bound to the SparkContext created earlier
sqlContext = SQLContext(sc)

In [63]:
# Load the JSON data set into a DataFrame.
# sqlContext.jsonFile() is deprecated (Spark itself prints "Use read.json() instead"),
# so go through the DataFrameReader API.
btd = sqlContext.read.json("/data/spark/btd2.json")
print(type(btd))
# Show the inferred column names and types
btd.printSchema()


jsonFile is deprecated. Use read.json() instead.



<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- Bike #: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- End Date: string (nullable = true)
 |-- End Station: string (nullable = true)
 |-- End Terminal: string (nullable = true)
 |-- Start Date: string (nullable = true)
 |-- Start Station: string (nullable = true)
 |-- Start Terminal: string (nullable = true)
 |-- Subscription Type: string (nullable = true)
 |-- Trip ID: string (nullable = true)
 |-- Zip Code: string (nullable = true)



In [65]:
# Peek at the first three rows to see what the records look like
btd.head(3)

[Row(Bike #='520', Duration='63', End Date='8/29/13 14:14', End Station='South Van Ness at Market', End Terminal='66', Start Date='8/29/13 14:13', Start Station='South Van Ness at Market', Start Terminal='66', Subscription Type='Subscriber', Trip ID='4576', Zip Code='94127'),
 Row(Bike #='661', Duration='70', End Date='8/29/13 14:43', End Station='San Jose City Hall', End Terminal='10', Start Date='8/29/13 14:42', Start Station='San Jose City Hall', Start Terminal='10', Subscription Type='Subscriber', Trip ID='4607', Zip Code='95138'),
 Row(Bike #='48', Duration='71', End Date='8/29/13 10:17', End Station='Mountain View City Hall', End Terminal='27', Start Date='8/29/13 10:16', Start Station='Mountain View City Hall', Start Terminal='27', Subscription Type='Subscriber', Trip ID='4130', Zip Code='97214')]

In [66]:
# Register the DataFrame as a table so that it can be queried with SQL
sqlContext.registerDataFrameAsTable(btd, "bay_area_bike")
# Duration is a string column (see the printed schema), so cast it explicitly:
# the filter no longer relies on implicit string-to-number coercion, and d1
# reaches pandas/plotly as a numeric column instead of strings.
df2 = sqlContext.sql(
    "SELECT CAST(Duration AS DOUBLE) AS d1 "
    "FROM bay_area_bike "
    "WHERE CAST(Duration AS DOUBLE) < 100"
)


In [67]:
# Sanity check: count the rows returned by the query to confirm the filter matched some data
df2.count()

553

In [68]:
# Build the list of traces to plot: a single histogram over the d1 column,
# pulled from Spark into a local pandas object first
durations = df2.toPandas()['d1']
data = [go.Histogram(x=durations)]

Note that this requires you to sign up with plotly at https://plot.ly/api_signup
and save the credentials using:

    mkdir ~/.plotly
    nano ~/.plotly/.credentials
    
Copy and paste the values shown at https://plot.ly/settings/api#/ into that file in this format:

    {
        "username": "<Your User Name>",
        "api_key": "<Your API key>",
        "proxy_username": "",
        "proxy_password": "",
        "stream_ids": []
    }


See https://plot.ly/python/getting-started/#initialization-for-online-plotting


In [53]:
# Plot the histogram traces built earlier; the plotly credentials file must already be in place
py.iplot(data)

For the rest, please follow:
    
    https://plot.ly/python/apache-spark/