### Load your AWS Access Key/Secret.

**Never paste your Key/Secret into the notebook. Never print your Key/Secret in the notebook. Never commit your Key/Secret to GitHub.**

In [1]:
import os
# Module-level holders for the AWS credentials; filled in by get_secrets().
# Their values must never be printed or displayed in the notebook.
myAccessKey = None
mySecretKey = None


def get_secrets():
    """Load the AWS credentials from the environment into the module globals.

    Reads ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` from
    ``os.environ`` and stores them in ``myAccessKey`` / ``mySecretKey``.

    Raises:
        KeyError: if either variable is unset. The message lists the names of
            the missing variables (never their values), and neither global is
            modified in that case.
    """
    global myAccessKey
    global mySecretKey
    required = ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
    # Check both up front so a failure cannot leave one global updated and
    # the other stale.
    missing = [name for name in required if name not in os.environ]
    if missing:
        raise KeyError(
            'Missing environment variable(s): ' + ', '.join(missing)
            + '. Set them in the environment of the notebook server.'
        )
    myAccessKey, mySecretKey = (os.environ[name] for name in required)

# Load the credentials into the module-level variables; the call returns nothing, so nothing is displayed.
get_secrets()

### Set up the Spark Context and AWS S3 credentials

Reference:

[Connecting to PySpark from a Notebook](https://jupyter-docker-stacks.readthedocs.io/en/latest/using/specifics.html#in-a-python-notebook)

[Using PySpark with AWS S3](https://jupyter-docker-stacks.readthedocs.io/en/latest/using/recipes.html#using-pyspark-with-aws-s3)

In [2]:
# this will take some time (presumably because the --packages below are resolved at JVM start-up)
# PYSPARK_SUBMIT_ARGS has to be set before the SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

import pyspark
# getOrCreate reuses an already-running context, so re-executing this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local[*]"))

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Map the s3:// scheme to the native S3 filesystem and pass it the credentials
# loaded earlier; the values are handed to Hadoop directly and never displayed.
hadoopConf = sc._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3.awsAccessKeyId", myAccessKey)
hadoopConf.set("fs.s3.awsSecretAccessKey", mySecretKey)

### Let's explore some data

In [3]:
# Read the ETL user dump from S3 as JSON and show the schema Spark derived for it.
user = sqlContext.read.format("json").load(
    "s3://matters-analytics-dev/ETL/output/user.json/"
)
user.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- accountName: string (nullable = true)
 |-- avatar: string (nullable = true)
 |-- baseGravity: long (nullable = true)
 |-- bio: string (nullable = true)
 |-- derivatives: struct (nullable = true)
 |    |-- indexKeywords: string (nullable = true)
 |    |-- lastContributionTime: double (nullable = true)
 |    |-- posts: struct (nullable = true)
 |    |    |-- article: long (nullable = true)
 |    |    |-- comment: long (nullable = true)
 |    |-- serial: long (nullable = true)
 |-- displayName: string (nullable = true)
 |-- email: string (nullable = true)
 |-- everReadPosts: boolean (nullable = true)
 |-- geographicalData: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- planet: string (nullable = true)
 |-- gravity: long (nullable = true)
 |-- groups: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashType: string (nullable = true)
 |-- id: string 

In [4]:
# registerTime is stored as a floating-point number of milliseconds; convert it to a timestamp
from pyspark.sql.types import LongType, TimestampType
def ms2ts(col):
    """Convert a column of millisecond values into a timestamp column.

    The column is divided by 1000, cast to ``LongType`` and then cast to
    ``TimestampType``.

    Args:
        col: the column expression holding millisecond values.

    Returns:
        The resulting column expression after both casts.
    """
    seconds = col / 1000
    whole_seconds = seconds.cast(LongType())
    return whole_seconds.cast(TimestampType())

In [5]:
# Keep only the columns used below, with registerTime converted by ms2ts, and cache the result.
user1 = user.select(
    user.displayName,
    user.bio,
    ms2ts(user.registerTime).alias("registerTime"),
).cache()

In [6]:
# Print a text-table preview of user1.
user1.show()

+-----------+-------------------------------------+-------------------+
|displayName|                                  bio|       registerTime|
+-----------+-------------------------------------+-------------------+
|       Andy|                           寫東西的人|2017-11-30 05:07:14|
|      Isaac|                              I,Robot|2017-11-30 05:26:45|
|     Edward|                                 null|2017-11-30 06:27:22|
|      Beryl|                                     |2017-12-01 03:06:47|
|      Yuhan|                                     |2017-12-01 08:31:15|
|       潔平|                                     |2017-12-03 03:22:44|
|       曉雅|                                     |2017-12-04 09:39:16|
|       思聰|                                     |2017-12-04 15:13:11|
|       佳禾|                                     |2017-12-04 15:27:17|
|     方可成|                                     |2017-12-04 15:50:02|
|       映昕|                                     |2017-12-04 15:59:41|
|     黃哲斌|

### Comfortable with Pandas?

In [7]:
import pandas
import plotly
import plotly.graph_objs as go
# Enable plotly's offline notebook mode so figures can render inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of embedding it — confirm.
plotly.offline.init_notebook_mode(connected=True)

In [8]:
# Convert the Spark DataFrame into a local pandas DataFrame.
df = user1.toPandas()

In [9]:
# Preview the first rows of the pandas frame.
df.head()

Unnamed: 0,displayName,bio,registerTime
0,Andy,寫東西的人,2017-11-30 05:07:14
1,Isaac,"I,Robot",2017-11-30 05:26:45
2,Edward,,2017-11-30 06:27:22
3,Beryl,,2017-12-01 03:06:47
4,Yuhan,,2017-12-01 08:31:15


In [13]:
import numpy as np
# One histogram trace over the users' registration timestamps.
data = [go.Histogram(x=df['registerTime'])]

# Spell out the .html suffix so plotly does not have to append it and warn;
# the file written is the same 'User register time histogram.html'.
plotly.offline.plot(data, filename='User register time histogram.html')


Your filename `User register time histogram` didn't end with .html. Adding .html to the end of your file.



'file:///home/jovyan/work/User register time histogram.html'

In [11]:
# Render the same histogram traces inline in the notebook output.
plotly.offline.iplot(data, filename='User register time histogram')