# 載入 PySpark 函式庫並建立 SparkContext 與 SQLContext

In [1]:
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkContext, SparkConf

# Spark configuration for this notebook: "local" master, named application.
sparkConf = (
    SparkConf()
    .setMaster("local")
    .setAppName("read-mongodb-2")
)

# getOrCreate() reuses an already-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed
# in the same kernel; on a fresh kernel it creates one from sparkConf.
sc = SparkContext.getOrCreate(conf=sparkConf)
sqlContext = SQLContext(sc)

# 設定遠端 MongoDB 連線資訊並將資料載入為 Spark DataFrame

In [2]:
# Load the MongoDB collection named in the input URI into a Spark DataFrame
# through the MongoDB Spark connector data source.
df = (
    sqlContext.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option(
        "spark.mongodb.input.uri",
        "mongodb://mongodb:27017/ltu_demo.data_2019063020",
    )
    .load()
)

# 打印 Spark DataFrame 第一筆資料內容

In [3]:
# Display the first row of the Spark DataFrame (returned as a Row).
df.first()

Row(Humidity=66, Temperature=27.0, _id=Row(oid='5d1b8d40be567be7cc90353b'), device_id='001', timestamp='2019-07-03 00:37:12')

# 將 Dataframe 轉換為 Spark rdd 格式

In [4]:
# Get the DataFrame's content as an RDD via its .rdd attribute.
rdd = df.rdd 

# 打印 Spark rdd 第一筆資料內容
## 後續接 Spark RDD 課程

In [5]:
# Display the first element of the RDD obtained from the DataFrame.
rdd.first()

Row(Humidity=66, Temperature=27.0, _id=Row(oid='5d1b8d40be567be7cc90353b'), device_id='001', timestamp='2019-07-03 00:37:12')

# 篩選出 Temperature 等於 27 的資料

In [7]:
# Keep only the rows whose Temperature equals 27 and print them.
# The column is referenced by its exact schema name ("Temperature") so the
# lookup does not rely on spark.sql.caseSensitive being left at its default.
df.filter(df["Temperature"] == 27).show()

+--------+-----------+--------------------+---------+-------------------+
|Humidity|Temperature|                 _id|device_id|          timestamp|
+--------+-----------+--------------------+---------+-------------------+
|      66|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:37:12|
|      66|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:41:30|
|      64|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:42:47|
|      64|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:44:49|
|      64|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:45:10|
|      63|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:45:51|
|      64|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:46:43|
|      65|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:47:24|
|      65|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:47:35|
|      67|       27.0|[5d1b8d40be567be7...|      001|2019-07-03 00:48:42|
|      67|       27.0|[5d1b8d40be567be

# 將 Spark DataFrame 轉換成 pandas DataFrame 格式

In [8]:
# Convert the Spark DataFrame into a pandas DataFrame.
pandas_df = df.toPandas()

# 增加一個欄位 operator（每筆資料的值皆為 "Andy"）

In [9]:
# Add an "operator" column; the scalar "Andy" is assigned to every row.
pandas_df["operator"] = "Andy"

# 打印 pandas_df 資料內容

In [10]:
# Display the pandas DataFrame, including the newly added "operator" column.
pandas_df

Unnamed: 0,Humidity,Temperature,_id,device_id,timestamp,operator
0,66,27.0,"(5d1b8d40be567be7cc90353b,)",001,2019-07-03 00:37:12,Andy
1,66,27.0,"(5d1b8d40be567be7cc90353c,)",001,2019-07-03 00:41:30,Andy
2,64,27.0,"(5d1b8d40be567be7cc90353d,)",001,2019-07-03 00:42:47,Andy
3,64,27.0,"(5d1b8d40be567be7cc90353e,)",001,2019-07-03 00:44:49,Andy
4,64,27.0,"(5d1b8d40be567be7cc90353f,)",001,2019-07-03 00:45:10,Andy
5,63,27.0,"(5d1b8d40be567be7cc903540,)",001,2019-07-03 00:45:51,Andy
6,64,27.0,"(5d1b8d40be567be7cc903541,)",001,2019-07-03 00:46:43,Andy
7,65,27.0,"(5d1b8d40be567be7cc903542,)",001,2019-07-03 00:47:24,Andy
8,65,27.0,"(5d1b8d40be567be7cc903543,)",001,2019-07-03 00:47:35,Andy
9,67,27.0,"(5d1b8d40be567be7cc903544,)",001,2019-07-03 00:48:42,Andy


In [11]:
# Stop the SparkContext now that the notebook is finished with it.
sc.stop()