In [1]:
import os
import pyspark
import math 
from datetime import datetime
from pyspark.sql import SQLContext, Row
from pyspark import SparkContext
from pyspark.sql import functions as F

# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# NOTE(review): SQLContext is reported as deprecated in Spark 3.x in favour of
# SparkSession — confirm against the Spark version this notebook targets.
sqlContext = SQLContext(sc)

In [2]:
# Display the SparkContext created in the setup cell.
sc

## 1) Initialize RDDs

In [3]:
# Build the four small RDDs used by the demos below.
# NOTE: despite the shared "int" prefix, only intRDD holds integers;
# intRDD2 holds strings and intRDD3 / intRDD4 hold dicts.
intRDD = sc.parallelize([6, 7, 1, 2, 0])
intRDD2 = sc.parallelize(["apple", "car", "pan"])
intRDD3 = sc.parallelize([
    {"foo": 1, "bar": 2},
    {"foo": 3, "baz": -1, "bar": 5},
])
# Same records as intRDD3, plus a "zz" field whose value is a dict-like *string*.
intRDD4 = sc.parallelize([
    {"foo": 1, "bar": 2, "zz": "{'a':1 , 'b':2}"},
    {"foo": 3, "baz": -1, "bar": 5, "zz": "{'a':1 , 'b':2}"},
])

In [4]:
# Collect the integer RDD into a Python list to inspect its contents.
intRDD.collect()

[6, 7, 1, 2, 0]

In [5]:
# Inspect the RDD of strings.
intRDD2.collect()

['apple', 'car', 'pan']

In [6]:
# Inspect the RDD of dicts (note the records do not all share the same keys).
intRDD3.collect()

[{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': -1, 'bar': 5}]

In [7]:
# Inspect the RDD of dicts that also carry the dict-like string under "zz".
intRDD4.collect()

[{'foo': 1, 'bar': 2, 'zz': "{'a':1 , 'b':2}"},
 {'foo': 3, 'baz': -1, 'bar': 5, 'zz': "{'a':1 , 'b':2}"}]

## 2) RDD operations demo (`map` vs. `flatMap`)

In [8]:
# map: one output element per record, so each "foo" value stays wrapped in its own list.
intRDD3.map(lambda rec: [rec['foo']]).collect()

[[1], [3]]

In [9]:
# flatMap: the per-record lists are flattened, yielding the bare "foo" values.
intRDD3.flatMap(lambda rec: [rec['foo']]).collect()

[1, 3]

In [10]:
# flatMap with two values per record: "foo" and "bar" end up interleaved in one flat list.
intRDD3.flatMap(lambda rec: [rec['foo'], rec['bar']]).collect()

[1, 2, 3, 5]

In [11]:
# Project every record onto [foo, bar, zz]; "zz" is still a plain string here.
intRDD4.map(lambda rec: [rec['foo'], rec['bar'], rec['zz']]).collect()

[[1, 2, "{'a':1 , 'b':2}"], [3, 5, "{'a':1 , 'b':2}"]]

## 3) Python: convert a dict-like string to a dict

In [12]:
# Turn a Python-style dict string into a real dict via the JSON parser.
import json

s = "{'muffin' : 'lolz', 'foo' : 'kitty'}"

# JSON requires double-quoted strings, so swap every single quote first.
# Caveat: this swaps *every* apostrophe, including any that sit inside a value.
json_acceptable_string = s.replace("'", '"')

d = json.loads(json_acceptable_string)
d

{'muffin': 'lolz', 'foo': 'kitty'}

## 4) Map a user-defined function over an RDD

In [13]:
def str_2_dict(s):
    """Parse a dict-like string into a Python object.

    The string is first parsed as a Python literal, which handles the
    single-quoted ``repr``-style text used in this notebook, values that
    contain apostrophes (``{'msg': "it's"}``) and Python constants
    (``True``, ``False``, ``None``).  If that fails, the original strategy
    is used as a fallback: every single quote is replaced by a double quote
    and the result is parsed as JSON, so inputs using JSON constants
    (``true``, ``false``, ``null``) keep working.

    Args:
        s: The string to parse, e.g. ``"{'a':1 , 'b':2}"``.

    Returns:
        The parsed object (a ``dict`` for dict-like input).

    Raises:
        json.JSONDecodeError: If ``s`` is neither a valid Python literal nor
            JSON after the quote replacement.
    """
    # Imports stay function-local, as in the original, so the function is
    # self-contained when it is shipped to Spark workers via ``rdd.map``.
    import ast
    import json

    try:
        # literal_eval only evaluates literals; it never executes code.
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. uses true/false/null): use the legacy path.
        pass

    json_acceptable_string = s.replace("'", "\"")
    return json.loads(json_acceptable_string)
   
    
# result1: the raw "zz" string of every record, taken as the third field of
# the [foo, bar, zz] projection.
result1 = (
    intRDD4
    .map(lambda rec: [rec['foo'], rec['bar'], rec['zz']])
    .map(lambda fields: fields[2])
)

# result2: the same strings parsed into dicts. RDD transformations are lazy,
# so building on result1 describes the same pipeline as repeating the chain.
result2 = result1.map(str_2_dict)

In [14]:
# Elements of result1 are still plain strings.
result1.collect()

["{'a':1 , 'b':2}", "{'a':1 , 'b':2}"]

In [15]:
# Elements of result2 have been parsed into dicts by str_2_dict.
result2.collect()

[{'a': 1, 'b': 2}, {'a': 1, 'b': 2}]