In [1]:
# findspark locates the local Spark installation and puts pyspark on sys.path,
# so this has to run before the `from pyspark import ...` cell that follows.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Build the context through getOrCreate() so that re-running this cell in a live
# kernel reuses the active SparkContext instead of failing with
# "Cannot run multiple SparkContexts at once".
# Master "local[1]" and application name "CACHE" are unchanged.
spark_conf = SparkConf().setMaster("local[1]").setAppName("CACHE")
sc = SparkContext.getOrCreate(spark_conf)
sc.setLogLevel("WARN")  # only WARN and above are logged

---

In [3]:
# Snapshot of the running context: versions, application identity, default partitioning.
(
    sc.version,
    sc.pythonVer,
    sc.appName,
    sc.applicationId,
    sc.defaultParallelism,
    sc.defaultMinPartitions,
)

('2.4.7', '3.7', 'CACHE', 'local-1620413792493', 1, 1)

---

In [4]:
# (state code, sale amount) pairs; 'CA' appears twice, which gives the
# by-key operations in later cells duplicate keys to combine.
sales = [
    ('CA', 10),
    ('WA', 20),
    ('DC', 14),
    ('TX', 12),
    ('CA', 13),
]

# Distribute the local list as an RDD.
rdd_sales = sc.parallelize(sales)

In [5]:
"Return a list of tuples"
# collect() returns every element of the RDD to the driver as one Python list.
rdd_sales.collect()

[('CA', 10), ('WA', 20), ('DC', 14), ('TX', 12), ('CA', 13)]

In [6]:
"Return a dict"
# Gotcha: a dict keeps a single value per key, so of the two 'CA' sales only
# one survives (the output shows 13; the 10 is lost).
rdd_sales.collectAsMap()

{'CA': 13, 'WA': 20, 'DC': 14, 'TX': 12}

In [7]:
"RDD empty?"
# An RDD built from an empty list has no elements, so isEmpty() reports True.
sc.parallelize([]).isEmpty()

True

In [8]:
"Group by Key: Accumulate values based on Key"
# Add up the amounts of all sales that share a state code.
(
    rdd_sales
    .reduceByKey(lambda running_total, amount: running_total + amount)
    .collect()
)

[('CA', 23), ('WA', 20), ('DC', 14), ('TX', 12)]

---

In [9]:
"""
.groupBy(): is generic, FLEXIBLE [Free to choose what can be a key]
t: tuple

.mapValues(): convert the iterables into actual values. 
"""
# Key each element by its first field. Without .mapValues() the grouped values
# come back as opaque ResultIterable objects.
(
    rdd_sales
    .groupBy(lambda t: t[0])
    .collect()
)

[('CA', <pyspark.resultiterable.ResultIterable at 0x242c7f69bc8>),
 ('WA', <pyspark.resultiterable.ResultIterable at 0x242c7f69b88>),
 ('DC', <pyspark.resultiterable.ResultIterable at 0x242c7f69d48>),
 ('TX', <pyspark.resultiterable.ResultIterable at 0x242c7f69908>)]

In [10]:
"""
.mapValues()
Return the above Iterables (grouped via .groupBy()) into actual values.
"""
# list() is applied to each group's iterable so the members become visible;
# every member is still the full (state, amount) tuple.
(
    rdd_sales
    .groupBy(lambda pair: pair[0])
    .mapValues(list)
    .collect()
)

[('CA', [('CA', 10), ('CA', 13)]),
 ('WA', [('WA', 20)]),
 ('DC', [('DC', 14)]),
 ('TX', [('TX', 12)])]

In [11]:
# Bucket the sales by amount. The lower bucket is labelled "<= 13" because the
# condition `t[1] > 13` sends an amount of exactly 13 there (see ('CA', 13)).
rdd_sales.groupBy(lambda t: "> 13" if t[1] > 13 else "<= 13").mapValues(list).collect()

[('<= 13', [('CA', 10), ('TX', 12), ('CA', 13)]),
 ('> 13', [('WA', 20), ('DC', 14)])]

In [12]:
".groupByKey() uses the first part of the tuple as key"
# Unlike groupBy(), the grouped values here are only the amounts, not the whole tuples.
(
    rdd_sales
    .groupByKey()
    .mapValues(list)
    .collect()
)

[('CA', [10, 13]), ('WA', [20]), ('DC', [14]), ('TX', [12])]

---

In [22]:
rddA = sc.parallelize(range(1, 11))  # 1..10
rddB = sc.parallelize(range(7, 21))  # 7..20
rddC = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 3, 4, 5, 6, 7, 10])  # contains repeats

# Set-style operations, printed one result per line in this order.
set_operation_results = (
    rddA.union(rddB),         # plain concatenation: the shared 7..10 show up twice
    rddA.intersection(rddB),  # values present in both RDDs
    rddA.subtract(rddB),      # values of rddA that are not in rddB
    rddC.distinct(),          # repeats removed
)
for operation_result in set_operation_results:
    print(operation_result.collect())

# Summary statistics of rddA; the printed stdev (~2.87) matches the population
# standard deviation of 1..10 (the sample one would be ~3.03).
print(rddA.min(), rddA.max(), rddA.mean(), rddA.stdev())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
[8, 10, 7, 9]
[2, 4, 6, 1, 3, 5]
[1, 2, 3, 4, 5, 6, 7, 10]
1 10 5.5 2.8722813232690143


In [26]:
(
    rdd_sales.keys().collect(),             # every key, duplicates included
    rdd_sales.keys().distinct().collect(),  # each key once
)

(['CA', 'WA', 'DC', 'TX', 'CA'], ['CA', 'WA', 'DC', 'TX'])

In [30]:
"Check jupyter console for print's output"

def show(t):
    """Print one RDD element."""
    print(t)


# foreach() returns nothing, so this cell has no notebook output; the printed
# lines appear in the console instead.
rdd_sales.foreach(show)

In [32]:
(
    rdd_sales.count(),         # total number of elements
    rdd_sales.countByKey(),    # occurrences of each state code
    rdd_sales.countByValue(),  # occurrences of each whole (state, amount) tuple
)

(5,
 defaultdict(int, {'CA': 2, 'WA': 1, 'DC': 1, 'TX': 1}),
 defaultdict(int,
             {('CA', 10): 1,
              ('WA', 20): 1,
              ('DC', 14): 1,
              ('TX', 12): 1,
              ('CA', 13): 1}))

In [39]:
".sortByKey()"
(
    rdd_sales.sortByKey().collect(),                 # ascending when no argument is given
    rdd_sales.sortByKey(ascending=False).collect(),  # descending
)

([('CA', 10), ('CA', 13), ('DC', 14), ('TX', 12), ('WA', 20)],
 [('WA', 20), ('TX', 12), ('DC', 14), ('CA', 10), ('CA', 13)])

In [40]:
".sortBy: Custom sort"
# The sort key is the last character of the state code ('A' < 'C' < 'X').
(
    rdd_sales
    .sortBy(lambda pair: pair[0][-1])
    .collect()
)

[('CA', 10), ('WA', 20), ('CA', 13), ('DC', 14), ('TX', 12)]

---

In [None]:
".coalesce() LATER"

---

In [47]:
"Joins"
# Read the state lookup file as raw text lines.
rdd_statecodes = sc.textFile("hdfs://192.168.93.128:9000/input/all_us_states.csv")

# The first line is taken to be the header; every line equal to it is dropped.
header = rdd_statecodes.first()

# Split each remaining line on commas and turn it into a tuple.
rdd_statecodes = (
    rdd_statecodes
    .filter(lambda line: line != header)
    .map(lambda line: tuple(line.split(",")))
)

In [48]:
# Peek at the first two parsed rows to confirm the (code, name) shape.
rdd_statecodes.take(2)

[('AL', 'Alabama'), ('AK', 'Alaska')]

In [50]:
# Join on the state code; each result is (code, (amount, state name)).
rdd_sales.join(rdd_statecodes).collect()

[('CA', (10, 'California')),
 ('CA', (13, 'California')),
 ('DC', (14, 'District of Columbia')),
 ('TX', (12, 'Texas')),
 ('WA', (20, 'Washington'))]

In [53]:
# Right outer join: rdd_sales is the right-hand side, so every sale is kept.
# Values are (state name, amount); every sale finds its state in this data.
rdd_statecodes.rightOuterJoin(rdd_sales).collect()

[('CA', ('California', 10)),
 ('CA', ('California', 13)),
 ('DC', ('District of Columbia', 14)),
 ('TX', ('Texas', 12)),
 ('WA', ('Washington', 20))]

In [54]:
# Full outer join: keys from both sides are kept; a state without any sale
# gets None in the amount position.
rdd_statecodes.fullOuterJoin(rdd_sales).collect()

[('AK', ('Alaska', None)),
 ('AR', ('Arkansas', None)),
 ('CA', ('California', 10)),
 ('CA', ('California', 13)),
 ('CO', ('Colorado', None)),
 ('CT', ('Connecticut', None)),
 ('DE', ('Delaware', None)),
 ('DC', ('District of Columbia', 14)),
 ('HI', ('Hawaii', None)),
 ('KS', ('Kansas', None)),
 ('LA', ('Louisiana', None)),
 ('ME', ('Maine', None)),
 ('MO', ('Missouri', None)),
 ('NE', ('Nebraska', None)),
 ('NM', ('New Mexico', None)),
 ('NY', ('New York', None)),
 ('NC', ('North Carolina', None)),
 ('OH', ('Ohio', None)),
 ('RI', ('Rhode Island', None)),
 ('SC', ('South Carolina', None)),
 ('SD', ('South Dakota', None)),
 ('TN', ('Tennessee', None)),
 ('TX', ('Texas', 12)),
 ('VA', ('Virginia', None)),
 ('WV', ('West Virginia', None)),
 ('WI', ('Wisconsin', None)),
 ('WY', ('Wyoming', None)),
 ('AL', ('Alabama', None)),
 ('AZ', ('Arizona', None)),
 ('FL', ('Florida', None)),
 ('GA', ('Georgia', None)),
 ('ID', ('Idaho', None)),
 ('IL', ('Illinois', None)),
 ('IN', ('Indiana', None))