In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext, rdd
# "local[1]" runs Spark locally with a single worker thread (the file loaded
# below ends up in one partition); "local[*]" would instead use one worker
# thread per available core.
sc = SparkContext(master="local[1]", appName="US-ZIPCODES")

In [None]:
# All other imports
import re

----

In [59]:
# Read the zipcode CSV from HDFS as an RDD of raw text lines (header row included).
zipcodes_csv_uri = "hdfs://192.168.93.128:9000/input/all_us_zipcodes.csv"
rdd_us_zipcodes = sc.textFile(zipcodes_csv_uri)

In [60]:
# Report how many partitions the freshly loaded RDD has.
partition_count = rdd_us_zipcodes.getNumPartitions()
print(f"Partitions : {partition_count}")

Partitions : 1


In [61]:
# The first line of the file is the column header; keep it as a plain string.
rdd_header = rdd_us_zipcodes.first()

# Drop every line equal to the header, then split each remaining line on commas
# into a tuple of string fields.
rdd_data_lines = rdd_us_zipcodes.filter(lambda line: line != rdd_header)
rdd_zipcodes = rdd_data_lines.map(lambda line: tuple(line.split(",")))

In [7]:
# Peek at the first four parsed records.
rdd_zipcodes.take(4)

[('00501', 'Holtsville', 'NY', 'SUFFOLK', '', '40.922326', '-72.637078'),
 ('00544', 'Holtsville', 'NY', 'SUFFOLK', '', '40.922326', '-72.637078'),
 ('01001', 'Agawam', 'MA', 'HAMPDEN', '', '42.140549', '-72.788661'),
 ('01002', 'Amherst', 'MA', 'HAMPSHIRE', '', '42.367092', '-72.464571')]

In [125]:
# repartition(n) redistributes the data into n partitions, which is how the
# chunking of the data is scaled up or down.
# Note: this rebinds rdd_us_zipcodes from the raw text lines to the parsed tuples.

rdd_us_zipcodes = rdd_zipcodes.repartition(4)
partition_count = rdd_us_zipcodes.getNumPartitions()
print(f"Partitions : {partition_count}")

Partitions : 4


----

In [126]:
def show_data(partition_name, limit=6):
    """Print a separator line and return the leading elements of one partition.

    Intended for ``RDD.mapPartitions``, which calls it once per partition.

    Args:
        partition_name: Iterable over the elements of a single partition
            (despite the name, this is the partition's data, not a label).
        limit: Maximum number of elements to keep. Defaults to 6.

    Returns:
        A lazy iterator over at most ``limit`` leading elements.
    """
    from itertools import islice  # local import keeps the helper self-contained

    print("----------------------------------------------")
    # islice stops after `limit` items instead of walking the whole partition.
    return islice(partition_name, limit)

# Apply show_data to every partition and bring the sampled rows back to the driver.
# NOTE(review): `rdd` is also the name of the `pyspark.rdd` module imported at the
# top of the notebook; this assignment shadows that module from here on.
rdd = rdd_us_zipcodes.mapPartitions(show_data)
rdd.collect()

[('01010', 'Brimfield', 'MA', 'HAMPDEN', '', '42.108585', '-72.20448'),
 ('01011', 'Chester', 'MA', 'HAMPDEN', '', '42.294259', '-72.952776'),
 ('01012', 'Chesterfield', 'MA', 'HAMPSHIRE', '', '42.392274', '-72.825607'),
 ('01013', 'Chicopee', 'MA', 'HAMPDEN', '', '42.161492', '-72.667341'),
 ('01014', 'Chicopee', 'MA', 'HAMPDEN', '', '42.170731', '-72.604842'),
 ('01020', 'Chicopee', 'MA', 'HAMPDEN', '', '42.177492', '-72.562563'),
 ('01028', 'East Longmeadow', 'MA', 'HAMPDEN', '', '42.062009', '-72.49874'),
 ('01029', 'East Otis', 'MA', 'BERKSHIRE', '', '42.190904', '-73.051661'),
 ('01030', 'Feeding Hills', 'MA', 'HAMPDEN', '', '42.189335', '-72.79774'),
 ('01031', 'Gilbertville', 'MA', 'WORCESTER', '', '42.352554', '-72.205724'),
 ('01032', 'Goshen', 'MA', 'HAMPSHIRE', '', '42.443837', '-72.819446'),
 ('01033', 'Granby', 'MA', 'HAMPSHIRE', '', '42.262285', '-72.504086'),
 ('01038', 'Hatfield', 'MA', 'HAMPSHIRE', '', '42.387269', '-72.643081'),
 ('01039', 'Haydenville', 'MA', 'HAMPS

In [128]:
rdd.glom().count()  # .glom() turns each partition into a single list, so count() gives the number of partitions

4

----

In [172]:
# Build (zipcode, record) pairs: the key is the first field of each record tuple.
rdd_zipcodes_keyed = rdd_zipcodes.keyBy(lambda record: record[0])

In [174]:
# Inspect two of the (zipcode, record) pairs.
rdd_zipcodes_keyed.take(2)

[('00501',
  ('00501', 'Holtsville', 'NY', 'SUFFOLK', '', '40.922326', '-72.637078')),
 ('00544',
  ('00544', 'Holtsville', 'NY', 'SUFFOLK', '', '40.922326', '-72.637078'))]

In [175]:
"""
7 is the modulo by which data partitions are grouped (if this is the only argument that's passed)
The first value in the tuple is hash'd and grouped into its partition (default)
"""

rdd_custom_partition = rdd_zipcodes_keyed.partitionBy(7)


In [184]:
"""
7 is the modulo by which data partitions are grouped (if this is the only argument that's passed)
The first value in the tuple is hash'd and grouped into its partition (default)
Bypass default hash'ing by passing custom hash-partitioning logic as 2nd argument
.partitionBy() uses pyspark.rdd.portable_hash() by default
"""

rdd_custom_partition = rdd_zipcodes_keyed.partitionBy(10, lambda x: hash(x))


In [180]:
# Confirm the number of partitions produced by partitionBy.
rdd_custom_partition.getNumPartitions()

10

In [181]:
# Display the RDD object's repr.
rdd_custom_partition

MapPartitionsRDD[250] at mapPartitions at PythonRDD.scala:133

In [186]:
# Materialize the first partition as a list to see which keys were grouped together.
rdd_custom_partition.glom().take(1)

[[('01001',
   ('01001', 'Agawam', 'MA', 'HAMPDEN', '', '42.140549', '-72.788661')),
  ('01008',
   ('01008', 'Blandford', 'MA', 'HAMPDEN', '', '42.177833', '-72.958359')),
  ('01026',
   ('01026', 'Cummington', 'MA', 'HAMPSHIRE', '', '42.428617', '-72.909841')),
  ('01056',
   ('01056', 'Ludlow', 'MA', 'HAMPDEN', '', '42.173276', '-72.627038')),
  ('01066',
   ('01066',
    'North Hatfield',
    'MA',
    'HAMPSHIRE',
    '',
    '42.406697',
    '-72.633901')),
  ('01079',
   ('01079', 'Thorndike', 'MA', 'HAMPDEN', '', '42.192892', '-72.329574')),
  ('01095',
   ('01095', 'Wilbraham', 'MA', 'HAMPDEN', '', '42.125974', '-72.489988')),
  ('01097',
   ('01097', 'Woronoco', 'MA', 'HAMPDEN', '', '42.161743', '-72.845912')),
  ('01129',
   ('01129', 'Springfield', 'MA', 'HAMPDEN', '', '42.124485', '-72.489479')),
  ('01139',
   ('01139', 'Springfield', 'MA', 'HAMPDEN', '', '42.170731', '-72.604842')),
  ('01144',
   ('01144', 'Springfield', 'MA', 'HAMPDEN', '', '42.101796', '-72.59151')),


----

__From Training__

In [81]:
fileRdd = sc.textFile("hdfs://192.168.93.128:9000/input/all_us_zipcodes.csv")

# The first line of the file is the CSV header.
h = fileRdd.first()

# Keyed RDD where the key is the state code (data[2], e.g. NY, CO).
# The header line held in `h` is dropped before parsing.
fileRdd = (fileRdd.filter(lambda line: line != h)
                 .map(lambda line: line.strip().split(","))
                 .map(lambda data: (data[2], tuple(data)))
          )

def byStateCode(state):
    """Custom partitioning function: map "NY" to 1, hash any other state code."""
    return 1 if state == "NY" else hash(state)

# Spread the records over 52 partitions.
# By default partitionBy uses portable_hash; here that is overridden by passing
# a custom partitioning function.

fileRdd = fileRdd.partitionBy(52, byStateCode)


print("NumPartitions", fileRdd.getNumPartitions())
# show_data (defined earlier in this notebook) keeps only the leading rows of
# each partition.
fileRdd = fileRdd.mapPartitions(show_data)

k = fileRdd.collect()

[[1.0, 2.0, 3.0, 1.0, 2.0], [4.0, 5.0, 3.5]]

----

Python's `hash()`, PySpark's `pyspark.rdd.portable_hash()`

In [191]:
# Built-in hash values for a string and for None.
hash('CA'), hash(None)

(3416303064931459562, 8795728541134)

In [202]:
import os

# Check whether PYTHONHASHSEED is present in this process's environment.
'PYTHONHASHSEED' in os.environ

True

In [199]:
# Pin the hash seed in the environment.
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
# presumably affects only processes launched afterwards, not hash() in the
# already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'

In [200]:
# Read the value back to confirm it was set.
os.environ['PYTHONHASHSEED']

'0'

In [201]:
# The name `rdd` was rebound to an RDD object by an earlier cell, so it no longer
# refers to the pyspark.rdd module; import the helper explicitly instead.
from pyspark.rdd import portable_hash

print(portable_hash('CA'))

3416303064931459562
