# Computer Systems 2016/17

### Practice - Some basic spark syntax

##### `sc` is the SparkContext; it is only accessible on the master (driver) node

In [1]:
import shutil

import pyspark
# Reuse the already-running context when this cell is executed again:
# constructing a second SparkContext in the same process raises a ValueError,
# whereas getOrCreate() returns the active one (or creates it on first use).
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

sc

<pyspark.context.SparkContext at 0x7fa6a060cb00>

In [2]:
type(sc)

pyspark.context.SparkContext

In [3]:
sc.version

'2.1.0'

#### We may check the default level of parallelism and the default minimum number of partitions

In [4]:
sc.defaultParallelism

8

In [5]:
sc.defaultMinPartitions

2

##### We can see it exposes a lot of attributes and methods

In [6]:
dir(sc)

['PACKAGE_EXTENSIONS',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_accumulatorServer',
 '_active_spark_context',
 '_batchSize',
 '_callsite',
 '_checkpointFile',
 '_conf',
 '_dictToJavaMap',
 '_do_init',
 '_ensure_initialized',
 '_gateway',
 '_getJavaStorageLevel',
 '_initialize_context',
 '_javaAccumulator',
 '_jsc',
 '_jvm',
 '_lock',
 '_next_accum_id',
 '_pickled_broadcast_vars',
 '_python_includes',
 '_temp_dir',
 '_unbatched_serializer',
 'accumulator',
 'addFile',
 'addPyFile',
 'appName',
 'applicationId',
 'binaryFiles',
 'binaryRecords',
 'broadcast',
 'cancelAllJobs',
 'cancelJobGroup',
 'defaultMinPartitions',
 'defaultParalleli

#### When we want to create an RDD from some data (vector, list, dictionary...) we use sc.parallelize

In [7]:
# Build a small local list and hand it to Spark to obtain an RDD.
vec = list(range(1, 6))
vec_rdd = sc.parallelize(vec)

# Show what kind of object we got back and how many elements it holds.
print("Data type {}, number of elements {}".format(
    type(vec_rdd),
    vec_rdd.count()))

Data type <class 'pyspark.rdd.RDD'>, number of elements 5


#### When we want to transform an RDD back into a local variable, we may use collect()

In [8]:
# collect() returns the RDD's elements as a local Python list.
vec = vec_rdd.collect()
print("Data type {}, number of elements {}".format(
    type(vec),
    len(vec)))

Data type <class 'list'>, number of elements 5


#### When we want to bring only some elements into a local variable, we may use take(), or other functions for more complex behaviors

In [9]:
# take(2) returns only the first two elements, again as a local list.
vec = vec_rdd.take(2)
print("Data type {}, number of elements {}".format(
    type(vec),
    len(vec)))

Data type <class 'list'>, number of elements 2


#### On RDDs we can perform map/reduce operations

In [10]:
# map() applies the function to every element and produces a new RDD.
vec_squared_rdd = vec_rdd.map(lambda value: value ** 2)

# Bring the squared values back as a local list and show them.
vec_squared = vec_squared_rdd.collect()
print(vec_squared)

[1, 4, 9, 16, 25]


In [12]:
# reduce() combines the elements pairwise until a single value is left;
# with addition that value is the sum of the squares.
sq_sum = vec_squared_rdd.reduce(lambda left, right: left + right)
print(sq_sum)

55


#### And chain them

In [15]:
# Same computation as before on a larger input, written as a single
# chained expression: square every element, then add the squares up.
vec = list(range(10000))
vec_rdd = sc.parallelize(vec)
sq_sum = (
    vec_rdd
    .map(lambda value: value ** 2)
    .reduce(lambda left, right: left + right)
)
print(sq_sum)

333283335000


#### Lambdas are just function objects you can pass around for someone else to use

In [16]:
# A lambda is an ordinary function object; once bound to a name it can be
# called or passed to other code like any other function.
myFunction = lambda x: pow(x, 2)

type(myFunction)

function

In [17]:
myFunction(4)

16

In [18]:
def myOtherFunction(x):
    """Return ``x`` raised to the power of two."""
    squared = pow(x, 2)
    return squared

type(myOtherFunction)

function

#### We can read a data file directly into an RDD using textFile()

In [19]:
# textFile() hands back an RDD rather than a Python file object; the type
# is displayed below to make that visible.
f = sc.textFile('2001 A SPACE ODYSSEY.mht')
type(f)

pyspark.rdd.RDD

In [21]:
# For comparison, Python's built-in open() returns a regular text file object.
# The file is opened only to inspect the handle's type, so it is closed again
# right away instead of being left open for the lifetime of the kernel.
with open('2001 A SPACE ODYSSEY.mht') as f2:
    pass

# f2 stays bound after the with-block (now closed); its type is unchanged.
type(f2)

_io.TextIOWrapper

In [22]:
# Word count: split every line on single spaces, pair each token with 1,
# then add up the 1s per distinct token.
wc = (
    f.flatMap(lambda line: line.split(' '))
     .map(lambda token: (token, 1))
     .reduceByKey(lambda left, right: left + right)
)

# Ten most frequent tokens: ordering by the negated count puts the
# largest counts first.
wc.takeOrdered(10, key=lambda pair: -pair[1])

[('', 7874),
 ('\t\t\t\t\t', 1969),
 ('the', 585),
 ('of', 290),
 ('to', 279),
 ('and', 254),
 ('THE', 204),
 ('a', 187),
 ('I', 159),
 ('BOWMAN', 140)]

#### And save an RDD using saveAsTextFile

In [23]:
# saveAsTextFile() refuses to write to a path that already exists, so a second
# run of this cell would fail. Remove the output left behind by a previous run
# first (a no-op when nothing is there) to keep the notebook re-runnable.
output_path = '2001 A SPACE ODYSSEY_MODIFIED.mht'
shutil.rmtree(output_path, ignore_errors=True)
wc.saveAsTextFile(output_path)

### Data partitions

#### We can take a look at the data partitions: how many there are and how many elements each one contains

In [24]:
# Load the text file with the default partitioning and report how many
# partitions the resulting RDD has.
lines = sc.textFile('2001 A SPACE ODYSSEY.mht')
print("RDD has {} partitions".format(
    lines.getNumPartitions()))


RDD has 2 partitions


In [26]:
def countPartitions(id, iterator):
    """Yield a single ``(id, count)`` pair for one partition.

    ``id`` is passed through unchanged as the first item of the pair, and
    ``count`` is the number of items produced by ``iterator``, which is
    consumed in full.
    """
    yield (id, sum(1 for _ in iterator))
        
# Count the elements of every partition and collect the (index, count)
# pairs into a dictionary.
print("The cardinality of the partitions is {}".format(
    lines.mapPartitionsWithIndex(countPartitions).collectAsMap()))

The cardinality of the partitions is {0: 2493, 1: 3203}


#### We can also control the number of partitions with the minPartitions parameter

In [27]:
# Reload the same file, this time asking for at least 8 partitions.
lines = sc.textFile(
    '2001 A SPACE ODYSSEY.mht',
    minPartitions=8)
print("RDD has {} partitions".format(
    lines.getNumPartitions()))


RDD has 8 partitions


In [29]:
# Per-partition element counts for the re-partitioned RDD, collected as a
# dictionary keyed by partition index.
print("The cardinality of the partitions is {}".format(
    lines.mapPartitionsWithIndex(countPartitions).collectAsMap()))

The cardinality of the partitions is {0: 413, 1: 663, 2: 746, 3: 670, 4: 884, 5: 895, 6: 884, 7: 541}
