In [3]:
import findspark
findspark.init()  # makes the local Spark installation importable as pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# getOrCreate() reuses a context that is already running, so re-executing this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local"))
spark = SparkSession.builder.getOrCreate()

In [4]:
import numpy as np

# NumPy array creation

In [5]:
# np.array() builds an array from an explicit Python list
a = np.array([2, 1, 2, 1])
# np.zeros(k): k elements, all equal to 0
b = np.zeros(3)
# np.ones(k): k elements, all equal to 1
c = np.ones(3)
# np.arange(k): the integers 0 .. k-1
d = np.arange(4)

# Show the four arrays, one per line, in the order they were created
print(a, b, c, d, sep="\n")

[2 1 2 1]
[0. 0. 0.]
[1. 1. 1.]
[0 1 2 3]


# Basic array operations

In [6]:
# Element-wise addition of two vectors of equal length
a+d

array([2, 2, 4, 4])

In [7]:
# Element-wise subtraction of two vectors of equal length
a-d

array([ 2,  0,  0, -2])

In [8]:
# Element-wise product of the vectors (this is NOT the dot product; the dot
# product is the single number a.dot(d))
a*d

array([0, 1, 4, 3])

In [9]:
# Sum of all elements (reduces the whole array to a single number)
a.sum()

6

In [10]:
# The smallest element of the array
a.min()

1

In [11]:
# Multiply each element by a fixed number
a*5

array([10,  5, 10,  5])

In [12]:
# Add a fixed number to each element (the array itself is not modified)
a+3

array([5, 4, 5, 4])

In [13]:
# Divide each element by a fixed number; "/" is true division, so the result
# is a float array even though a holds integers
a/2

array([1. , 0.5, 1. , 0.5])

# Working with mathematical formulas

In [14]:
# Predicted values and the labels they are compared against
predictions = np.array([2.2, 3.3, 3.9, 5.1, 5.3])
labels = np.array([2, 3, 4, 5, 6])
# Number of samples
n = predictions.shape[0]

To calculate the following formula (the mean squared error between the labels and the predictions):
$$
 error = \frac{1}{n}\sum(labels-predictions)^2
$$
we can use the following NumPy code:

In [15]:
# Square the difference between each label and its prediction ...
squared_diff = np.square(labels - predictions)
# ... then average the squares over the n samples
error = (1/n)*np.sum(squared_diff)
print(error)

0.12800000000000003


# Spark and NumPy

In [27]:
# parallelize() iterates over the first axis of the 2-D array, so each row
# becomes one element of the RDD
a = np.array([[1,2,3],[4,5,6]])
rdd = sc.parallelize(a)
print(rdd.collect())

[array([1, 2, 3]), array([4, 5, 6])]


In [28]:
# Multiply each vector in the RDD by a fixed number
rdd.map(lambda x: x* 2).collect()

[array([2, 4, 6]), array([ 8, 10, 12])]

## Key and numpy array in RDD
Examples of RDDs whose elements are (key, NumPy array) pairs

In [32]:
# A 6x5 matrix holding the integers 0..29, plus one integer key (0..5) per row
a = np.arange(30).reshape(6,5)
keys = np.arange(6)
print(a)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]]


In [33]:
# Pair each key with the matching row of a to build an RDD of (key, array) tuples
rdd = sc.parallelize(zip(keys,a))
rdd.collect()

[(0, array([0, 1, 2, 3, 4])),
 (1, array([5, 6, 7, 8, 9])),
 (2, array([10, 11, 12, 13, 14])),
 (3, array([15, 16, 17, 18, 19])),
 (4, array([20, 21, 22, 23, 24])),
 (5, array([25, 26, 27, 28, 29]))]

In [34]:
# Operations on NumPy arrays stored in an RDD: double each array, keep its key
rdd.map(lambda x: (x[0], x[1]*2)).collect()

[(0, array([0, 2, 4, 6, 8])),
 (1, array([10, 12, 14, 16, 18])),
 (2, array([20, 22, 24, 26, 28])),
 (3, array([30, 32, 34, 36, 38])),
 (4, array([40, 42, 44, 46, 48])),
 (5, array([50, 52, 54, 56, 58]))]

In [35]:
# Element-wise product of each array with params, keeping the key (this is NOT
# a dot product; that would be x[1].dot(params), a single number per key)
params = np.array([0.5,2,0.5,2,0.5])
rdd.map(lambda x: (x[0], x[1]*params)).collect()

[(0, array([0., 2., 1., 6., 2.])),
 (1, array([ 2.5, 12. ,  3.5, 16. ,  4.5])),
 (2, array([ 5., 22.,  6., 26.,  7.])),
 (3, array([ 7.5, 32. ,  8.5, 36. ,  9.5])),
 (4, array([10., 42., 11., 46., 12.])),
 (5, array([12.5, 52. , 13.5, 56. , 14.5]))]

# More complex calculations

Calculating the following formula in PySpark:
$$
 \theta = \sum_{i=1}^n(x_i*\beta)
$$
where $X$ is an array that contains the vectors $x_i$, $\beta$ is a vector with some parameters, $*$ denotes element-wise multiplication, and the result is the vector $\theta$

In [36]:
# The rows of a 6x5 matrix are the vectors x_i; each row is one RDD element
x = np.arange(30).reshape(6, 5)
rdd = sc.parallelize(x)
beta = np.array([0.5, 1, 0.3, 0.2, 2])

# Scale every row element-wise by beta, then add the scaled rows together
scaled_rows = rdd.map(lambda row: row * beta)
theta = scaled_rows.reduce(lambda acc, row: acc + row)
print(theta)

[ 37.5  81.   26.1  18.6 198. ]
