## 1. Spark introduction

In [1]:
import pyspark
pyspark.__version__

'3.0.0'

#### SparkContext

In [2]:
'''SparkContext is to represent the connection to the Spark cluster
and can be used to create a RDD and broadcast variables on that cluster.
Note: Having only one SparkContext at a time
'''
from pyspark import SparkContext
sc = SparkContext()

#### Write a textfile in Jupeter notebook for testing

In [3]:
%%writefile test.txt
The first line
The second line
The third line
The forth line

Overwriting test.txt


#### Create a RDD (Resilient Distributed Dataset) using textFile method

In [4]:
text_rdd = sc.textFile('test.txt')  # text_rdd is a RDD

#### RDD actions

In [5]:
text_rdd.count()  # in this case, count number of line

4

In [6]:
text_rdd.first()  # in this case, get the first line

'The first line'

#### RDD transformations

In [7]:
finding_third = text_rdd.filter(lambda line: 'third' in line)
finding_third

PythonRDD[4] at RDD at PythonRDD.scala:53

``` Note: RDD is lazily evaluated - it does NOT actually execute all instructions 
    of transformations until it performs an action.
    --> Can think of transformation as a recipe```

In [8]:
# Performce an action
finding_third.collect()

['The third line']

In [9]:
# Performce an action
finding_third.count()  

1

## 2. RDD Transformations and Actions

#### map

In [10]:
words = text_rdd.map(lambda line: line.split())
words

PythonRDD[6] at RDD at PythonRDD.scala:53

In [11]:
words.collect()

[['The', 'first', 'line'],
 ['The', 'second', 'line'],
 ['The', 'third', 'line'],
 ['The', 'forth', 'line']]

In [12]:
text_rdd.collect()

['The first line', 'The second line', 'The third line', 'The forth line']

### flatMap

In [13]:
words_flattern = text_rdd.flatMap(lambda line: line.split())
words_flattern

PythonRDD[7] at RDD at PythonRDD.scala:53

In [14]:
words_flattern.collect()

['The',
 'first',
 'line',
 'The',
 'second',
 'line',
 'The',
 'third',
 'line',
 'The',
 'forth',
 'line']

#### Make another text file for testing

In [15]:
%%writefile services.txt
#EventID     Timestamp     Customer    State     ServiceID    Amount
1           08/14/2020      100         CA        123         500.0
2           08/15/2020      202         CA        129         100.0
3           08/16/2020      500         NY        123         2000.0
4           08/17/2020      202         TX        155         600.0
5           08/15/2020      500         NY        166         300.0 

Overwriting services.txt


In [16]:
services = sc.textFile('services.txt')

#### take

In [17]:
services.take(2)

['#EventID     Timestamp     Customer    State     ServiceID    Amount',
 '1           08/14/2020      100         CA        123         500.0']

In [18]:
services.map(lambda line: line.split()).take(3)

[['#EventID', 'Timestamp', 'Customer', 'State', 'ServiceID', 'Amount'],
 ['1', '08/14/2020', '100', 'CA', '123', '500.0'],
 ['2', '08/15/2020', '202', 'CA', '129', '100.0']]

#### Remove # at the beggining of any line

In [19]:
services.map(lambda line: line[1:] if line[0] == '#' else line).collect()

['EventID     Timestamp     Customer    State     ServiceID    Amount',
 '1           08/14/2020      100         CA        123         500.0',
 '2           08/15/2020      202         CA        129         100.0',
 '3           08/16/2020      500         NY        123         2000.0',
 '4           08/17/2020      202         TX        155         600.0',
 '5           08/15/2020      500         NY        166         300.0 ']

In [20]:
clean = services.map(lambda line: line[1:] if line[0] == '#' else line)
clean = clean.map(lambda line: line.split())

In [21]:
clean.collect()

[['EventID', 'Timestamp', 'Customer', 'State', 'ServiceID', 'Amount'],
 ['1', '08/14/2020', '100', 'CA', '123', '500.0'],
 ['2', '08/15/2020', '202', 'CA', '129', '100.0'],
 ['3', '08/16/2020', '500', 'NY', '123', '2000.0'],
 ['4', '08/17/2020', '202', 'TX', '155', '600.0'],
 ['5', '08/15/2020', '500', 'NY', '166', '300.0']]

#### Get data of State and Amount

In [22]:
pairs = clean.map(lambda lst: (lst[3], lst[-1]))  # State column at index 3, Amount column is the last index

In [23]:
pairs.collect()

[('State', 'Amount'),
 ('CA', '500.0'),
 ('CA', '100.0'),
 ('NY', '2000.0'),
 ('TX', '600.0'),
 ('NY', '300.0')]

#### reduceByKey  (similar to groupBy in pandas)

``` reduceByKey(): it is assumed that we have a list of tuples and each tuple has 2 values.
    Furthermore, the first item in a list of tuples is key ```

In [24]:
state_amount = pairs.reduceByKey(lambda amt1, amt2: float(amt1) + float(amt2))  # Note the type in list of tuples

In [25]:
state_amount.collect()

[('State', 'Amount'), ('CA', 600.0), ('NY', 2300.0), ('TX', '600.0')]

#### Remove the ('State', 'Amount') out of above list

In [26]:
state_amount_filter = state_amount.filter(lambda x: not x[0] == 'State')

In [27]:
state_amount_filter.collect()

[('CA', 600.0), ('NY', 2300.0), ('TX', '600.0')]

In [28]:
state_amount_filter

PythonRDD[20] at collect at <ipython-input-27-f310f08d1838>:1

####  For readability: Making tuple unpacking instead of accessing to index

In [29]:
x = ['ID', 'State', 'Amount']

def func1(lst):
    return lst[-1]  # access to last index

In [30]:
def func2(id_state_amount):
    (ID, State, Amount) = id_state_amount  # upacking the values
    return Amount

In [31]:
func1(x)

'Amount'

In [32]:
func2(x)

'Amount'