# PySpark Basic Tutorial

In [2]:
# Run findspark.init() before any pyspark import so the pyspark package can be located.
import findspark
findspark.init()

In [3]:
from pyspark import SparkConf
from pyspark import SparkContext

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [5]:
# Create the SparkContext used by the cells below: "local" master, app name "First App".
# getOrCreate() hands back the already-running context when this cell is executed again;
# calling the SparkContext constructor a second time raises
# "Cannot run multiple SparkContexts at once".
# NOTE: if a context is already active, it is returned as-is and this configuration is not applied.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("First App")
)

### SparkContext Example firstapp

In [7]:
# Read the Spark README as an RDD of lines. The RDD is scanned twice below,
# so it is marked for caching first.
logFile = "C:///Users/xingy/Spark/spark-3.0.0-preview2-bin-hadoop3.2/README.md"
logData = sc.textFile(logFile)
logData.cache()

# Count the lines that contain the letter "a" and the letter "b" respectively.
numAs = logData.filter(lambda line: "a" in line).count()
numBs = logData.filter(lambda line: "b" in line).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

Lines with a: 65, lines with b: 33


## RDD Operations

### count

In [10]:
# Distribute a small in-memory list of strings as an RDD, then count its elements.
words = sc.parallelize([
    "scala",
    "java",
    "hadoop",
    "spark",
    "akka",
    "spark vs hadoop",
    "pyspark",
    "pyspark and spark",
])
counts = words.count()
print("Number of elements in RDD -> %i" % counts)

Number of elements in RDD -> 8


### collect

In [11]:
# Build the sample RDD and pull every element back to the driver as a Python list.
words = sc.parallelize([
    "scala",
    "java",
    "hadoop",
    "spark",
    "akka",
    "spark vs hadoop",
    "pyspark",
    "pyspark and spark",
])
coll = words.collect()
print("Elements in RDD -> %s" % (coll,))

Elements in RDD -> ['scala', 'java', 'hadoop', 'spark', 'akka', 'spark vs hadoop', 'pyspark', 'pyspark and spark']


### filter

In [21]:
words = sc.parallelize([
    "scala",
    "java",
    "hadoop",
    "spark",
    "akka",
    "spark vs hadoop",
    "pyspark",
    "pyspark and spark",
])

# Two complementary predicates: elements that contain the substring 'spark',
# and elements that do not.
words_filter1 = words.filter(lambda word: "spark" in word)
words_filter2 = words.filter(lambda word: "spark" not in word)

# filter() only describes the new RDDs; collect() brings the results to the driver.
filtered1 = words_filter1.collect()
filtered2 = words_filter2.collect()

print("Fitered RDD spark in -> %s" % (filtered1,))
print("Fitered RDD spark not in -> %s" % (filtered2,))

Fitered RDD spark in -> ['spark', 'spark vs hadoop', 'pyspark', 'pyspark and spark']
Fitered RDD spark not in -> ['scala', 'java', 'hadoop', 'akka']


### map

In [24]:
words = sc.parallelize([
    "scala",
    "java",
    "hadoop",
    "spark",
    "akka",
    "spark vs hadoop",
    "pyspark",
    "pyspark and spark",
])

# Turn every element into a (word, 1) key/value pair.
words_map = words.map(lambda word: (word, 1))
mapping = words_map.collect()
print("Key value pair -> %s" % (mapping,))

Key value pair -> [('scala', 1), ('java', 1), ('hadoop', 1), ('spark', 1), ('akka', 1), ('spark vs hadoop', 1), ('pyspark', 1), ('pyspark and spark', 1)]


### reduce

In [25]:
from operator import add

# Fold the RDD down to a single value by repeatedly applying the binary `add` operator.
nums = sc.parallelize([1, 2, 3, 4, 5])
adding = nums.reduce(add)

print("Adding all the elements -> %i" % adding)

Adding all the elements -> 15


### join

In [29]:
# Two key/value RDDs. `y` carries an extra key ('Beam') that has no partner in `x`.
x = sc.parallelize([("spark", 1), ("hadoop", 4)])
y = sc.parallelize([("spark", 2), ("hadoop", 5), ('Beam', 18)])

# Join on the key: each result element is (key, (value_from_x, value_from_y)).
joined = x.join(y)
final = joined.collect()

print("Join RDD -> %s" % (final,))

Join RDD -> [('hadoop', (4, 5)), ('spark', (1, 2))]


### cache

In [30]:
words = sc.parallelize([
    "scala",
    "java",
    "hadoop",
    "spark",
    "akka",
    "spark vs hadoop",
    "pyspark",
    "pyspark and spark",
])

# cache() marks the RDD for persistence at the default storage level and sets is_cached.
# A second persist() call adds nothing, so the flag is read directly from the RDD.
words.cache()
caching = words.is_cached
print("Words got chached > %s" % (caching,))

Words got chached > True


## Broadcast & Accumulator

For parallel processing, Apache Spark uses shared variables.<br>

Apache Spark supports two types of shared variables: *Broadcast* and *Accumulator*.<br>


### Broadcast
Broadcast variables keep a copy of the data on every node. The variable is cached on each machine once, instead of being shipped to the machines along with every task.

In [33]:
# Wrap a plain Python list in a broadcast variable.
words_new = sc.broadcast(["scala", "java", "hadoop", "spark", "akka"])
print(type(words_new))

# .value returns the wrapped list.
data = words_new.value
print("Stored data -> %s" % (data,))

# Index into the broadcast list. Despite the message text, this object is a
# Broadcast wrapper around a list, not an RDD.
elem = words_new.value[2]
print("Printing a particular element in RDD -> %s" % elem)

<class 'pyspark.broadcast.Broadcast'>
Stored data -> ['scala', 'java', 'hadoop', 'spark', 'akka']
Printing a particular element in RDD -> hadoop


### Accumulator
Accumulator variables are used for aggregating the information through associative and commutative operations. For example, you can use an accumulator for a sum operation or counters (in MapReduce).

In [40]:
# Accumulator with an initial value of 100; every RDD element is added to it.
num = sc.accumulator(100)
rdd = sc.parallelize([20, 30, 40, 50])


def f(x):
    """Add one RDD element to the module-level accumulator `num`."""
    global num
    num += x


# foreach() runs f once per element, purely for its side effect on `num`.
rdd.foreach(f)

print(type(num))
final = num.value  # 100 + 20 + 30 + 40 + 50
print("Accumulated value is -> %i" % final)

<class 'pyspark.accumulators.Accumulator'>
Accumulated value is -> 240


## SparkFiles

In [39]:
from pyspark import SparkFiles

# Name used to look the file up again, and the location it is added from.
finddistancename = "README.md"
finddistance = "C:///Users/xingy/Spark/spark-3.0.0-preview2-bin-hadoop3.2/README.md"

# Register the file with the SparkContext, then resolve where it is stored locally.
sc.addFile(finddistance)
abs_path = SparkFiles.get(finddistancename)

print("Absolute Path -> %s" % (abs_path,))

Absolute Path -> C:\Users\xingy\AppData\Local\Temp\spark-de06aa81-444f-4690-9f72-5a029e68f74f\userFiles-16b7041c-6828-44ba-827d-8eb06892d7b8\README.md


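## StorageLevel
A storage level is built as `StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication)`; the predefined levels are:

DISK_ONLY = StorageLevel(True, False, False, False, 1)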

DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)

MEMORY_AND_DISK = StorageLevel(True, True, False, False, 1)

MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)

MEMORY_AND_DISK_SER = StorageLevel(True, True, False, False, 1)

MEMORY_AND_DISK_SER_2 = StorageLevel(True, True, False, False, 2)

MEMORY_ONLY = StorageLevel(False, True, False, False, 1)

MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2)

MEMORY_ONLY_SER = StorageLevel(False, True, False, False, 1)

MEMORY_ONLY_SER_2 = StorageLevel(False, True, False, False, 2)

OFF_HEAP = StorageLevel(True, True, True, False, 1)

In [48]:
import pyspark

# Persist two RDDs with explicitly chosen storage levels.
rdd1 = sc.parallelize([1, 2])
rdd1.persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)

rdd2 = sc.parallelize(['a', 'b', 'c'])
rdd2.persist(pyspark.StorageLevel.DISK_ONLY)

# Show the level each RDD now reports. The former bare `rdd1.getStorageLevel()`
# call was dropped: its result was discarded because it was not the last
# expression of the cell.
print(rdd1.getStorageLevel())
print(rdd2.getStorageLevel())

Disk Memory Serialized 2x Replicated
Disk Serialized 1x Replicated


## MLlib

### Recommendation
The following example uses collaborative filtering with the ALS (Alternating Least Squares) algorithm to build a recommendation model and evaluates it on the training data.

In [49]:
from __future__ import print_function
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Each input line holds three comma-separated fields: two integer ids and a float rating.
data = sc.textFile("test.data.txt")
ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares.
# A fixed seed is passed so that repeated runs use the same random seed
# and the reported error can be compared between runs.
rank = 10
numIterations = 10
alsSeed = 42
model = ALS.train(ratings, rank, numIterations, seed=alsSeed)

# Evaluate the model on the training data: predict a rating for every pair of ids
# seen in training, then join predictions with the true ratings on that pair.
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

# r[1] is the (actual, predicted) pair; average the squared differences.
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 1.0770581106910645e-05


## Serializer

In [51]:
from pyspark.context import SparkContext
from pyspark.serializers import MarshalSerializer

# The running context is stopped before one with a different serializer is created.
sc.stop()
sc = SparkContext("local", "serialization app", serializer=MarshalSerializer())
try:
    # Double the numbers 0..999 and show the first ten results.
    print(sc.parallelize(list(range(1000))).map(lambda x: 2 * x).take(10))
finally:
    # Stop the new context even if the job above raises, so it is not left running.
    sc.stop()

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
