## Importing Packages

In [None]:
import findspark

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

## PySpark SQL

In [4]:
findspark.init()

In [5]:
# Obtain a SparkSession through the builder; getOrCreate() hands back a
# session, creating one only if none is available yet.
spark = SparkSession.builder.getOrCreate()
# One-row query used as a smoke test that Spark SQL is working.
df = spark.sql("select 'spark' as hello")
# show() prints the DataFrame as an ASCII table (see the cell output).
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



### PySpark Context

In [None]:
# Reference only: the SparkContext constructor signature. It is kept as a
# comment because it is not runnable Python: the `<class ...>` default is a
# repr rather than an expression, and PickleSerializer is not imported in this
# notebook.
#
# pyspark.SparkContext(master=None, appName=None, sparkHome=None, pyFiles=None,
#                      environment=None, batchSize=0,
#                      serializer=PickleSerializer(), conf=None, gateway=None,
#                      jsc=None,
#                      profiler_cls=<class 'pyspark.profiler.BasicProfiler'>)

In [1]:
# Only one SparkContext may be active at a time, and the SparkSession created
# earlier in this notebook already owns one. getOrCreate() reuses an active
# context instead of raising ValueError; the master/appName in the conf only
# take effect when a new context actually has to be created.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local').setAppName('test_1')
)

In [2]:
sc

In [1]:
# Sample word list shared by the sorted/filter/map/reduce examples below.
x = [
    'Python',
    'is',
    'really',
    'conveniant',
]
# Default string ordering is case-sensitive, so capitalised 'Python' sorts
# ahead of the lowercase words.
print(sorted(x))

['Python', 'conveniant', 'is', 'really']


In [6]:
# Case-insensitive ordering: compare the lower-cased form of each word.
sorted(x, key=str.lower)

['conveniant', 'is', 'Python', 'really']

## Filter()

In [7]:
# Keep only the words shorter than five characters. filter() returns a lazy
# iterator, so list() is needed to materialise and display the result.
list(filter(lambda arg: len(arg) < 5, x))

['is']

In [13]:
def check_length(x, limit=5):
    """Return True when ``x`` is strictly shorter than ``limit``.

    Args:
        x: Any sized object (the examples pass strings).
        limit: Exclusive upper bound on ``len(x)``; defaults to 5, the
            threshold used throughout this notebook.

    Returns:
        bool: ``len(x) < limit``.
    """
    return len(x) < limit

# Explicit-loop equivalent of filter(): collect the words themselves (not the
# boolean test results) for which check_length() is true.
l1=[]
for i in x:
    if check_length(i):
        l1.append(i)
print(l1)

['is']


## Map()

In [11]:
# map() applies check_length to every word; the function can be passed
# directly, and list() materialises the lazy map object for printing.
print(list(map(check_length, x)))

[False, True, False, False]


In [14]:
# NOTE(review): this re-declares the check_length already defined in the
# filter section above; the later definition is the one in effect afterwards.
def check_length(x, limit=5):
    """Return True when ``x`` is strictly shorter than ``limit``.

    Args:
        x: Any sized object (the examples pass strings).
        limit: Exclusive upper bound on ``len(x)``; defaults to 5, the
            threshold used throughout this notebook.

    Returns:
        bool: ``len(x) < limit``.
    """
    return len(x) < limit

# Explicit-loop equivalent of map(): store the boolean result of
# check_length() for every word, in order.
l1=[]
for i in x:
    l1.append(check_length(i))
print(l1)

[False, True, False, False]


## Reduce()

In [28]:
def concat(x, y):
    """Combine ``x`` and ``y`` with ``+`` and trace the step on stdout.

    Prints one line of the form ``"<x> + <y> = <x+y>"`` so that each step of a
    reduce() can be followed, then returns ``x + y``.
    """
    trace_line = f'{x} + {y} = {x+y}'
    print(trace_line)
    return x + y

In [24]:
from functools import reduce

# reduce() folds the list left to right, feeding (running result, next item)
# to concat, which already has that two-argument shape.
print(reduce(concat, x))

Python + is = Pythonis
Pythonis + really = Pythonisreally
Pythonisreally + conveniant = Pythonisreallyconveniant
Pythonisreallyconveniant


In [22]:
Python + is
Pythonis + really
Pythonisreally + conveniant
Pythonisreallyconveniant

['Python', 'is', 'really', 'conveniant']

In [29]:
# Same fold over the integers 0..9: concat's `+` now performs addition, so the
# trace shows the running sum.
print(reduce(concat, range(10)))

0 + 1 = 1
1 + 2 = 3
3 + 3 = 6
6 + 4 = 10
10 + 5 = 15
15 + 6 = 21
21 + 7 = 28
28 + 8 = 36
36 + 9 = 45
45


## PySpark

In [1]:
import os
import sys
import pyspark

# Point PySpark at the interpreter running this kernel (these variables select
# the Python executable used for the workers and the driver).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Only one SparkContext can be active at a time. getOrCreate() starts a
# 'local[*]' context on a fresh kernel and reuses an already running one
# (e.g. from an earlier cell) instead of raising ValueError.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

# Show the working directory.
os.getcwd()

In [10]:
# NOTE(review): absolute, machine-specific path; point this at a local text
# file before re-running the cell.
source_uri = 'file:///C:/Users/Devansh/Desktop/Devansh/Big Data/test'
txt = sc.textFile(source_uri)
total_lines = txt.count()
print(total_lines)

# Despite its name, python_lines holds the lines that mention "spark"
# (case-insensitively).
python_lines = txt.filter(lambda text: 'spark' in text.lower())
matching_lines = python_lines.count()
print(matching_lines)

48
19


In [11]:
sc.stop()

## RDD (Resilient Distributed Dataset)

In [13]:
def test(x):
    """Print a single RDD element; used as the callback for foreach()."""
    print(x)


# The previous context was stopped above, so this cell starts its own and
# guarantees it is stopped again even if one of the actions fails.
sc = pyspark.SparkContext('local[*]')
try:
    words = sc.parallelize(['scala', 'mahout', 'solaris', 'vertica', 'reddis', 'hadoop', 'lunaris'])

    # Only the last expression of a cell is displayed, so print the results of
    # these actions explicitly instead of discarding them.
    print(words.count())
    print(words.collect())

    # foreach() is run for its side effects and returns None, so test_result is
    # None. NOTE(review): the printed elements may show up in the console that
    # launched Jupyter rather than under this cell; confirm on your setup.
    test_result = words.foreach(test)
finally:
    sc.stop()

In [29]:
# Fresh local context for the caching demo; the second argument is the
# application name.
sc = pyspark.SparkContext('local[*]', 'cache check')
# Distribute a small in-memory list as an RDD.
words = sc.parallelize(['scala', 'mahout', 'solaris', 'vertica', 'reddis', 'hadoop', 'lunaris'])

In [30]:
# Mark the RDD for persistence and read the is_cached flag off the returned
# RDD in the same expression (the cell output shows True).
words.persist().is_cached

True

In [31]:
# cache() on the RDD that the previous cell already persisted; the value
# displayed is the RDD's repr.
words.cache()

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

## Shared Variables

* Broadcast
* Accumulator

In [32]:
# Wrap the list in a broadcast variable created from the SparkContext.
words_2 = sc.broadcast(['scala', 'mahout', 'solaris', 'vertica', 'reddis', 'hadoop', 'lunaris'])
# .value gives back the wrapped Python object.
data = words_2.value
print(f'Stored data is {data}')

Stored data is ['scala', 'mahout', 'solaris', 'vertica', 'reddis', 'hadoop', 'lunaris']


In [33]:
words_2.value[2]

'solaris'

In [34]:
# Accumulator with a starting value of 100; tasks may only add to it.
y = sc.accumulator(100)


def f(x):
    """Add one RDD element to the shared accumulator ``y``."""
    y.add(x)


rdd = sc.parallelize([10, 20, 30, 40, 50])
# foreach() runs f once per element, so every element is added to y.
rdd.foreach(f)

# Read the accumulated total: 100 + (10 + 20 + 30 + 40 + 50).
final = y.value
print(final)

250


In [35]:
sc.stop()

## Spark MLlib

* classification
* regression
* clustering
* linalg
* recommendation
* fpm