In [1]:
import findspark
findspark.init()
findspark.find()

import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark import SparkContext

In [4]:
spark = SparkSession.builder.appName("ScalaToPySpark").getOrCreate()
sc = spark.sparkContext

In [5]:
file = sc.textFile("file:///C://Users/aksha/Pyspark/Input.log") 

In [6]:
sc.defaultMinPartitions

2

In [7]:
file.getNumPartitions()

2

In [8]:
data = sc.parallelize(range(1,41))

In [9]:
sc.defaultParallelism

16

In [10]:
data.getNumPartitions()

16

In [11]:
repartData = data.repartition(8)

In [12]:
repartData.getNumPartitions()

8

In [13]:
# repartData.saveAsTextFile("file:///C:\\Users\\aksha\\Pyspark\\output")  # Escaped backslashes

In [14]:
# glom() packs each partition into one list and collect() returns those lists
# in partition order, so enumerate() recovers the partition index.
for part_no, rows in enumerate(repartData.glom().collect()):
    print(f"Partition {part_no}: {rows}")


Partition 0: [6, 7, 13, 14, 15]
Partition 1: []
Partition 2: [1, 2, 3, 4, 5, 26, 27]
Partition 3: []
Partition 4: [18, 19, 20, 36, 37]
Partition 5: [11, 12, 21, 22, 31, 32, 33, 34, 35, 38, 39, 40]
Partition 6: [8, 9, 10, 23, 24, 25, 28, 29, 30]
Partition 7: [16, 17]


In [15]:
coadata = data.coalesce(4)

In [16]:
coadata.getNumPartitions()

4

In [17]:
# coadata.saveAsTextFile("file:///C:\\Users\\aksha\\Pyspark\\output")  # Escaped backslashes

In [18]:
# glom() packs each partition into one list and collect() returns those lists
# in partition order, so enumerate() recovers the partition index.
for part_no, rows in enumerate(coadata.glom().collect()):
    print(f"Partition {part_no}: {rows}")


Partition 0: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Partition 1: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Partition 2: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
Partition 3: [31, 32, 33, 34, 35, 36, 37, 38, 39, 40]


In [19]:
newData = sc.parallelize(range(1,41),10)

In [20]:
newData.getNumPartitions()

10

In [21]:
newFile = sc.textFile("file:///C://Users/aksha/Pyspark/Input.log",5)  # Base RDD Level Partitions

In [22]:
newFile.getNumPartitions()

5

In [23]:
data = sc.parallelize(range(1,21))

In [24]:
glomdata = data.glom()

In [25]:
glomdata.collect()

[[1],
 [2],
 [3],
 [4, 5],
 [6],
 [7],
 [8],
 [9, 10],
 [11],
 [12],
 [13],
 [14, 15],
 [16],
 [17],
 [18],
 [19, 20]]

In [26]:
glomfile = newFile.glom()

In [27]:
glomfile.collect()

[['Learning Spark is fun and very powerful',
  'PySpark lets you process big data in memory',
  'DataFrames are optimized for large datasets',
  'Spark transformations are lazy by default',
  'Actions like collect trigger the execution',
  'Map and reduce are the classic RDD tools',
  'Broadcast variables reduce data movement',
  'Shuffles are expensive in distributed systems',
  'You can cache RDDs to reuse in later steps',
  'Spark SQL integrates seamlessly with Hive',
  'Filter and select are DataFrame operations',
  'Window functions enable advanced analytics',
  'Partitioning helps with data distribution',
  'FlatMap can produce multiple outputs per line',
  'GroupBy followed by agg is common pattern',
  'Joins can be expensive if not handled wisely',
  'Cluster managers like YARN help Spark scale',
  'Checkpointing helps in fault-tolerant systems',
  'RDDs are the low-level API in Spark engine',
  'SparkSession is the entry point in PySpark',
  'You can read CSV JSON Parquet Avro

In [28]:
states = sc.parallelize(["KA","MH","TN","TS","DL"],4)

In [29]:
states.getNumPartitions()

4

In [30]:
states.glom().collect()

[['KA'], ['MH'], ['TN'], ['TS', 'DL']]

In [31]:
cities = sc.parallelize(["Ben","Mum","Chn","Hyd"],4)

In [32]:
cities.getNumPartitions()

4

In [33]:
cities.glom().collect()

[['Ben'], ['Mum'], ['Chn'], ['Hyd']]

In [34]:
zipdata = states.zip(cities)

In [35]:
#for idx, partition in zipdata.collect():
 #   print(f"Partition {idx}: {partition}")  # states has 5 elements but cities has only 4, so the per-partition element counts do not match and this snippet throws an error

In [36]:
states1 = sc.parallelize(["KA","MH","TN","TS"],4)

In [37]:
cities1 = sc.parallelize(["Ben","Mum","Chn","Hyd"],4)

In [38]:
states1.getNumPartitions()

4

In [39]:
cities1.getNumPartitions()

4

In [40]:
zipdata1 = states1.zip(cities1)

In [41]:
# Each collected item is a (state, city) pair produced by zip(); both source
# RDDs hold 4 elements across 4 partitions, so every element finds a partner.
# Note: the "partition" label in the printed text is followed by the state
# code, not by a partition index.
for state, city in zipdata1.collect():
    print(f"partition {state}: {city}")

partition KA: Ben
partition MH: Mum
partition TN: Chn
partition TS: Hyd


In [42]:
states2 = sc.parallelize(["KA","MH","TN","TS"],4)

In [43]:
cities2 = sc.parallelize(["Ben","Mum","Chn","Hyd"],5)

In [44]:
# zipdata2 = states2.zip(cities2) # Can only zip with RDD which has the same number of partitions

In [45]:
zipIndexData = zipdata1.zipWithIndex()

In [46]:
for i in zipIndexData.collect():
    print(i)

(('KA', 'Ben'), 0)
(('MH', 'Mum'), 1)
(('TN', 'Chn'), 2)
(('TS', 'Hyd'), 3)


In [47]:
data = sc.parallelize([90,20,10,20,10,90,90,90,90,90,90,10,30,45,55,87,98,34,54,10])

In [48]:
data.count()

20

In [49]:
distdata = data.distinct()

In [50]:
distdata.count()

10

In [51]:
distdata.collect()

[98, 34, 20, 54, 55, 87, 90, 10, 45, 30]

In [52]:
datanew = sc.parallelize(["Spark","Scala","Python","Spark","spark","Scala","Spark","Python","Spark","SCala"])

In [53]:
distdata = datanew.distinct()

In [54]:
distdata.count()

5

In [55]:
distdata.collect()

['spark', 'SCala', 'Spark', 'Scala', 'Python']

In [56]:
grouped = datanew.map(lambda a: (a, 1)).groupByKey()


In [57]:
for key, values in grouped.collect():
    print(f"{key}: {list(values)}")

spark: [1]
SCala: [1]
Spark: [1, 1, 1, 1]
Scala: [1, 1]
Python: [1, 1]


In [58]:
# groupByKey() yields (word, iterable-of-ones); the length of that iterable
# is the number of occurrences of the word.
for word, ones in grouped.collect():
    print(f"{word}: {len(ones)}")

spark: 1
SCala: 1
Spark: 4
Scala: 2
Python: 2


In [59]:
#Using functions within transformation
def gkfun1(p):
    """Return ``p`` cubed, i.e. ``p * p * p``."""
    squared = p * p
    return squared * p

In [60]:
data = sc.parallelize([3, 6, 1, 3, 9, 10, 14, 11, 12, 10])

In [61]:
mapdata = data.map(gkfun1)

In [62]:
mapdata.collect()

[27, 216, 1, 27, 729, 1000, 2744, 1331, 1728, 1000]

In [63]:
def gkfun2(x):
    """Return the greeting 'Hello', a single space, then ``x`` upper-cased."""
    shouted = x.upper()
    return " ".join(("Hello", shouted))

In [64]:
data = sc.parallelize(["Spark","sCala","python","hadOOp","RubyOnRails"])

In [65]:
mapdata = data.map(gkfun2)

In [66]:
mapdata.collect()

['Hello SPARK',
 'Hello SCALA',
 'Hello PYTHON',
 'Hello HADOOP',
 'Hello RUBYONRAILS']

In [67]:
inputdata = sc.parallelize([1,2,3,4,5,6,7,8],2)

In [68]:
inputdata.glom().collect()

[[1, 2, 3, 4], [5, 6, 7, 8]]

In [69]:
def sum(x):
    """Return ``x`` incremented by one.

    Serves as the per-element function handed to ``RDD.map``.

    NOTE(review): this name shadows Python's builtin ``sum`` in the notebook
    namespace; it is kept because a later cell passes the function by name.
    """
    return 1 + x

In [70]:
newInputdata = inputdata.map(sum) # map() applies the function to every element individually

In [71]:
newInputdata.glom().collect()

[[2, 3, 4, 5], [6, 7, 8, 9]]

In [72]:
def sum_part(iter):
    """Collapse one partition into a single-element list.

    The accumulator is seeded with 1, so the result is 1 plus the sum of all
    values yielded by ``iter``. Written for ``RDD.mapPartitions``, where the
    function receives a whole partition's iterator rather than one element.

    NOTE(review): the parameter name shadows the builtin ``iter`` inside this
    function; it is left unchanged to keep the signature stable.
    """
    running = 1  # seed is 1, not 0: an empty partition yields [1]
    for value in iter:
        running += value
    return [running]

In [73]:
file = sc.parallelize([1,3,5,7,9,11,13,15,17,19],2)

In [74]:
newdata = file.mapPartitions(sum_part)

In [75]:
newdata.collect()

[26, 76]

In [76]:
# Step 1: Create the key-value RDD
data = sc.parallelize([("Spark", 40),("Scala", 60),("Hadoop", 20),("C++", 40),("C", 50),("Java", 30),("Python", 50),("RubyOnRails", 40),("VC++", 70),
    ("Cobol", 10)])

In [77]:
# Step 2: Extract keys only
keydata = data.keys()

In [78]:
for key in keydata.collect():
    print(key)

Spark
Scala
Hadoop
C++
C
Java
Python
RubyOnRails
VC++
Cobol


In [79]:
valuedata = data.values()

In [80]:
for value in valuedata.collect():
    print(value)

40
60
20
40
50
30
50
40
70
10


In [81]:
data = sc.parallelize([34, 12, 10, 41, 23, 56, 77, 72, 10, 98, 87, 57, 15, 32, 67,89, 99, 20, 30, 40, 50, 60, 80, 90, 97, 81, 38, 39, 26])


In [82]:
data.count()

29

In [83]:
data.take(4) # First 4 elements

[34, 12, 10, 41]

In [84]:
data.top(4) # Top 4 elements

[99, 98, 97, 90]

In [85]:
data.takeOrdered(4) # Smallest 4 elements, returned in ascending order (duplicates included)

[10, 10, 12, 15]

In [86]:
data.distinct().takeOrdered(4) # Smallest 4 distinct elements, returned in ascending order

[10, 12, 15, 20]

In [87]:
data.max()

99

In [88]:
data.min()

10

In [89]:
data.mean()

52.75862068965517

In [90]:
data.stdev()

np.float64(28.906096687095438)

In [91]:
data.sum()

1530

In [92]:
data.reduce(lambda a, b: a + b)

1530

In [96]:
#data3 = sc.parallelize(["Spark", "Scala", "Java", "C++", "Python", "Cobot"])

In [97]:
#data4 = sc.parallelize(["JavaScript", "Hadoop", "Scala", "Java", "Informatica", "DataStage"])

In [98]:
#data3.join(data4).collect()

[('J', ('a', 'a')), ('J', ('a', 'a')), ('S', ('p', 'c')), ('S', ('c', 'c'))]