In [1]:
import findspark
findspark.init()
findspark.find()

import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col

In [4]:
# Build (or reuse) the session for this notebook and keep a handle on its
# SparkContext, which the RDD examples below are created from.
spark = (
    SparkSession.builder
    .appName("ScalaToPySpark")
    .getOrCreate()
)
sc = spark.sparkContext

In [5]:
# Load the sample log as an RDD of text lines; no minPartitions argument is passed here.
# NOTE(review): absolute, user-specific Windows path — consider a configurable data directory.
file = sc.textFile("file:///C://Users/aksha/Pyspark/Input.log") 

In [6]:
# Default minimum number of partitions Spark uses for file-based RDDs when none is requested.
sc.defaultMinPartitions

2

In [7]:
# Partition count actually used for the file read without an explicit minPartitions.
file.getNumPartitions()

2

In [8]:
# Distribute the integers 1..40; numSlices is omitted, so Spark chooses the partition count.
data = sc.parallelize(range(1,41))

In [9]:
# Default parallelism, which parallelize falls back to when numSlices is not given.
sc.defaultParallelism

16

In [10]:
# Partition count of the parallelized range.
data.getNumPartitions()

16

In [11]:
# Redistribute the elements into 8 partitions; repartition uses a shuffle to do so.
repartData = data.repartition(8)

In [12]:
# Confirm the partition count after repartitioning.
repartData.getNumPartitions()

8

In [13]:
# repartData.saveAsTextFile("file:///C:\\Users\\aksha\\Pyspark\\output")  # Escaped backslashes

In [14]:
def inspect_partitions(rdd):
    """Return the contents of every partition of ``rdd``.

    Args:
        rdd: An object exposing ``mapPartitionsWithIndex`` whose result
            supports ``collect()`` (e.g. a PySpark RDD).

    Returns:
        The collected ``(partition_index, elements)`` tuples, one per
        partition, where ``elements`` is a list of that partition's items.
    """
    def _label_partition(partition_index, records):
        # Emit exactly one tuple per partition; list() drains the partition iterator.
        return [(partition_index, list(records))]

    labelled = rdd.mapPartitionsWithIndex(_label_partition)
    return labelled.collect()
# Show which elements landed in each of the 8 partitions; some partitions may come back empty.
inspect_partitions(repartData)


[(0, [6, 7, 13, 14, 15]),
 (1, []),
 (2, [1, 2, 3, 4, 5, 26, 27]),
 (3, []),
 (4, [18, 19, 20, 36, 37]),
 (5, [11, 12, 21, 22, 31, 32, 33, 34, 35, 38, 39, 40]),
 (6, [8, 9, 10, 23, 24, 25, 28, 29, 30]),
 (7, [16, 17])]

In [15]:
# Shrink to 2 partitions; coalesce merges existing partitions and does not shuffle by default.
coadata = data.coalesce(2)

In [16]:
# Confirm the partition count after coalescing.
coadata.getNumPartitions()

2

In [17]:
# coadata.saveAsTextFile("file:///C:\\Users\\aksha\\Pyspark\\output")  # Escaped backslashes

In [18]:
# inspect_partitions is already defined in an earlier cell; reuse it here
# instead of redefining an identical copy.
# Show how coalesce(2) grouped the elements into the two remaining partitions.
inspect_partitions(coadata)


[(0, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]),
 (1,
  [21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40])]

In [19]:
# Same range as before, but with the partition count requested explicitly (numSlices=10).
newData = sc.parallelize(range(1,41),10)

In [20]:
# Confirm that the explicit numSlices value was honoured.
newData.getNumPartitions()

10

In [21]:
# Re-read the same log, this time passing 5 as textFile's minPartitions argument.
# NOTE(review): same hard-coded absolute path as the earlier read — consider one shared setting.
newFile = sc.textFile("file:///C://Users/aksha/Pyspark/Input.log",5)  # Base RDD Level Partitions

In [22]:
# Partition count of the file RDD read with minPartitions=5.
newFile.getNumPartitions()

5

In [23]:
# NOTE(review): this rebinds `data` (previously 1..40) to 1..20; re-running the earlier
# repartition/coalesce cells after this point would operate on the smaller range.
data = sc.parallelize(range(1,21))

In [24]:
# glom() gathers the elements of each partition into a single list, exposing the layout.
glomdata = data.glom()

In [25]:
# Bring the per-partition lists back to the driver for display (one inner list per partition).
glomdata.collect()

[[1],
 [2],
 [3],
 [4, 5],
 [6],
 [7],
 [8],
 [9, 10],
 [11],
 [12],
 [13],
 [14, 15],
 [16],
 [17],
 [18],
 [19, 20]]

In [26]:
# Same per-partition view, applied to the text-file RDD.
glomfile = newFile.glom()

In [27]:
# Collect every line of the file, grouped by partition.
# NOTE(review): this pulls and prints the whole file; prefer a small sample for larger inputs.
glomfile.collect()

[['Learning Spark is fun and very powerful',
  'PySpark lets you process big data in memory',
  'DataFrames are optimized for large datasets',
  'Spark transformations are lazy by default',
  'Actions like collect trigger the execution',
  'Map and reduce are the classic RDD tools',
  'Broadcast variables reduce data movement',
  'Shuffles are expensive in distributed systems',
  'You can cache RDDs to reuse in later steps',
  'Spark SQL integrates seamlessly with Hive',
  'Filter and select are DataFrame operations',
  'Window functions enable advanced analytics',
  'Partitioning helps with data distribution',
  'FlatMap can produce multiple outputs per line',
  'GroupBy followed by agg is common pattern',
  'Joins can be expensive if not handled wisely',
  'Cluster managers like YARN help Spark scale',
  'Checkpointing helps in fault-tolerant systems',
  'RDDs are the low-level API in Spark engine',
  'SparkSession is the entry point in PySpark',
  'You can read CSV JSON Parquet Avro

In [28]:
# Five state codes over 4 partitions, so at least one partition must hold two elements.
states = sc.parallelize(["KA","MH","TN","TS","DL"],4)

In [29]:
# Confirm the requested partition count.
states.getNumPartitions()

4

In [30]:
# Show how the five elements were split across the four partitions.
states.glom().collect()

[['KA'], ['MH'], ['TN'], ['TS', 'DL']]

In [31]:
# Four city names over 4 partitions.
cities = sc.parallelize(["Ben","Mum","Chn","Hyd"],4)

In [32]:
# Confirm the requested partition count.
cities.getNumPartitions()

4

In [33]:
# Show the per-partition layout of the city names.
cities.glom().collect()

[['Ben'], ['Mum'], ['Chn'], ['Hyd']]

In [34]:
# zip is a lazy transformation, so this line succeeds even though `states` has one more
# element than `cities`; the mismatch only surfaces once an action runs on `zipdata`.
zipdata = states.zip(cities)

In [35]:
# for idx, partition in zipdata.collect():
#     print(f"Partition {idx}: {partition}")
# Left commented out: `states` holds 5 elements and `cities` holds 4, so their partitions do
# not contain the same number of elements and collecting `zipdata` would raise an error.

In [36]:
# Four state codes over 4 partitions, sized to line up with `cities1` for zip.
states1 = sc.parallelize(["KA","MH","TN","TS"],4)

In [37]:
# Four city names over 4 partitions, matching `states1` in both element and partition count.
cities1 = sc.parallelize(["Ben","Mum","Chn","Hyd"],4)

In [38]:
# Partition count of the left-hand RDD for the zip below.
states1.getNumPartitions()

4

In [39]:
# Partition count of the right-hand RDD for the zip below; it must equal the left-hand count.
cities1.getNumPartitions()

4

In [40]:
# Pair the two RDDs element by element; zip requires equal partition counts and equal
# element counts per partition, which these two inputs are built to satisfy.
zipdata1 = states1.zip(cities1)

In [41]:
# Each collected element is a (state, city) tuple produced by zip, not a partition index;
# the two RDDs line up element for element, so every state gets its city.
for state, city in zipdata1.collect():
    print(f"partition {state}: {city}")

partition KA: Ben
partition MH: Mum
partition TN: Chn
partition TS: Hyd


In [42]:
# Four state codes over 4 partitions, set up for the mismatched-partition zip example.
states2 = sc.parallelize(["KA","MH","TN","TS"],4)

In [43]:
# Same four city names but over 5 partitions, deliberately differing from `states2`.
cities2 = sc.parallelize(["Ben","Mum","Chn","Hyd"],5)

In [44]:
# zipdata2 = states2.zip(cities2) # Can only zip with RDD which has the same number of partitions

In [46]:
# Attach a 0-based position to every (state, city) pair.
zipIndexData = zipdata1.zipWithIndex()

In [47]:
# Print every ((state, city), index) record on its own line.
for indexed_pair in zipIndexData.collect():
    print(indexed_pair)

(('KA', 'Ben'), 0)
(('MH', 'Mum'), 1)
(('TN', 'Chn'), 2)
(('TS', 'Hyd'), 3)
