In [1]:
# Bootstrap step: findspark.init() is called here, before the pyspark imports in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes `pyspark`
# importable - confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

# Create (or reuse) the session for this notebook: "local" master, application
# name "OutOfMemory". The underlying SparkContext is kept in `sc`.
spark = (
    SparkSession.builder
    .master("local")
    .appName("OutOfMemory")
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Runtime configuration for the out-of-memory experiments.
#
# Adaptive Query Execution note: when spark.sql.adaptive.coalescePartitions.enabled
# is on, the number of shuffle partitions becomes dynamic. Both adaptive settings are
# switched off here so the partition count stays at the fixed value set below.
sc.setLogLevel("Error")

spark.conf.set("spark.sql.shuffle.partitions", 3)
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false")

# Arrow-based data transfer between the JVM and Python, with fallback to the
# non-Arrow path. Spark 3.0 renamed these keys to spark.sql.execution.arrow.pyspark.*;
# the former spark.sql.execution.arrow.enabled / .arrow.fallback.enabled names are
# deprecated.
# NOTE(review): assumes Spark >= 3.0 (the AQE coalescePartitions key above is also a
# 3.x setting) - confirm the cluster version.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")

"""
Out of Memory Issue
1. It can happen in the driver as well as in the executors, for different reasons.
2. The three major causes are:
    collect operation on a large dataset (memory overload in the driver)
        collect is used to display a dataset or to store the data in a variable; if the driver has too little
        memory, the data cannot fit into driver memory
    Broadcast join - the data is split across memory; with a broadcast join on top of it, the data is brought
        into driver memory for broadcasting, and then OOM occurs
    Native Python or R - post-processing operations are handled in the driver, so the data is collected into
        driver memory. If it does not fit, OOM occurs

pyspark --driver-memory 1g #limit driver memory before testing code

"""

# Read the transactions CSV WITH schema inference.
# - header=True: the first row is read and used for the column names. On its own
#   this would leave every column typed as string.
# - inferSchema=True: column types are inferred from the data instead of staying
#   string. Spark's CSV documentation notes that this needs one extra pass over the data.
# - A Spark job is created at read time to read the first row.
#
# The dataset directory can be overridden with the OOM_DEMO_DATA_DIR environment
# variable; when it is unset, the original local path is used unchanged.
filepath = os.environ.get(
    "OOM_DEMO_DATA_DIR",
    "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/",
)
if not filepath.endswith("/"):
    # The file name is appended by plain string concatenation below.
    filepath += "/"

df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .csv(filepath + "IntPersonal_transactions.csv")
)

In [4]:
# Small single-column DataFrame (`id` from 0 up to, but not including, 1000),
# followed by a preview of its first rows.
df1 = spark.range(0, 1000)
df1.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows



In [5]:
# collect() returns the rows of df1 as a list of Row objects (see the output below).
# Per the notes in cell 3, this is the operation that overloads driver memory on a
# large dataset.
# NOTE(review): as the cell's last expression, the whole list is echoed into the
# notebook output; consider displaying only a slice of it.
df1.collect()

[Row(id=0),
 Row(id=1),
 Row(id=2),
 Row(id=3),
 Row(id=4),
 Row(id=5),
 Row(id=6),
 Row(id=7),
 Row(id=8),
 Row(id=9),
 Row(id=10),
 Row(id=11),
 Row(id=12),
 Row(id=13),
 Row(id=14),
 Row(id=15),
 Row(id=16),
 Row(id=17),
 Row(id=18),
 Row(id=19),
 Row(id=20),
 Row(id=21),
 Row(id=22),
 Row(id=23),
 Row(id=24),
 Row(id=25),
 Row(id=26),
 Row(id=27),
 Row(id=28),
 Row(id=29),
 Row(id=30),
 Row(id=31),
 Row(id=32),
 Row(id=33),
 Row(id=34),
 Row(id=35),
 Row(id=36),
 Row(id=37),
 Row(id=38),
 Row(id=39),
 Row(id=40),
 Row(id=41),
 Row(id=42),
 Row(id=43),
 Row(id=44),
 Row(id=45),
 Row(id=46),
 Row(id=47),
 Row(id=48),
 Row(id=49),
 Row(id=50),
 Row(id=51),
 Row(id=52),
 Row(id=53),
 Row(id=54),
 Row(id=55),
 Row(id=56),
 Row(id=57),
 Row(id=58),
 Row(id=59),
 Row(id=60),
 Row(id=61),
 Row(id=62),
 Row(id=63),
 Row(id=64),
 Row(id=65),
 Row(id=66),
 Row(id=67),
 Row(id=68),
 Row(id=69),
 Row(id=70),
 Row(id=71),
 Row(id=72),
 Row(id=73),
 Row(id=74),
 Row(id=75),
 Row(id=76),
 Row(id=7

In [6]:
# Increase the range to 1 million rows and run the collect operation; it will throw an OOM error:
# GC overhead limit exceeded