In [1]:
import sys
import os

In [2]:
# Show the JAVA_HOME visible to this kernel (None if the variable is unset).
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
# findspark.init() must run before the pyspark imports in the next cell.
# NOTE(review): presumably needed because pyspark is not on sys.path here - confirm.
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

In [5]:
# Start (or reuse) a local session named "SparkSQL" with master local[4].
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("SparkSQL")
    .getOrCreate()
)
print(spark)
print(spark.sparkContext)

<pyspark.sql.session.SparkSession object at 0x000001421837F730>
<SparkContext master=local[4] appName=SparkSQL>


#### The same Spark session is reused: getOrCreate() returns the existing SparkSession instead of creating a new one, so the `local[*]` master requested below is ignored

In [6]:
# Request a session with a different master; the printed objects show
# which session and context are actually returned.
spark1 = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkSQL")
    .getOrCreate()
)
print(spark1)
print(spark1.sparkContext)

<pyspark.sql.session.SparkSession object at 0x000001421837F730>
<SparkContext master=local[4] appName=SparkSQL>


In [7]:
# Display the session object as this cell's output.
spark

In [8]:
# Default parallelism of the active context (4 here, matching local[4]).
spark.sparkContext.defaultParallelism

4

In [9]:
# (source, destination, shipments) tuples. Shipments are raw strings and
# not all of them are numeric.
data_list = [
    ("India", "USA", "5"),
    ("India", "China", "7"),
    ("UK", "India", "three"),
    ("China", "Africa", "6"),
    ("Japan", "UK", "Five"),
]

In [10]:
# Build the DataFrame and name its three columns in one call.
df = spark.createDataFrame(
    data_list,
    schema=["Source", "Destination", "Shipments"],
)

In [11]:
# Print the DataFrame as a text table.
df.show()

+------+-----------+---------+
|Source|Destination|Shipments|
+------+-----------+---------+
| India|        USA|        5|
| India|      China|        7|
|    UK|      India|    three|
| China|     Africa|        6|
| Japan|         UK|     Five|
+------+-----------+---------+



In [12]:
# Accumulator starting at 0; clean_records() adds 1 for each value it cannot parse.
bad_record_acc = spark.sparkContext.accumulator(0)

In [13]:
def clean_records(shipments: str) -> int:
    """Parse a raw shipment count, tallying values that cannot be parsed.

    Args:
        shipments: Raw shipment value as a string; may be ``None`` (for
            example when the source column holds a SQL NULL).

    Returns:
        The value as an ``int``, or ``None`` when it is missing or is not a
        valid integer literal.

    Side effects:
        Adds 1 to the module-level ``bad_record_acc`` accumulator for every
        value that cannot be parsed. When this function runs as a UDF inside
        a transformation, Spark may apply the update more than once if a
        task is re-executed, so treat the tally as approximate.
    """
    try:
        return int(shipments)
    except (TypeError, ValueError):
        # TypeError covers None, which int() rejects. Catching only
        # ValueError would let a single NULL row fail the whole task.
        bad_record_acc.add(1)
        return None

In [14]:
# Register clean_records under the SQL name "clean_records_udf" with an
# IntegerType return type so it can be called from expr()/SQL strings.
spark.udf.register("clean_records_udf", clean_records, IntegerType())

<function __main__.clean_records(shipments: str) -> int>

In [15]:
# Refer to the column by its declared name ("Shipments") so the expression
# still resolves when spark.sql.caseSensitive is enabled.
df.withColumn("ShipmentsCount", expr("clean_records_udf(Shipments)")).show()

+------+-----------+---------+--------------+
|Source|Destination|Shipments|ShipmentsCount|
+------+-----------+---------+--------------+
| India|        USA|        5|             5|
| India|      China|        7|             7|
|    UK|      India|    three|          null|
| China|     Africa|        6|             6|
| Japan|         UK|     Five|          null|
+------+-----------+---------+--------------+



In [16]:
# The accumulator is never reset, so re-running the cell that evaluates
# clean_records_udf adds to this count again.
print(f"Bad Record Count: {bad_record_acc.value}")

Bad Record Count: 2


# Record Count Accumulator

In [17]:
# Accumulator starting at 0, used below to count rows.
records_acc = spark.sparkContext.accumulator(0)

In [18]:
# RDD view of the DataFrame; its elements are Row objects.
rdd = df.rdd

In [19]:
# Bring every row back as a Python list; fine for five rows, avoid on large data.
rdd.collect()

[Row(Source='India', Destination='USA', Shipments='5'),
 Row(Source='India', Destination='China', Shipments='7'),
 Row(Source='UK', Destination='India', Shipments='three'),
 Row(Source='China', Destination='Africa', Shipments='6'),
 Row(Source='Japan', Destination='UK', Shipments='Five')]

In [20]:
# Count rows: foreach() is an action, and each row contributes 1.
rdd.foreach(lambda _row: records_acc.add(1))

In [21]:
# The accumulator is never reset, so re-running the foreach() cell adds to this total again.
print(f"Total Record Count: {records_acc.value}")

Total Record Count: 5


# Total Shipments Count

In [22]:
# Accumulator starting at 0; get_count() adds each parseable shipment value to it.
shipments_acc = spark.sparkContext.accumulator(0)

In [23]:
# Peek at one row's Shipments value. first() fetches a single row instead
# of collecting the whole RDD just to index element 0.
rdd.first().Shipments

'5'

In [24]:
def get_count(row):
    """Add one row's shipment count to the ``shipments_acc`` accumulator.

    Args:
        row: Object exposing a ``Shipments`` attribute holding the raw value.

    Returns:
        None. Rows whose ``Shipments`` value is empty, ``None`` or not a
        valid integer are skipped without touching the accumulator.
    """
    shipment = row.Shipments
    if not shipment:
        return
    try:
        count = int(shipment)
    except (TypeError, ValueError):
        # Only parse failures are expected here. A bare ``except`` would
        # also hide accumulator errors and swallow KeyboardInterrupt.
        return
    shipments_acc.add(count)

In [25]:
# Run get_count on every row; foreach() is an action.
rdd.foreach(get_count)

In [26]:
# Sum of the parseable Shipments values; rows get_count() could not parse are not included.
print(f"Total Shipments Count: {shipments_acc.value}")

Total Shipments Count: 18


## When to use accumulators?
**Inside action functions.**
* We have used accumulators inside foreach(). (The earlier `bad_record_acc` example is the exception: it is updated inside a UDF applied with withColumn(), which is a transformation, so it is subject to the caveats in the next section.)
* foreach() is an action, and action functions are the right place to use accumulators.
* foreach() exists to update the state of external variables and systems, i.e. to cause side effects. Because it is an action and not a transformation, it is the correct place to manipulate accumulators.

## When NOT to use accumulators?
**Accumulators should not be used inside transformation functions such as map(); doing so can have unintended consequences.**

### Spark can rerun a task in a few instances –
1. When a task encounters an exception, Spark retries it, allowing up to 4 attempts in total by default (`spark.task.maxFailures`).
2. If an executor crashes, Spark re-executes the tasks that were running on it.
3. If a task is running slowly and speculative execution is enabled (`spark.speculation`), Spark can launch another copy of the task. It only takes the result from the copy that completes first.

When a task is re-executed, all the transformation functions in the task run again. An accumulator that was already updated by the first execution is therefore updated a second time, which duplicates values in the accumulator's result.

For this reason, always put accumulator updates inside action functions such as foreach(). Spark will not complain at compile time or at runtime when you update an accumulator inside a transformation function such as map(), so keep this point in mind whenever you work with accumulators.