In [1]:
# Display the SparkContext (assumed to be pre-created by the PySpark kernel/shell; no cell in this notebook creates it).
sc

In [2]:
# Display the active SparkSession (assumed to be pre-created by the PySpark kernel/shell); the cells below read and write through it.
spark

In [3]:
# Load the HR employee CSV from the local filesystem. The first row supplies
# the column names and the column types are inferred from the data.
hr_employee = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv("file:///home/hadoop/Downloads/HR_Employee.csv")
)

In [4]:
# Print the column names, inferred types and nullability of the loaded DataFrame.
hr_employee.printSchema()

root
 |-- EmployeeID: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- JobInvolvement: string (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobSatisfaction: string (nullable = true)
 |-- Hourlyrate: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Salaryhike: integer (nullable = true)
 |-- OverTime: string (nullable = true)
 |-- Workex: integer (nullable = true)
 |-- YearsSinceLastPromotion: integer (nullable = true)
 |-- EmpSatisfaction: string (nullable = true)
 |-- TrainingTimesLastYear: integer (nullable = true)
 |-- WorkLifeBalance: string (nullable = true)
 |-- Performance_Rating: string (nul

#### 1. Big Data File Types

   * Parquet File Format - Records are stored in a columnar format. Converting a structured .csv dataset to Parquet compresses it, and the columnar layout makes Parquet well suited to analytical queries.
   * There are other file formats as well, such as Avro and ORC.

In [5]:
# Number of partitions backing the freshly loaded DataFrame.
hr_employee.rdd.getNumPartitions()

1

In [6]:
# Save the DataFrame as Parquet on the local filesystem.
# mode("overwrite") replaces any previous output so this cell can be re-run;
# the default save mode raises an error when the target path already exists.
hr_employee.write.mode("overwrite").parquet('file:///home/hadoop/Downloads/HR_Parquet')

In [7]:
# Save the DataFrame as ORC. The scheme-less path is resolved against the
# cluster's default filesystem (presumably HDFS here — confirm).
# mode("overwrite") replaces any previous output so this cell can be re-run;
# the default save mode raises an error when the target path already exists.
hr_employee.write.mode("overwrite").orc("/HR_Orc")

In [9]:
# Read the ORC output back and preview the first five rows.
spark.read.format("orc").load("/HR_Orc").show(5)

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|Attrition|Gender|Age|MaritalStatus|    Education|EducationField|   BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|         1|               Sales|     Sales

#### Optimization Techniques
   * Optimizing Spark jobs can significantly improve the performance of Spark queries and jobs.

2. Partitioning 
    
  * Partitioning divides data into smaller chunks, which can be processed in parallel.

In [10]:
# Partition count before repartitioning, for comparison with the repartitioned DataFrame created next.
hr_employee.rdd.getNumPartitions()

1

In [11]:
# Redistribute the rows across 3 partitions so they can be processed in parallel.
# NOTE(review): "partitionined_df" looks like a typo for "partitioned_df"; left unchanged because the next cell references this name.
partitionined_df = hr_employee.repartition(3)

In [12]:
# Write the repartitioned DataFrame as Parquet under /HR_Partition.
# mode("overwrite") replaces any previous output so this cell can be re-run;
# the default save mode raises an error when the target path already exists.
partitionined_df.write.mode("overwrite").parquet("/HR_Partition")

#### 3. Caching & Persistence
  * Managing different levels of storage.

In [13]:
# Mark the DataFrame for caching with the default storage level.
# cache() is lazy: the data is only stored once an action runs on hr_employee.
hr_employee.cache()

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [15]:
# Persist the DataFrame with an explicitly chosen storage level
# (for example MEMORY_ONLY, MEMORY_AND_DISK or DISK_ONLY).
# NOTE(review): hr_employee was already marked for caching by cache() in the previous cell; confirm whether the level requested here takes effect without an unpersist() first.
from pyspark import StorageLevel
hr_employee1 = hr_employee.persist(StorageLevel.MEMORY_AND_DISK)

In [16]:
# Switch the DataFrame to a memory-only storage level.
# The DataFrame is still marked as persisted by the cells above, and Spark
# keeps the existing storage level when persist() is called again, so release
# it first. PySpark always stores data in serialized form, so MEMORY_ONLY is
# the replacement for the deprecated MEMORY_ONLY_SER alias, which is no
# longer available in PySpark 3.
hr_employee.unpersist()
hr_employee2 = hr_employee.persist(StorageLevel.MEMORY_ONLY)

#### 4. Serialization
  
  * Efficient serialization reduces the time needed to read/write data and to transfer it over the network. Kryo serialization is a popular choice because it performs better than the default Java serialization.

a) Java Serialization:
* It is the default serialization method. It is easy to use, but its drawback is that it slows down the read and write process.

In [17]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [18]:
# getOrCreate() returns the session that is already running when one exists.
# NOTE(review): a session is already active at this point, so the new appName is presumably not applied until the session is stopped and rebuilt — confirm.
spark = SparkSession.builder.appName("Java Serialization").getOrCreate()

In [19]:
# Stop the running session so the next builder call can start a fresh one with a different serializer.
# NOTE(review): DataFrames created from the stopped session (e.g. hr_employee) are presumably unusable afterwards — confirm before reusing them.
spark.stop()

In [20]:
# Start a session that explicitly selects the Java serializer.
spark = (
    SparkSession.builder
    .appName("PySpark Serialization")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .getOrCreate()
)

In [21]:
# Display the rebuilt session to check its application name and settings.
spark

b) Kryo Serialization: Faster and more compact than Java serialization.

In [23]:
# Serializer settings are read when the SparkContext starts, so stop the
# running session first; otherwise getOrCreate() returns the existing session
# and keeps its current serializer.
spark.stop()

# The serializer class is KryoSerializer and its settings use the
# "spark.kryo." prefix (the earlier "Kyro"/"kyro"/"registationRequired"
# spellings are not recognized by Spark).
# registrationRequired is "false" so that classes which are not registered
# still serialize. Set it to "true" only together with
# spark.kryo.classesToRegister listing real JVM classes that are on the
# classpath; the earlier placeholder class name was dropped because it does
# not refer to a class shipped with Spark.
spark = (
    SparkSession.builder
    .appName("Kyro Serialization")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrationRequired", "false")
    .getOrCreate()
)

#### 5. Broadcast Joins
   * Broadcasting small datasets improves join performance.

In [40]:
# Load the small airports lookup table (local file) and the larger flights
# dataset. Both files have a header row and let Spark infer the column types.
small_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv("file:///home/hadoop/Downloads/airports.csv")
)
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv("/flights/raw_flight_data.csv")
)

In [41]:
# Attach a broadcast hint to the small airports DataFrame so a later join can ship it to every executor.
from pyspark.sql.functions import broadcast
broadcast_df = broadcast(small_df)

In [42]:
# Mark both join inputs for caching (lazy; they are materialized by the first action that uses them).
broadcast_df = broadcast_df.cache()
df = df.cache()

In [44]:
# Broadcast join: match every flight to its origin airport in the
# broadcast-hinted airports table.
airport_df = df.join(
    broadcast_df,
    on=df["OriginAirportID"] == broadcast_df["airport_id"],
    how="inner",
)

In [45]:
# Preview the first five joined rows (flight columns followed by the airport columns).
airport_df.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|          city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|       Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City|   UT|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|      Portland|   OR|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|     St. Louis|   MO|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -1

#### 6. Level of Parallelism 

In [46]:
# Adjust the level of parallelism based on your cluster size.
# The earlier key "spark.default.parallellism" was misspelled, so the value
# was stored but never read by Spark. The correctly spelled
# spark.default.parallelism is read when the SparkContext starts, so it has to
# be supplied at session build time (builder .config(...) or
# spark-submit --conf). The setting that can be changed on a running session
# is spark.sql.shuffle.partitions, which controls how many partitions
# DataFrame shuffles (joins, aggregations) produce.
spark.conf.set("spark.sql.shuffle.partitions", 100)

#### 7. Avoid GroupByKey

  * Use reduceByKey() or aggregateByKey() instead of groupByKey() to reduce the amount of data that is shuffled.

In [49]:
# Total quantity per item using groupByKey: all values of a key are gathered
# first and only then summed.
rdd = spark.sparkContext.parallelize(
    [
        ('dosa', 2), ('idly', 3), ('vada', 5), ('rice', 1),
        ('coffee', 5), ('idly', 3), ('vada', 3),
    ]
)
rdd.groupByKey().mapValues(sum).collect()

[('dosa', 2), ('idly', 6), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [50]:
# Same totals with reduceByKey, which merges the values of each key with the given function instead of collecting them into groups first.
rdd.reduceByKey(lambda x, y : x + y).collect()

[('dosa', 2), ('idly', 6), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [60]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import sum`: that import replaces Python's
# built-in sum for the rest of the session, which breaks re-running the
# earlier `mapValues(sum)` cell.
from pyspark.sql import functions as F

# Small orders DataFrame with one row per (order, value) pair.
df = spark.createDataFrame([('dosa',2),('idly',3),('vada', 5),('rice',1),('coffee',5),
                            ('idly',3),('vada',3),('sweets',3)], schema=['order','value'])

# Total value per order, computed with the DataFrame aggregation API.
df.groupBy("order").agg(F.sum("value").alias("total_value")).show()

+------+-----------+
| order|total_value|
+------+-----------+
|sweets|          3|
|  vada|          8|
|  dosa|          2|
|  idly|          6|
|  rice|          1|
|coffee|          5|
+------+-----------+



In [61]:
# RDD version of the same aggregation: each two-column Row is treated as an (order, value) pair and the values are added per order.
df.rdd.reduceByKey(lambda x, y : x + y).collect()

[('sweets', 3),
 ('dosa', 2),
 ('idly', 6),
 ('vada', 8),
 ('rice', 1),
 ('coffee', 5)]

#### 8. Reduce Shuffle
   
   * Reduce the number of shuffles by optimizing transformations.
   * Use reduceByKey() over groupByKey().
   * For RDDs, prefer map() followed by reduceByKey() over groupBy().

#### 9. Repartition() and Coalesce()

#### 10. Accumulators
  * Use accumulators to collect aggregate information such as counts and sums across all executors, which run tasks in parallel on multiple worker nodes.
  * Accumulators in Spark are variables that can only be added to, through cumulative operations.

In [71]:
# Declare and initialize an accumulator with a starting value of 0.
acc = spark.sparkContext.accumulator(0)

In [63]:
# Inspect the Python type of the accumulator object.
type(acc)

pyspark.accumulators.Accumulator

In [64]:
# RDD holding the integers 1 through 9; used below to feed the accumulator.
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [65]:
# Plain Python function applied to each element by rdd.foreach below (it is not registered as a Spark SQL UDF).
def add(x):
    """Add the element ``x`` to the accumulator ``acc``; returns nothing."""
    acc.add(x)

In [66]:
# Run add() on every element of the RDD; foreach is an action, so the accumulator is updated immediately.
rdd.foreach(add)

In [68]:
# Read the accumulated total on the driver (the sum of 1..9).
print(acc.value)

45


In [72]:
def counter(x):
    """Count one element in the accumulator ``acc`` and return ``x`` unchanged."""
    # ``acc`` is only read here (never rebound), so no ``global`` statement is needed.
    acc.add(1)
    return x

In [73]:
# map() is lazy; count() is the action that actually runs counter() over the elements.
rdd.map(counter).count()

9

#### 11. Bucketing
  * Use bucketing to split large datasets into buckets for efficient queries and joins.